In [1]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
sc = SparkContext.getOrCreate(SparkConf())
spark = SparkSession(sc).Builder().getOrCreate()
df = spark.read.parquet('./data/hmp.parquet')
df.createTempView('df')

In [2]:
splits =df.randomSplit([.8,.2])
df_train,df_test = splits[0],splits[1]

In [3]:
from pyspark.ml.feature import StringIndexer,OneHotEncoder,VectorAssembler,Normalizer  
from pyspark.ml.linalg import Vectors

indexer = StringIndexer(inputCol='class',
                       outputCol='label')
vectorAssembler = VectorAssembler(inputCols=['x','y','z'],
                                 outputCol='features')
normalizer = Normalizer(inputCol='features',
                       outputCol='features_norm')

In [4]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.pipeline import Pipeline
lr = LogisticRegression(maxIter=10,
                       regParam=.3,
                       elasticNetParam=.8)
pipeline = Pipeline(stages = [indexer,vectorAssembler,
                     normalizer,lr])
model = pipeline.fit(df_train)

In [5]:
prediction = model.transform(df_train)

In [6]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evalu = MulticlassClassificationEvaluator(metricName='accuracy',
                                         labelCol= 'label',
                                         predictionCol= 'prediction')  
evalu.evaluate(prediction)

0.20670028374906957

In [7]:
prediction = model.transform(df_test)
evalu.evaluate(prediction)

0.20621053930089378