In [1]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
# Reuse the already-running SparkContext, or start one with a default SparkConf.
sc = SparkContext.getOrCreate(SparkConf())
# Obtain the session through the documented builder entry point instead of
# constructing a SparkSession by hand only to reach its nested Builder class.
# getOrCreate() returns the existing session if there is one, so re-running
# this cell is safe; it attaches to the SparkContext obtained above.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Read the Parquet file under ./data into a Spark DataFrame.
df = spark.read.format('parquet').load('./data/hmp.parquet')

In [3]:
# Expose the DataFrame to Spark SQL under the name 'df'.
# createOrReplaceTempView (rather than createTempView) keeps the cell
# idempotent: re-running it replaces the view instead of raising because
# the view already exists, and matches how the other views are registered.
df.createOrReplaceTempView('df')

In [4]:
# One row per class: the square root of the summed squares of x, y and z
# over all rows of that class, exposed as the column `label`.
query = """

select  SQRT(SUM(x*x) + SUM(y*y) + SUM(z*z) ) as label, class
from df group by class
"""

# Run the aggregation against the 'df' view ...
df_energy = spark.sql(query)

# ... and make the result queryable from SQL as 'df_energy'.
df_energy.createOrReplaceTempView('df_energy')

In [5]:
# Attach the per-class `label` from df_energy to every row of df.
# Only df's own columns plus `label` are selected: `SELECT *` over the join
# would carry two columns both named `class` (one from each side) into
# df_join, making any later reference to `class` ambiguous.
query = '''
SELECT df.*, df_energy.label
FROM df
INNER JOIN df_energy ON df_energy.class = df.class
'''
df_join = spark.sql(query)
# Register the joined result for SQL access under 'df_join'.
df_join.createOrReplaceTempView('df_join')

In [6]:
from pyspark.ml.feature import VectorAssembler,Normalizer

# Combine the x, y and z columns into a single vector column 'features'.
vectorAssembler = VectorAssembler(
    inputCols=['x', 'y', 'z'],
    outputCol='features',
)

# Rescale each 'features' vector to unit p-norm with p=1.0 and write the
# result to 'features_norm'.
normalizer = Normalizer(
    p=1.0,
    inputCol='features',
    outputCol='features_norm',
)

In [12]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.pipeline import Pipeline
# Linear regression with elastic-net regularisation. featuresCol and labelCol
# are written out at their default values so it is visible which columns
# the estimator actually reads.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# NOTE(review): the normalizer stage writes 'features_norm', but lr reads
# 'features', so the normalised vectors are computed and never used by the
# regression. Confirm whether lr was meant to train on 'features_norm'.
pipeline = Pipeline(stages=[vectorAssembler, normalizer, lr])

# Fit all three stages on the joined data.
model = pipeline.fit(df_join)


In [13]:
# Apply the fitted pipeline to the same DataFrame it was fitted on.
predictions = model.transform(dataset=df_join)

In [14]:
# R^2 from the summary of the fitted regression stage, which is the last of
# the pipeline's three stages.
model.stages[-1].summary.r2

0.03259100556263628