

In case you want to learn how ETL is done, please run the following notebook first and update the file name below accordingly

https://github.com/IBM/coursera/blob/master/coursera_ml/a2_w1_s3_ETL.ipynb


In [1]:
# delete files from previous runs
!rm -f hmp.parquet*

# download the file containing the data in PARQUET format
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
# create a dataframe out of it
df = spark.read.parquet('hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

In [5]:
df_energy = spark.sql("""
select sqrt(sum(x*x)+sum(y*y)+sum(z*z)) as label, class from df group by class
""")      
df_energy.createOrReplaceTempView('df_energy') 

In [6]:
df_join = spark.sql('select * from df inner join df_energy on df.class=df_energy.class')

In [7]:
splits = df_join.randomSplit([0.8, 0.2])
df_train = splits[0]
df_test = splits[1]

In [8]:
df_train.count()

357812

In [9]:
df_test.count()

88717

In [10]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer


vectorAssembler = VectorAssembler(inputCols=["x","y","z"],
                                  outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)



In [11]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)


In [12]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vectorAssembler, normalizer,lr])

model = pipeline.fit(df_train)

In [13]:
model.stages[2].summary.r2

0.03292647477426258

In [14]:
model = pipeline.fit(df_test)

In [15]:
model.stages[2].summary.r2

0.03126150920643611