

In case you want to learn how ETL is done, please run the following notebook first and update the file name below accordingly

https://github.com/IBM/coursera/blob/master/coursera_ml/a2_w1_s3_ETL.ipynb


In [2]:
# delete files from previous runs
!rm -f hmp.parquet*

# download the file containing the data in PARQUET format
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
# create a dataframe out of it
df = spark.read.parquet('hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

--2019-12-27 22:34:17--  https://github.com/IBM/coursera/raw/master/hmp.parquet
Resolving github.com (github.com)... 140.82.118.3
Connecting to github.com (github.com)|140.82.118.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet [following]
--2019-12-27 22:34:18--  https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.48.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.48.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932997 (911K) [application/octet-stream]
Saving to: 'hmp.parquet'


2019-12-27 22:34:18 (21.5 MB/s) - 'hmp.parquet' saved [932997/932997]



In [3]:
splits = df.randomSplit([0.8, 0.2])
df_train = splits[0]
df_test = splits[1]

In [4]:
# Evaluating the pipeline steps
df_test.show()

+---+---+---+--------------------+--------------+
|  x|  y|  z|              source|         class|
+---+---+---+--------------------+--------------+
|  0| 10| 28|Accelerometer-201...|     Getup_bed|
|  0| 11| 38|Accelerometer-201...| Sitdown_chair|
|  0| 23| 36|Accelerometer-201...|   Brush_teeth|
|  0| 24| 35|Accelerometer-201...| Sitdown_chair|
|  0| 26| 15|Accelerometer-201...|  Climb_stairs|
|  0| 29| 38|Accelerometer-201...|   Brush_teeth|
|  0| 30| 34|Accelerometer-201...|     Getup_bed|
|  0| 30| 38|Accelerometer-201...|  Climb_stairs|
|  0| 31| 17|Accelerometer-201...| Standup_chair|
|  0| 31| 28|Accelerometer-201...|  Climb_stairs|
|  0| 31| 32|Accelerometer-201...| Standup_chair|
|  0| 31| 35|Accelerometer-201...|   Brush_teeth|
|  0| 32| 33|Accelerometer-201...|  Climb_stairs|
|  0| 32| 41|Accelerometer-201...|   Brush_teeth|
|  0| 33| 31|Accelerometer-201...|  Climb_stairs|
|  0| 33| 38|Accelerometer-201...|Descend_stairs|
|  0| 34| 31|Accelerometer-201...|   Brush_teeth|


In [5]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer


indexer = StringIndexer(inputCol="class", outputCol="label")

vectorAssembler = VectorAssembler(inputCols=["x","y","z"],
                                  outputCol="features")

normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)


 



In [9]:
# Evaluating the pipeline steps
# 1 - Indexing the class columm
indexed = indexer.fit(df_train).transform(df_train)
indexed.show()

+---+---+---+--------------------+--------------+-----+
|  x|  y|  z|              source|         class|label|
+---+---+---+--------------------+--------------+-----+
|  0| 12| 39|Accelerometer-201...| Sitdown_chair|  8.0|
|  0| 15| 39|Accelerometer-201...|   Brush_teeth|  6.0|
|  0| 16| 31|Accelerometer-201...|     Getup_bed|  1.0|
|  0| 17| 36|Accelerometer-201...|   Brush_teeth|  6.0|
|  0| 25| 30|Accelerometer-201...|   Brush_teeth|  6.0|
|  0| 25| 40|Accelerometer-201...|   Brush_teeth|  6.0|
|  0| 26| 42|Accelerometer-201...|   Brush_teeth|  6.0|
|  0| 27| 31|Accelerometer-201...| Sitdown_chair|  8.0|
|  0| 27| 33|Accelerometer-201...|     Getup_bed|  1.0|
|  0| 27| 37|Accelerometer-201...|   Brush_teeth|  6.0|
|  0| 27| 39|Accelerometer-201...|   Brush_teeth|  6.0|
|  0| 27| 41|Accelerometer-201...|   Brush_teeth|  6.0|
|  0| 28| 28|Accelerometer-201...|   Brush_teeth|  6.0|
|  0| 28| 48|Accelerometer-201...|   Brush_teeth|  6.0|
|  0| 29| 17|Accelerometer-201...|     Getup_bed

In [16]:
# Evaluating the pipeline steps
# 2 - VectorIndexing the class columm
vectorized = vectorAssembler.transform(indexed)
vectorized.show()

+---+---+---+--------------------+--------------+-----+---------------+
|  x|  y|  z|              source|         class|label|       features|
+---+---+---+--------------------+--------------+-----+---------------+
|  0| 12| 39|Accelerometer-201...| Sitdown_chair|  8.0|[0.0,12.0,39.0]|
|  0| 15| 39|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,15.0,39.0]|
|  0| 16| 31|Accelerometer-201...|     Getup_bed|  1.0|[0.0,16.0,31.0]|
|  0| 17| 36|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,17.0,36.0]|
|  0| 25| 30|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,25.0,30.0]|
|  0| 25| 40|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,25.0,40.0]|
|  0| 26| 42|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,26.0,42.0]|
|  0| 27| 31|Accelerometer-201...| Sitdown_chair|  8.0|[0.0,27.0,31.0]|
|  0| 27| 33|Accelerometer-201...|     Getup_bed|  1.0|[0.0,27.0,33.0]|
|  0| 27| 37|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,27.0,37.0]|
|  0| 27| 39|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,27.0

In [17]:
# Evaluating the pipeline steps
# 3 - Normalizing the features columm
normalized = normalizer.transform(vectorized)
normalized.show()

+---+---+---+--------------------+--------------+-----+---------------+--------------------+
|  x|  y|  z|              source|         class|label|       features|       features_norm|
+---+---+---+--------------------+--------------+-----+---------------+--------------------+
|  0| 12| 39|Accelerometer-201...| Sitdown_chair|  8.0|[0.0,12.0,39.0]|[0.0,0.2352941176...|
|  0| 15| 39|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,15.0,39.0]|[0.0,0.2777777777...|
|  0| 16| 31|Accelerometer-201...|     Getup_bed|  1.0|[0.0,16.0,31.0]|[0.0,0.3404255319...|
|  0| 17| 36|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,17.0,36.0]|[0.0,0.3207547169...|
|  0| 25| 30|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,25.0,30.0]|[0.0,0.4545454545...|
|  0| 25| 40|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,25.0,40.0]|[0.0,0.3846153846...|
|  0| 26| 42|Accelerometer-201...|   Brush_teeth|  6.0|[0.0,26.0,42.0]|[0.0,0.3823529411...|
|  0| 27| 31|Accelerometer-201...| Sitdown_chair|  8.0|[0.0,27.0,31.0]

In [18]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [19]:

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[indexer, vectorAssembler, normalizer,lr])


In [20]:
model = pipeline.fit(df_train)

In [21]:
prediction = model.transform(df_train)

In [22]:
prediction.printSchema()

root
 |-- x: integer (nullable = true)
 |-- y: integer (nullable = true)
 |-- z: integer (nullable = true)
 |-- source: string (nullable = true)
 |-- class: string (nullable = true)
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- features_norm: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
binEval = MulticlassClassificationEvaluator().setMetricName("accuracy") .setPredictionCol("prediction").setLabelCol("label")
    
binEval.evaluate(prediction) 

0.20652146494726606

In [24]:
prediction = model.transform(df_test)

In [25]:
binEval.evaluate(prediction) 

0.20692460273239877