

If you want to learn how the ETL (extract, transform, load) step is done, please run the following notebook first and update the file name below accordingly:

https://github.com/IBM/coursera/blob/master/coursera_ml/a2_w1_s3_ETL.ipynb


In [1]:
# Delete files left over from previous runs so that wget does not save the
# new download under a suffixed name (hmp.parquet.1, ...).
!rm -f hmp.parquet*

# Download the file containing the data in PARQUET format into the current
# working directory.
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
# Create a Spark DataFrame out of it.
# NOTE(review): `spark` is not created anywhere in this notebook; presumably
# the kernel provides the session -- verify when running elsewhere.
df = spark.read.parquet('hmp.parquet')

# Register a corresponding temporary view so the data can be queried with
# spark.sql(...) under the table name 'df'.
df.createOrReplaceTempView('df')

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20191228021521-0001
KERNEL_ID = 8186fd41-1319-4e21-9ddc-8f09de04f281
--2019-12-28 02:15:24--  https://github.com/IBM/coursera/raw/master/hmp.parquet
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet [following]
--2019-12-28 02:15:25--  https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.8.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.8.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932997 (911K) [application/octet-stream]
Saving to: 'hmp.parquet'


2019-12-28 02:15:25 (24.4 MB/s) - 'hmp.parquet' saved [932997/932997]



In [2]:
# Split the full data set into roughly 80% training and 20% test rows.
# A fixed seed keeps the split from changing between runs (given the same
# input file and partitioning); without it every run samples differently.
splits = df.randomSplit([0.8, 0.2], seed=42)
df_train, df_test = splits

In [3]:
# Peek at the training split: first 20 rows, long strings truncated
# (these are the defaults of DataFrame.show, spelled out here).
df_train.show(n=20, truncate=True)

+---+---+---+--------------------+-------------+
|  x|  y|  z|              source|        class|
+---+---+---+--------------------+-------------+
|  0| 10| 28|Accelerometer-201...|    Getup_bed|
|  0| 12| 39|Accelerometer-201...|Sitdown_chair|
|  0| 15| 39|Accelerometer-201...|  Brush_teeth|
|  0| 16| 31|Accelerometer-201...|    Getup_bed|
|  0| 24| 35|Accelerometer-201...|Sitdown_chair|
|  0| 25| 30|Accelerometer-201...|  Brush_teeth|
|  0| 25| 40|Accelerometer-201...|  Brush_teeth|
|  0| 26| 15|Accelerometer-201...| Climb_stairs|
|  0| 26| 42|Accelerometer-201...|  Brush_teeth|
|  0| 27| 31|Accelerometer-201...|Sitdown_chair|
|  0| 27| 33|Accelerometer-201...|    Getup_bed|
|  0| 27| 37|Accelerometer-201...|  Brush_teeth|
|  0| 27| 39|Accelerometer-201...|  Brush_teeth|
|  0| 27| 41|Accelerometer-201...|  Brush_teeth|
|  0| 28| 28|Accelerometer-201...|  Brush_teeth|
|  0| 29| 17|Accelerometer-201...|    Getup_bed|
|  0| 29| 25|Accelerometer-201...|    Getup_bed|
|  0| 29| 25|Acceler

In [4]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer

# Stage 1: turn the string column "class" into a numeric index column "label".
indexer = StringIndexer(inputCol="class", outputCol="label")

# Stage 2: one-hot encode the numeric label into the vector column "labelVec".
encoder = OneHotEncoder(inputCol="label", outputCol="labelVec")

# Stage 3: pack the x, y and z columns into a single vector column "features".
vectorAssembler = VectorAssembler(
    inputCols=["x", "y", "z"],
    outputCol="features",
)

# Stage 4: rescale every feature vector to unit L1 norm (p=1.0) and store the
# result in "features_norm".
normalizer = Normalizer(
    inputCol="features",
    outputCol="features_norm",
    p=1.0,
)



In [5]:
from pyspark.ml.classification import LinearSVC

# Linear support vector classifier. The column names below are the PySpark
# defaults, written out so the wiring between pipeline stages is visible.
# NOTE(review): the classifier reads "features", not "features_norm", so the
# Normalizer output is not consumed by the model -- confirm this is intended.
lsvc = LinearSVC(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.1,
)

In [6]:
# List every distinct value of the "class" column.
# Re-registering the view is idempotent; it keeps this cell runnable on its own.
df.createOrReplaceTempView('df')
distinct_class_query = "SELECT DISTINCT class from df"
test = spark.sql(distinct_class_query)
test.show()

+--------------+
|         class|
+--------------+
| Use_telephone|
| Standup_chair|
|      Eat_meat|
|     Getup_bed|
|   Drink_glass|
|    Pour_water|
|     Comb_hair|
|          Walk|
|  Climb_stairs|
| Sitdown_chair|
|   Liedown_bed|
|Descend_stairs|
|   Brush_teeth|
|      Eat_soup|
+--------------+



In [7]:
# LinearSVC is a binary classifier, so keep only the rows belonging to two of
# the classes.
df.createOrReplaceTempView('df')
two_class_query = "select * from df where class in ('Use_telephone','Standup_chair')"
df_two_class = spark.sql(two_class_query)


In [8]:
# Re-split using only the two-class subset: roughly 80% training, 20% test.
# This intentionally replaces the df_train / df_test created from the full
# data set earlier in the notebook.
# A fixed seed keeps the split (and therefore the reported metrics) from
# changing between runs, given the same input file and partitioning.
splits = df_two_class.randomSplit([0.8, 0.2], seed=42)
df_train, df_test = splits

In [9]:
from pyspark.ml import Pipeline
# Chain the preprocessing stages and the classifier. Order matters: each stage
# reads columns produced by the stages before it.
pipeline = Pipeline(
    stages=[
        indexer,
        encoder,
        vectorAssembler,
        normalizer,
        lsvc,
    ]
)


In [10]:
# Fit every pipeline stage on the training split only, so that the test split
# stays unseen until evaluation.
model = pipeline.fit(dataset=df_train)



In [11]:
# Score the training split first; this gives a baseline to compare against the
# held-out test split scored further below.
prediction = model.transform(dataset=df_train)

In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the model on the predictions computed above.
# labelCol and metricName are the PySpark defaults, written out to make clear
# that the number shown is the area under the ROC curve, not accuracy.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluator.evaluate(prediction)

0.9395406906222871

In [13]:
# Score the held-out test split. This rebinds `prediction`, which previously
# held the training-split predictions.
prediction = model.transform(dataset=df_test)

In [14]:
# Area under the ROC curve for whatever `prediction` currently holds (the
# test-split predictions when the notebook is run top to bottom).
# labelCol and metricName are the PySpark defaults, written out for clarity.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluator.evaluate(prediction)

0.9359381731178495