In [1]:
# delete files from previous runs
!rm -f hmp.parquet*

# download the file containing the data in PARQUET format
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
# create a dataframe out of it
df = spark.read.parquet('hmp.parquet')

# register a corresponding query table
df.createOrReplaceTempView('df')

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20200218061201-0000
KERNEL_ID = 1275b842-1671-4ff0-ba7f-f8c41c48c1b1
--2020-02-18 06:12:03--  https://github.com/IBM/coursera/raw/master/hmp.parquet
Resolving github.com (github.com)... 192.30.253.112
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet [following]
--2020-02-18 06:12:04--  https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.8.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.8.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932997 (911K) [application/octet-stream]
Saving to: 'hmp.parquet'


2020-02-18 06:12:04 (22.3 MB/s) - 'hmp.parquet' saved [932997/932997]



In [3]:
df.show(5)

+---+---+---+--------------------+-----------+
|  x|  y|  z|              source|      class|
+---+---+---+--------------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|
+---+---+---+--------------------+-----------+
only showing top 5 rows



In [2]:
# create the dataset splits: train and test

splits = df.randomSplit([0.8, 0.2])
df_train = splits[0]
df_test = splits[1]

In [5]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Normalizer
from pyspark.ml.linalg import Vectors

indexer = StringIndexer(inputCol='class', outputCol = 'label')

vectorAssembler = VectorAssembler(inputCols=['x', 'y', 'z'], outputCol='features')

normalizer = Normalizer(inputCol='features', outputCol='features_norm', p=1.0)

In [8]:
from pyspark.ml.classification import LogisticRegression

In [10]:
# logisitic regression
logreg = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [11]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = [indexer, vectorAssembler, logreg])

In [13]:
# on the training data first
model = pipeline.fit(df_train)

prediction = model.transform(df_train)

In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# select the appropriate metric
eval = MulticlassClassificationEvaluator().setMetricName('accuracy').setLabelCol('label').setPredictionCol('prediction')

In [18]:
# accuracy for train data
eval.evaluate(prediction)

0.2066926981866701

In [19]:
# Now do the same for the test data

model = pipeline.fit(df_test)

prediction = model.transform(df_test)

In [20]:
# accuracy of test data
eval.evaluate(prediction)

0.20624401327660333