Final Grade: 50/50

## Importing Packages

In [None]:
#pyspark packages
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
# Reuse the active SparkSession if there is one, otherwise create one named 'spark-intro'.
spark = SparkSession.builder.appName('spark-intro').getOrCreate()
# `f` is the namespace alias used for Spark SQL functions throughout the notebook;
# a few functions are additionally imported by name on the next line.
import pyspark.sql.functions as f
from pyspark.sql.functions import array, col, explode, lit,when,to_timestamp,split
from pyspark.sql import DataFrame
from pyspark.ml.feature import StandardScaler,MinMaxScaler,VectorAssembler,StringIndexer
from pyspark.ml.classification import LogisticRegression,RandomForestClassifier
from pyspark.ml import feature, evaluation,Pipeline,PipelineModel,classification
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

#python packages
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import pandas as pd

# Suppress all Python warnings for the rest of the session (this also hides
# deprecation warnings raised by the plotting libraries).
import warnings
warnings.filterwarnings('ignore')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 34 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 45.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=91c8de2a420b4a8f2121d12313bd4ed21744a15841a2225e029b260bb27afbce
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


## Reading Dataset

In [None]:
%%bash
# Do not change or modify this cell
# Need to install pyspark
# All pip output (stdout and stderr) is discarded, so nothing is printed whether
# pyspark gets installed now or was already installed.
pip install pyspark >& /dev/null 

# Download the data files from github
# If the data file does not exist in the colab environment
data_file_1=Watch_Data.csv

# Only download when the file is not already present in the current directory.
if [[ ! -f ./${data_file_1} ]]; then 
   # download the data file from github and save it in this colab environment instance
   # NOTE(review): wget output is discarded too, so a failed download is silent
   # and only surfaces later when the CSV is read.
   wget https://raw.githubusercontent.com/chaitanyagithub201296/Activity_Recognition-IST718/main/${data_file_1} >& /dev/null 
fi


In [None]:
# Load the downloaded CSV into a Spark DataFrame: the first row supplies the
# column names and the column types are inferred from the data.
# NOTE(review): the path is absolute while the download cell saves into the
# current directory; the two only agree when the notebook runs from /content.
watch_csv_path = '/content/Watch_Data.csv'
WatchData = spark.read.csv(watch_csv_path, header=True, inferSchema=True)

# Report (row count, column count) as a quick sanity check on the load.
n_rows = WatchData.count()
n_cols = len(WatchData.columns)
print('Watch Data shape: ', (n_rows, n_cols))

Watch Data shape:  (708604, 13)


## Data Summary

In [None]:
# Print the first rows of the raw data as a text table.
WatchData.show()

+-------+------+------------+-------------+------------+------------+------------+----+-------+---------+----------+------+-----------+
|    _c0| Index|Arrival_Time|Creation_Time|           x|           y|           z|User|  Model|   Device|        gt|Sensor|  bjerkmean|
+-------+------+------------+-------------+------------+------------+------------+----+-------+---------+----------+------+-----------+
| 512878|  9335|   1.4247E12|   2.11586E14|    8.565842|  -2.6647034|   1.6535492|   h|lgwatch|lgwatch_1|  stairsup|   Acc|0.355368205|
|1005537|  9474|   1.4247E12|   2.12086E14|  -0.3095703| 0.024017334|   0.1826477|   h|lgwatch|lgwatch_1|      bike|   GYR|0.201501982|
| 741419|  6618|  1.42469E12|    1.2128E14| -0.08958435|  0.03793335|-0.008865356|   c|lgwatch|lgwatch_2|      walk|   GYR|0.991062621|
| 997336| 60660|   1.4247E12|   1.20709E12|   1.3659353| -0.29827404| -0.32357407|   h|   gear|   gear_1|stairsdown|   GYR|0.321801398|
| 575284|252071|   1.4247E12|   2.09062E14|  -1.

In [None]:
# Print per-column summary statistics (count, mean, ...) for the raw data.
WatchData.summary().show()

+-------+------------------+------------------+--------------------+--------------------+-------------------+------------------+------------------+------+-------+---------+------+------+-------------------+
|summary|               _c0|             Index|        Arrival_Time|       Creation_Time|                  x|                 y|                 z|  User|  Model|   Device|    gt|Sensor|          bjerkmean|
+-------+------------------+------------------+--------------------+--------------------+-------------------+------------------+------------------+------+-------+---------+------+------+-------------------+
|  count|            708604|            708604|              708604|              708604|             708604|            708604|            708604|708604| 708604|   708604|708604|708604|             708604|
|   mean|506303.33641356806|163191.13514036048|1.424752712742236...|2.230687083993444...|-0.2490253369574122|-1.316481994536018|1.0147995169201294|  null|   null|     null|

In [None]:
# List (column name, inferred Spark type) pairs; note that both time columns are
# read as plain doubles at this point.
WatchData.dtypes

[('_c0', 'int'),
 ('Index', 'int'),
 ('Arrival_Time', 'double'),
 ('Creation_Time', 'double'),
 ('x', 'double'),
 ('y', 'double'),
 ('z', 'double'),
 ('User', 'string'),
 ('Model', 'string'),
 ('Device', 'string'),
 ('gt', 'string'),
 ('Sensor', 'string'),
 ('bjerkmean', 'double')]

## Data Manipulation

In [None]:
# Coarse activity category: 'sit', 'stand' and the literal string 'null' count as
# static; every other activity is treated as motion.
WatchData = WatchData.withColumn(
    'gt_category',
    f.when((f.col("gt") == 'sit') | (f.col("gt") == 'stand') | (f.col("gt") == 'null'), 'static')
    .otherwise('motion')
)

# Time-of-day bucket.
# NOTE(review): at this point Arrival_Time is still the raw numeric epoch column
# (the 'HH:mm:ss' string is only derived further down), so these comparisons
# against time strings never match and every row falls through to 'Evening'.
# The logic is deliberately left unchanged here because the Case 2 cell filters
# on time_of_day == 'Evening'; computing real buckets changes which rows that
# filter keeps, so both places have to be revisited together.
WatchData = WatchData.withColumn(
    'time_of_day',
    f.when(f.col("Arrival_time") < '12:00:00', 'Morning')
    .when((f.col("Arrival_time") > '12:00:00') & (f.col("Arrival_time") < '15:00:00'), 'Afternoon')
    .otherwise('Evening')
)

# Integer class label per activity; anything not listed (including 'null') maps to 6.
WatchData = WatchData.withColumn(
    'gt_label',
    f.when(f.col("gt") == 'stairsup', 0)
    .when(f.col("gt") == 'sit', 1)
    .when(f.col("gt") == 'stand', 2)
    .when(f.col("gt") == 'walk', 3)
    .when(f.col("gt") == 'bike', 4)
    .when(f.col("gt") == 'stairsdown', 5)
    .otherwise(6)
)

# Preview the arrival timestamps. The raw value is scaled by 1/1000 exactly like
# the real conversion below; feeding it to to_timestamp unscaled treats epoch
# milliseconds as seconds and displays dates tens of thousands of years ahead.
WatchData.withColumn('Arrival_Time', f.to_timestamp(f.col('Arrival_Time') / 1000)).show(truncate=False)

# Convert both time columns from numeric epoch values to Spark timestamps.
WatchData = WatchData.withColumn("Arrival_Time", f.to_timestamp(WatchData['Arrival_Time'] / 1000))
# NOTE(review): assumes Creation_Time uses the same unit as Arrival_Time - TODO
# confirm; the sample rows show values that differ by orders of magnitude
# between devices.
WatchData = WatchData.withColumn("Creation_Time", f.to_timestamp(WatchData['Creation_Time'] / 1000))

# Split each timestamp's string form 'date time' into its date and time parts.
WatchData = WatchData.withColumn('Arrival_Date', split(col('Arrival_Time'), ' ').getItem(0))
# 'Arrival_time' differs from 'Arrival_Time' only by case; with Spark's default
# case-insensitive column resolution this presumably replaces the timestamp
# column with the time-only string - confirm if the full timestamp is needed later.
WatchData = WatchData.withColumn('Arrival_time', split(col('Arrival_Time'), ' ').getItem(1))

# Attach Creation_Date to WatchData itself, mirroring Arrival_Date; previously
# the result was only stored in `sampleARR`, so the column never reached WatchData.
WatchData = WatchData.withColumn('Creation_Date', split(col('Creation_Time'), ' ').getItem(0))
# Keep the old name bound to the same frame in case another cell still refers to it.
sampleARR = WatchData
WatchData = WatchData.withColumn('Creation_time', split(col('Creation_Time'), ' ').getItem(1))

+-------+------+---------------------+-------------+------------+------------+------------+----+-------+---------+----------+------+-----------+-----------+-----------+--------+
|_c0    |Index |Arrival_Time         |Creation_Time|x           |y           |z           |User|Model  |Device   |gt        |Sensor|bjerkmean  |gt_category|time_of_day|gt_label|
+-------+------+---------------------+-------------+------------+------------+------------+----+-------+---------+----------+------+-----------+-----------+-----------+--------+
|512878 |9335  |+47116-12-12 08:00:00|2.11586E14   |8.565842    |-2.6647034  |1.6535492   |h   |lgwatch|lgwatch_1|stairsup  |Acc   |0.355368205|motion     |Evening    |0       |
|1005537|9474  |+47116-12-12 08:00:00|2.12086E14   |-0.3095703  |0.024017334 |0.1826477   |h   |lgwatch|lgwatch_1|bike      |GYR   |0.201501982|motion     |Evening    |4       |
|741419 |6618  |+47116-08-18 14:13:20|1.2128E14    |-0.08958435 |0.03793335  |-0.008865356|c   |lgwatch|lgwatc

## Data Visualization

In [None]:
#converting to pandas for visualisation purposes
# toPandas() collects the entire Spark DataFrame onto the driver, so this is done
# once here and the resulting pandas frame is meant to be reused by the plots.
watch_df = WatchData.toPandas()

In [None]:
# Model type comparison chart: number of readings per watch model.
# Uses the pandas copy already collected into watch_df instead of calling
# WatchData.toPandas() again, which would re-collect the whole dataset.
fig = px.histogram(watch_df, x="Model", title="Samsung Vs LG")
fig.show()

In [None]:
# Activity type comparison for every user (grouped bars, one colour per activity).
# Uses the pandas copy already collected into watch_df instead of calling
# WatchData.toPandas() again, which would re-collect the whole dataset.
fig = px.histogram(
    watch_df,
    x="User",
    color='gt',
    barmode='group',
    title='Activity type distribution by Users',
)
fig.show()

In [None]:
# Sensor distribution: number of readings per sensor type.
sensor_palette = px.colors.sequential.Viridis
fig = px.histogram(
    watch_df,
    x="Sensor",
    title='Accelerometer Vs Gyroscope',
    color_discrete_sequence=sensor_palette,
)
fig.show()

In [None]:
# Activity distribution: share of readings per activity type, as a pie chart
# with an explicit legend title.
activity_palette = px.colors.sequential.Viridis
fig = px.pie(
    watch_df,
    names='gt',
    title='Activity type distribution',
    color_discrete_sequence=activity_palette,
).update_layout(legend_title_text='Activity type')
fig.show()

In [None]:
# Motion vs. static activity distribution per user, users ordered by total count.
# Uses the pandas copy already collected into watch_df instead of calling
# WatchData.toPandas() again, which would re-collect the whole dataset.
fig = px.histogram(
    watch_df,
    x="User",
    color='gt_category',
    barmode='group',
    title='Users by Category of activity',
)
fig.update_xaxes(categoryorder='total descending')
fig.show()

In [None]:
# Distribution comparison of motion and static activities.
# Uses the pandas copy already collected into watch_df instead of calling
# WatchData.toPandas() again, which would re-collect the whole dataset.
fig = px.histogram(
    watch_df,
    x='gt_category',
    labels={"gt_category": "Activity Category"},
    title="Activity category Distribution",
)
fig.show()

In [None]:
# Split the readings by sensor type so each sensor can be plotted separately.
is_accelerometer = WatchData.Sensor == 'Acc'
is_gyroscope = WatchData.Sensor == 'GYR'
WatchData_ACC = WatchData.filter(is_accelerometer)
WatchData_GYR = WatchData.filter(is_gyroscope)

# Accelerometer x-axis: one overlaid distribution per activity type.
acc_x_grid = sns.FacetGrid(WatchData_ACC.toPandas(), hue='gt', height=6)
acc_x_grid.map(sns.distplot, 'x')
plt.legend(title='Activity type')
plt.title('X-axis values by Activity type')

In [None]:
# Accelerometer y-axis: one overlaid distribution per activity type.
acc_y_grid = sns.FacetGrid(WatchData_ACC.toPandas(), hue='gt', height=6)
acc_y_grid.map(sns.distplot, 'y')
plt.legend(title='Activity type')
plt.title('Y-axis values by Activity type')

In [None]:
# Accelerometer z-axis: one overlaid distribution per activity type.
# FacetGrid's `size` argument was renamed to `height`; using `height` matches the
# x- and y-axis cells and works on current seaborn releases.
sns.FacetGrid(WatchData_ACC.toPandas(), hue='gt', height=6).map(sns.distplot, 'z')
plt.legend(title='Activity type')
plt.title('Z-axis values by Activity type')

In [None]:
# Gyroscope x-axis: one overlaid distribution per activity type.
# FacetGrid's `size` argument was renamed to `height`; using `height` matches the
# accelerometer cells and works on current seaborn releases.
sns.FacetGrid(WatchData_GYR.toPandas(), hue='gt', height=6).map(sns.distplot, 'x')
plt.legend(title='Activity type')
plt.title('X-axis values by Activity type')

In [None]:
# Gyroscope y-axis: one overlaid distribution per activity type.
# FacetGrid's `size` argument was renamed to `height`; using `height` matches the
# accelerometer cells and works on current seaborn releases.
sns.FacetGrid(WatchData_GYR.toPandas(), hue='gt', height=6).map(sns.distplot, 'y')
plt.legend(title='Activity type')
plt.title('Y-axis values by Activity type')

In [None]:
# Gyroscope z-axis: one overlaid distribution per activity type.
# FacetGrid's `size` argument was renamed to `height`; using `height` matches the
# accelerometer cells and works on current seaborn releases.
sns.FacetGrid(WatchData_GYR.toPandas(), hue='gt', height=6).map(sns.distplot, 'z')
plt.legend(title='Activity type')
plt.title('Z-axis values by Activity type')

## Grid Search 

In [None]:
# Switch read by the modelling cells below: True fits one model per
# hyper-parameter combination in each cell's grid and keeps the best one;
# False fits a single model with fixed hyper-parameters.
grid_search = False

## Case 1 - Activity Prediction

Basic classification model with the activity label (`gt_label`, derived from `gt`) as the target variable and `gt_category`, `Sensor`, `x`, `y`, `z`, and `bjerkmean` as input variables.

In [None]:
# Keep only the columns needed for Case 1: raw axis readings, device/sensor
# descriptors, the derived features and the integer target label.
case1_columns = ['x', 'y', 'z', 'Model', 'Sensor', 'bjerkmean', 'gt_category', 'time_of_day', 'gt_label']
WatchData_Case1 = WatchData.select(*case1_columns)

In [None]:
# Encode each categorical column as a numeric index. handleInvalid="keep" makes
# the indexer tolerate values it did not see while fitting.
# NOTE(review): Model_indexer is defined but not part of any pipeline below, and
# time_indexer's output ('encoded_time') is not among the assembled features.
Model_indexer = StringIndexer(inputCol='Model', outputCol='encoded_model', handleInvalid="keep")
category_indexer = StringIndexer(inputCol='gt_category', outputCol='encoded_category', handleInvalid="keep")
time_indexer = StringIndexer(inputCol='time_of_day', outputCol='encoded_time', handleInvalid="keep")
sensor_indexer = StringIndexer(inputCol='Sensor', outputCol='encoded_sensor', handleInvalid="keep")

# Wrap 'bjerkmean' in a one-element vector so it can be standard-scaled.
va_xyz = VectorAssembler(inputCols=['bjerkmean'], outputCol='v_all')
sc_all = StandardScaler(inputCol='v_all', outputCol='scaled_features', withStd=True)

# Final feature vector consumed by the classifiers.
# NOTE(review): gt_category is derived from the same 'gt' column as the target
# gt_label, so including it presumably leaks label information - confirm intent.
model_input_columns = ['encoded_category', 'encoded_sensor', 'x', 'y', 'z', 'scaled_features']
va = VectorAssembler(inputCols=model_input_columns, outputCol='assembled_features')

# 70/15/15 train/validation/test split with a fixed seed.
case1_splits = WatchData_Case1.randomSplit([0.7, 0.15, 0.15], seed=718)
train, validation, test = case1_splits

random forest

In [None]:
# Random forest for Case 1, optionally tuned over the number of trees.

# Accuracy = fraction of rows whose prediction equals gt_label.
accuracy_expr = f.avg(f.expr('float(gt_label = prediction)')).alias('accuracy')

# numTrees=15 is the fixed setting used when grid search is off; the grid
# overrides it per candidate when grid search is on.
rf = RandomForestClassifier(labelCol="gt_label", featuresCol="assembled_features", numTrees=15)
main_pipe = Pipeline(stages=[category_indexer, time_indexer, sensor_indexer, va_xyz, sc_all, va, rf])

if grid_search:
    gridRF = ParamGridBuilder().addGrid(rf.numTrees, [5, 10, 15, 20, 25]).build()
    # Fit one pipeline per candidate parameter map.
    all_models = [main_pipe.fit(train, param_map) for param_map in gridRF]

    # Select the best candidate on the validation split; choosing on the test
    # split (as before) makes the reported test accuracy optimistically biased.
    accuracies = [m.transform(validation).select(accuracy_expr).first().accuracy for m in all_models]
    best_model_idx = np.argmax(accuracies)
    print("best model index =", best_model_idx)
    print("best model params =", gridRF[best_model_idx])
    best_model = all_models[best_model_idx]
    RFModel = best_model
else:
    RFModel = main_pipe.fit(train)

# Report accuracy of the chosen model on the held-out test split.
RFModel.transform(test).select(accuracy_expr).show()



Logistic Regression

In [None]:
# Logistic regression for Case 1, optionally tuned over maxIter, regParam and
# elasticNetParam.

# Accuracy = fraction of rows whose prediction equals gt_label.
accuracy_expr = f.avg(f.expr('float(gt_label = prediction)')).alias('accuracy')

# These are the fixed settings used when grid search is off; the grid overrides
# them per candidate when grid search is on.
lr = LogisticRegression(labelCol="gt_label", featuresCol="assembled_features",
                        maxIter=5, regParam=0.01, elasticNetParam=0.4)
main_pipe3 = Pipeline(stages=[category_indexer, time_indexer, sensor_indexer, va_xyz, sc_all, va, lr])

if grid_search:
    gridLR = (
        ParamGridBuilder()
        .addGrid(lr.maxIter, [1, 5, 10])
        .addGrid(lr.regParam, [0., 0.01, 0.02, 0.03])
        .addGrid(lr.elasticNetParam, [0., 0.2, 0.4, 0.9])
        .build()
    )
    # Fit one pipeline per candidate parameter map.
    all_models = [main_pipe3.fit(train, param_map) for param_map in gridLR]

    # Select the best candidate on the validation split; choosing on the test
    # split (as before) makes the reported test accuracy optimistically biased.
    accuracies = [m.transform(validation).select(accuracy_expr).first().accuracy for m in all_models]
    best_model_idx = np.argmax(accuracies)
    print("best model index =", best_model_idx)
    print("best model params =", gridLR[best_model_idx])
    best_model = all_models[best_model_idx]
    LRModel = best_model
else:
    # Fit the logistic-regression pipeline (main_pipe3). The previous code fitted
    # main_pipe here, i.e. the random-forest pipeline from the cell above, so the
    # reported "logistic regression" accuracy was actually a random forest's.
    LRModel = main_pipe3.fit(train)

# Report accuracy of the chosen model on the held-out test split.
LRModel.transform(test).select(accuracy_expr).show()

## Case 2 - Activity Prediction Based on User

In [None]:
# Build the Case 2 frame: a stratified sample by watch model, narrowed to the
# modelling columns and then to user 'f' and the 'Evening' time-of-day bucket.
model_fractions = {'lgwatch': 0.13, 'gear': 0.87}
case2_columns = ['x', 'y', 'z', 'Sensor', 'bjerkmean', 'time_of_day', 'gt_label', 'user', 'gt_category']
WatchData_case2 = (
    WatchData
    .sampleBy("Model", fractions=model_fractions, seed=123)
    .select(*case2_columns)
    .filter(f.col('user') == 'f')
    .filter(f.col('time_of_day') == 'Evening')
)

In [None]:
# Report (row count, column count) of the Case 2 frame.
case2_shape = (WatchData_case2.count(), len(WatchData_case2.columns))
print('WatchData_case2  shape: ', case2_shape)

In [None]:
# Encode the categorical inputs as numeric indices. handleInvalid="keep" makes
# the indexer tolerate values it did not see while fitting.
category_indexer = StringIndexer(inputCol='gt_category', outputCol='encoded_category', handleInvalid="keep")
sensor_indexer = StringIndexer(inputCol='Sensor', outputCol='encoded_sensor', handleInvalid="keep")

# Wrap 'bjerkmean' in a one-element vector so it can be standard-scaled.
va_xyz = VectorAssembler(inputCols=['bjerkmean'], outputCol='v_all')
sc_all = StandardScaler(inputCol='v_all', outputCol='scaled_features', withStd=True)

# Final feature vector consumed by the classifier.
model_input_columns = ['encoded_category', 'encoded_sensor', 'x', 'y', 'z', 'scaled_features']
va = VectorAssembler(inputCols=model_input_columns, outputCol='assembled_features')

# 70/15/15 train/validation/test split with a fixed seed.
case2_splits = WatchData_case2.randomSplit([0.7, 0.15, 0.15], seed=718)
train, validation, test = case2_splits


In [None]:
# Random forest for Case 2 (single user), optionally tuned over the number of trees.

# Accuracy = fraction of rows whose prediction equals gt_label.
accuracy_expr = f.avg(f.expr('float(gt_label = prediction)')).alias('accuracy')

# numTrees=10 is the fixed setting used when grid search is off; the grid
# overrides it per candidate when grid search is on.
rf = RandomForestClassifier(labelCol="gt_label", featuresCol="assembled_features", numTrees=10)
main_pipe = Pipeline(stages=[category_indexer, sensor_indexer, va_xyz, sc_all, va, rf])

if grid_search:
    gridRF = ParamGridBuilder().addGrid(rf.numTrees, [5, 10, 15, 20, 25]).build()
    # Fit one pipeline per candidate parameter map.
    all_models = [main_pipe.fit(train, param_map) for param_map in gridRF]

    # Select the best candidate on the validation split; choosing on the test
    # split (as before) makes the reported test accuracy optimistically biased.
    accuracies = [m.transform(validation).select(accuracy_expr).first().accuracy for m in all_models]
    best_model_idx = np.argmax(accuracies)
    print("best model index =", best_model_idx)
    print("best model params =", gridRF[best_model_idx])
    best_model = all_models[best_model_idx]
    RFModel = best_model
else:
    RFModel = main_pipe.fit(train)

# Test-set predictions of the chosen model; reused by the error-analysis cell.
predictions = RFModel.transform(test)
predictions.select(accuracy_expr).show()

In [None]:
# Error analysis: for every activity label, count the test rows that were
# classified correctly and incorrectly. Each count is a separate Spark action.
activity_labels = [
    (0, 'Stairsup'),
    (1, 'Sit'),
    (2, 'Stand'),
    (3, 'Walk'),
    (4, 'Bike'),
    (5, 'Stairsdown'),
    (6, 'NULL'),
]

for position, (label_id, label_name) in enumerate(activity_labels):
    # The separator goes between labels only, so none is printed before the first.
    if position > 0:
        print('*******************')
    has_label = predictions.gt_label == label_id
    print('Label - ' + label_name)
    print('correct :', predictions[has_label & (predictions.prediction == label_id)].count())
    print('incorrect :', predictions[has_label & (predictions.prediction != label_id)].count())

## Case 3 - Model Comparison

In [None]:
# Build the Case 3 frame (modelling columns, stratified sample by watch model)
# and split it into one frame per watch model.
case3_columns = ['x', 'y', 'z', 'Sensor', 'bjerkmean', 'time_of_day', 'gt_label', 'user', 'gt_category', 'Model']
model_fractions = {'lgwatch': 0.13, 'gear': 0.87}
WatchData_case3 = (
    WatchData
    .select(*case3_columns)
    .sampleBy("Model", fractions=model_fractions, seed=123)
)
WatchData_case3_LG = WatchData_case3.filter(WatchData_case3['Model'] == 'lgwatch')
WatchData_case3_Samsung = WatchData_case3.filter(WatchData_case3['Model'] == 'gear')

In [None]:
# Report (row count, column count) for each per-model frame.
lg_shape = (WatchData_case3_LG.count(), len(WatchData_case3_LG.columns))
print('WatchData_case3_LG  shape: ', lg_shape)
samsung_shape = (WatchData_case3_Samsung.count(), len(WatchData_case3_Samsung.columns))
print('WatchData_case3_Samsung  shape: ', samsung_shape)

LG watch

In [None]:
# Encode the categorical inputs as numeric indices. handleInvalid="keep" makes
# the indexer tolerate values it did not see while fitting.
category_indexer = StringIndexer(inputCol='gt_category', outputCol='encoded_category', handleInvalid="keep")
sensor_indexer = StringIndexer(inputCol='Sensor', outputCol='encoded_sensor', handleInvalid="keep")

# Wrap 'bjerkmean' in a one-element vector so it can be standard-scaled.
va_xyz = VectorAssembler(inputCols=['bjerkmean'], outputCol='v_all')
sc_all = StandardScaler(inputCol='v_all', outputCol='scaled_features', withStd=True)

# Final feature vector consumed by the classifier.
model_input_columns = ['encoded_category', 'encoded_sensor', 'x', 'y', 'z', 'scaled_features']
va = VectorAssembler(inputCols=model_input_columns, outputCol='assembled_features')

# 70/15/15 train/validation/test split of the LG watch rows with a fixed seed.
lg_splits = WatchData_case3_LG.randomSplit([0.7, 0.15, 0.15], seed=718)
train, validation, test = lg_splits

In [None]:
# Random forest for the LG watch data, optionally tuned over the number of trees.

# Accuracy = fraction of rows whose prediction equals gt_label.
accuracy_expr = f.avg(f.expr('float(gt_label = prediction)')).alias('accuracy')

# numTrees=20 is the fixed setting used when grid search is off; the grid
# overrides it per candidate when grid search is on.
rf = RandomForestClassifier(labelCol="gt_label", featuresCol="assembled_features", numTrees=20)
main_pipe = Pipeline(stages=[category_indexer, sensor_indexer, va_xyz, sc_all, va, rf])

if grid_search:
    gridRF = ParamGridBuilder().addGrid(rf.numTrees, [5, 10, 15, 20, 25]).build()
    # Fit one pipeline per candidate parameter map.
    all_models = [main_pipe.fit(train, param_map) for param_map in gridRF]

    # Select the best candidate on the validation split; choosing on the test
    # split (as before) makes the reported test accuracy optimistically biased.
    accuracies = [m.transform(validation).select(accuracy_expr).first().accuracy for m in all_models]
    best_model_idx = np.argmax(accuracies)
    print("best model index =", best_model_idx)
    print("best model params =", gridRF[best_model_idx])
    best_model = all_models[best_model_idx]
    RFModel = best_model
else:
    RFModel = main_pipe.fit(train)

# Always bind `predictions` to this model's test output. Previously the grid
# branch left it unset, so the error-analysis cell that follows would silently
# report on predictions left over from an earlier case.
predictions = RFModel.transform(test)
predictions.select(accuracy_expr).show()

In [None]:
# Error analysis: for every activity label, count the test rows that were
# classified correctly and incorrectly. Each count is a separate Spark action.
activity_labels = [
    (0, 'Stairsup'),
    (1, 'Sit'),
    (2, 'Stand'),
    (3, 'Walk'),
    (4, 'Bike'),
    (5, 'Stairsdown'),
    (6, 'NULL'),
]

for position, (label_id, label_name) in enumerate(activity_labels):
    # The separator goes between labels only, so none is printed before the first.
    if position > 0:
        print('*******************')
    has_label = predictions.gt_label == label_id
    print('Label - ' + label_name)
    print('correct :', predictions[has_label & (predictions.prediction == label_id)].count())
    print('incorrect :', predictions[has_label & (predictions.prediction != label_id)].count())

Gear

In [None]:
# Encode the categorical inputs as numeric indices. handleInvalid="keep" makes
# the indexer tolerate values it did not see while fitting.
category_indexer = StringIndexer(inputCol='gt_category', outputCol='encoded_category', handleInvalid="keep")
sensor_indexer = StringIndexer(inputCol='Sensor', outputCol='encoded_sensor', handleInvalid="keep")

# Wrap 'bjerkmean' in a one-element vector so it can be standard-scaled.
va_xyz = VectorAssembler(inputCols=['bjerkmean'], outputCol='v_all')
sc_all = StandardScaler(inputCol='v_all', outputCol='scaled_features', withStd=True)

# Final feature vector consumed by the classifier.
model_input_columns = ['encoded_category', 'encoded_sensor', 'x', 'y', 'z', 'scaled_features']
va = VectorAssembler(inputCols=model_input_columns, outputCol='assembled_features')

# 70/15/15 train/validation/test split of the Gear rows with a fixed seed.
gear_splits = WatchData_case3_Samsung.randomSplit([0.7, 0.15, 0.15], seed=718)
train, validation, test = gear_splits

In [None]:
# Random forest for the Gear watch data, optionally tuned over the number of trees.

# Accuracy = fraction of rows whose prediction equals gt_label.
accuracy_expr = f.avg(f.expr('float(gt_label = prediction)')).alias('accuracy')

# numTrees=20 is the fixed setting used when grid search is off; the grid
# overrides it per candidate when grid search is on.
rf = RandomForestClassifier(labelCol="gt_label", featuresCol="assembled_features", numTrees=20)
main_pipe = Pipeline(stages=[category_indexer, sensor_indexer, va_xyz, sc_all, va, rf])

if grid_search:
    gridRF = ParamGridBuilder().addGrid(rf.numTrees, [5, 10, 15, 20, 25]).build()
    # Fit one pipeline per candidate parameter map.
    all_models = [main_pipe.fit(train, param_map) for param_map in gridRF]

    # Select the best candidate on the validation split; choosing on the test
    # split (as before) makes the reported test accuracy optimistically biased.
    accuracies = [m.transform(validation).select(accuracy_expr).first().accuracy for m in all_models]
    best_model_idx = np.argmax(accuracies)
    print("best model index =", best_model_idx)
    print("best model params =", gridRF[best_model_idx])
    best_model = all_models[best_model_idx]
    RFModel = best_model
else:
    RFModel = main_pipe.fit(train)

# Always bind `predictions` to this model's test output. Previously the grid
# branch left it unset, so the error-analysis cell that follows would silently
# report on predictions left over from an earlier case.
predictions = RFModel.transform(test)
predictions.select(accuracy_expr).show()

In [None]:
# Error analysis: for every activity label, count the test rows that were
# classified correctly and incorrectly. Each count is a separate Spark action.
activity_labels = [
    (0, 'Stairsup'),
    (1, 'Sit'),
    (2, 'Stand'),
    (3, 'Walk'),
    (4, 'Bike'),
    (5, 'Stairsdown'),
    (6, 'NULL'),
]

for position, (label_id, label_name) in enumerate(activity_labels):
    # The separator goes between labels only, so none is printed before the first.
    if position > 0:
        print('*******************')
    has_label = predictions.gt_label == label_id
    print('Label - ' + label_name)
    print('correct :', predictions[has_label & (predictions.prediction == label_id)].count())
    print('incorrect :', predictions[has_label & (predictions.prediction != label_id)].count())