In [1]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torchvision
import torchvision.transforms as transforms
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch import nn
from tqdm import notebook


# Use the first CUDA device when one is visible, otherwise fall back to the CPU,
# and report which one was picked.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Running on CUDA: ", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print("Running on CPU")

Running on CUDA:  NVIDIA GeForce RTX 4070


In [2]:
# Load the full labelled sensor recording from CSV; it is split into
# train / validation / test partitions further down.
df = pd.read_csv('./data/movementSensorData.csv')

### Exploring the data

In [3]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,id,activity,time_s,lw_x,lw_y,lw_z
0,63804,2,638.05,-0.188,-0.941,-0.316
1,63805,2,638.06,-0.121,-0.879,-0.32
2,63806,2,638.07,-0.07,-0.852,-0.305
3,63807,2,638.08,-0.023,-0.879,-0.277
4,63808,2,638.09,0.008,-0.941,-0.242


In [4]:
# Row and column counts of the loaded frame.
df.shape

(507827, 6)

In [5]:
# Summary statistics for each column.
df.describe()

Unnamed: 0,id,activity,time_s,lw_x,lw_y,lw_z
count,507827.0,507827.0,507827.0,507827.0,507827.0,507827.0
mean,167785.10183,4.191809,1677.861018,-0.211302,-0.021941,-0.477602
std,63888.316941,8.785676,638.883169,0.52407,0.727952,0.443465
min,63804.0,1.0,638.05,-5.289,-5.305,-6.875
25%,89195.0,2.0,891.96,-0.734,-0.219,-0.828
50%,188844.0,4.0,1888.45,-0.098,0.184,-0.57
75%,220583.0,4.0,2205.84,0.176,0.426,-0.148
max,252322.0,77.0,2523.23,5.516,4.418,4.551


In [6]:
# Count missing values per column.
df.isnull().sum()

id          0
activity    0
time_s      0
lw_x        0
lw_y        0
lw_z        0
dtype: int64

In [7]:
# Distinct activity codes present in the data.
df['activity'].unique()

array([ 2, 77,  1,  3,  4], dtype=int64)

So we have activities 1, 2, 3, 4, and 77. According to the source (https://physionet.org/content/accelerometry-walk-climb-drive/1.0.0/#files), these codes mean:
- 1 Walking
- 2 Descending Stairs
- 3 Ascending Stairs
- 4 Driving
- 77 Clapping


### Preprocessing

In [8]:
# Fit a StandardScaler on the whole frame and keep the transformed copy.
# NOTE(review): the scaler is fitted on every column of df, including `id` and the
# `activity` label, and before the train/test split. `scaler_train` is not referenced
# again in the visible part of this notebook, so the models below appear to train on
# unscaled features — confirm whether that is intended.
scaler=StandardScaler()
scaler.fit(df)
scaler_train = scaler.transform(df)

In [9]:
# Features are the timestamp plus the three lw_* axis columns; the target is the activity code.
# NOTE(review): time_s is kept as a feature — confirm this is intended, since each
# activity may occupy a contiguous stretch of time.
X = df.loc[:, 'time_s':'lw_z']
y = df['activity']
print(X, y, sep='\n')

        time_s   lw_x   lw_y   lw_z
0       638.05 -0.188 -0.941 -0.316
1       638.06 -0.121 -0.879 -0.320
2       638.07 -0.070 -0.852 -0.305
3       638.08 -0.023 -0.879 -0.277
4       638.09  0.008 -0.941 -0.242
...        ...    ...    ...    ...
507822  963.87 -0.012  0.984 -0.363
507823  963.88  0.016  0.938 -0.379
507824  963.89  0.039  0.910 -0.391
507825  963.90  0.066  0.898 -0.395
507826  963.91  0.105  0.895 -0.398

[507827 rows x 4 columns]
0         2
1         2
2         2
3         2
4         2
         ..
507822    1
507823    1
507824    1
507825    1
507826    1
Name: activity, Length: 507827, dtype: int64


In [10]:
# 80 / 10 / 10 split: set aside 20% as a holdout, then halve the holdout into
# test and validation partitions. The same seed is used for both splits.
# NOTE(review): rows are consecutive 10 ms samples and the split is random;
# TimeSeriesSplit is imported but unused — confirm a shuffled split is intended.
SPLIT_SEED = 101
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)
X_test, X_validation, y_test, y_validation = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SPLIT_SEED
)

### Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier
#random_forest = RandomForestClassifier(max_depth=16, random_state=1452, n_estimators=1000)
#random_forest.fit(X_train, y_train) 

In [12]:
#y_pred = random_forest.predict(X_validation)
#print('accuracy', metrics.accuracy_score(y_validation, y_pred))
#print('f1', metrics.f1_score(y_validation, y_pred, average='weighted'))
#accuracy 0.9750113226867259
#f1 0.9736860087875407

In [13]:
#with open('models/random_forest.pickle', 'wb') as rf_file:
#    pickle.dump(random_forest, rf_file)

### Decision Tree

In [14]:
from sklearn import tree
#decision_tree = tree.DecisionTreeClassifier(random_state=1452, max_depth=64, criterion='entropy')
#decision_tree.fit(X_train, y_train)

In [15]:
#y_pred = decision_tree.predict(X_validation)
#print('accuracy', metrics.accuracy_score(y_validation, y_pred))
#print('f1', metrics.f1_score(y_validation, y_pred, average='weighted'))
##accuracy 0.9880668727723845
##f1 0.9880127222223519

In [16]:
#with open('models/decision_tree.pickle', 'wb') as rf_file:
#    pickle.dump(decision_tree, rf_file)

### K Nearest Neighbours

In [17]:
from sklearn.neighbors import KNeighborsClassifier
#kneighbours = KNeighborsClassifier(n_neighbors=7, weights='distance')
#kneighbours.fit(X_train, y_train)

In [18]:
#y_pred = kneighbours.predict(X_validation)
#print('accuracy', metrics.accuracy_score(y_validation, y_pred))
#print('f1', metrics.f1_score(y_validation, y_pred, average='weighted'))
##accuracy 0.9951361676151468
##f1 0.9951142459904004

In [19]:
#with open('models/kneighbours.pickle', 'wb') as rf_file:
#    pickle.dump(kneighbours, rf_file)

## Ensemble Classification


In [20]:
from sklearn.ensemble import VotingClassifier

# Base learners shared by every ensemble in this notebook, as (name, estimator) pairs.
# Hyperparameters match the individual experiments in the sections above.
classifiers = [
    # n_jobs=-1 builds the 1000 trees on all CPU cores. The recorded single-threaded
    # fit took 5.5 min, and this estimator is refitted by every ensemble and CV fold.
    # random_state still fixes the seed of each tree.
    ('Random Forest', RandomForestClassifier(max_depth=16, random_state=1452, n_estimators=1000, n_jobs=-1)),
    ("Decision Tree", tree.DecisionTreeClassifier(random_state=1452, max_depth=64, criterion='entropy')),
    ("K Neighbours", KNeighborsClassifier(n_neighbors=7, weights='distance'))
]

# Hard voting: every base learner casts one vote per sample and the majority label wins.
hard_ensemble = VotingClassifier(
    estimators=classifiers,
    voting='hard')

hard_ensemble.fit(X_train, y_train)


[Voting] ............ (1 of 3) Processing Random Forest, total= 5.5min
[Voting] ............ (2 of 3) Processing Decision Tree, total=   0.6s
[Voting] ............. (3 of 3) Processing K Neighbours, total=   0.2s


In [21]:
# Score the hard-voting ensemble on the validation partition.
y_pred = hard_ensemble.predict(X_validation)
for metric_name, metric_value in (
    ('accuracy', metrics.accuracy_score(y_validation, y_pred)),
    ('f1', metrics.f1_score(y_validation, y_pred, average='weighted')),
):
    print(metric_name, metric_value)

accuracy 0.9926747139790875
f1 0.9925876760232433


In [22]:
# Persist the fitted hard-voting ensemble to disk.
hard_ensemble_path = Path('models/hard_ensemble.pickle')
# Create models/ if it is missing; otherwise opening the file raises
# FileNotFoundError and the trained model is not saved.
hard_ensemble_path.parent.mkdir(parents=True, exist_ok=True)
with hard_ensemble_path.open('wb') as model_file:
    pickle.dump(hard_ensemble, model_file)

### Maybe if we weight our ensemble...

In [None]:
# Soft voting over the shared base learners. The weights follow the order of
# `classifiers`: Random Forest, Decision Tree, K Neighbours.
soft_vote_weights = [1, 2, 1.5]
soft_ensemble = VotingClassifier(estimators=classifiers, voting='soft', weights=soft_vote_weights)

soft_ensemble.fit(X_train, y_train)

In [None]:
# Score the soft-voting ensemble on the validation partition.
y_pred = soft_ensemble.predict(X_validation)
for metric_name, metric_value in (
    ('accuracy', metrics.accuracy_score(y_validation, y_pred)),
    ('f1', metrics.f1_score(y_validation, y_pred, average='weighted')),
):
    print(metric_name, metric_value)

In [None]:
# Persist the fitted soft-voting ensemble to disk.
soft_ensemble_path = Path('models/soft_ensemble.pickle')
# Create models/ if it is missing; otherwise opening the file raises
# FileNotFoundError and the trained model is not saved.
soft_ensemble_path.parent.mkdir(parents=True, exist_ok=True)
with soft_ensemble_path.open('wb') as model_file:
    pickle.dump(soft_ensemble, model_file)

### Or better yet, stack it?

In [26]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Stack the shared base learners under a 7-nearest-neighbour final estimator.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so the
# saved notebook has no fitted stacking model — expect a long fit on a fresh run.
meta_learner = KNeighborsClassifier(n_neighbors=7)
stacking_ensemble = StackingClassifier(estimators=classifiers, final_estimator=meta_learner)

stacking_ensemble.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Score the stacking ensemble on the validation partition.
y_pred = stacking_ensemble.predict(X_validation)
for metric_name, metric_value in (
    ('accuracy', metrics.accuracy_score(y_validation, y_pred)),
    ('f1', metrics.f1_score(y_validation, y_pred, average='weighted')),
):
    print(metric_name, metric_value)

In [None]:
# Persist the fitted stacking ensemble to disk.
stacking_ensemble_path = Path('models/stacking_ensemble.pickle')
# Create models/ if it is missing; otherwise opening the file raises
# FileNotFoundError and the trained model is not saved.
stacking_ensemble_path.parent.mkdir(parents=True, exist_ok=True)
with stacking_ensemble_path.open('wb') as model_file:
    pickle.dump(stacking_ensemble, model_file)

Using my unseen test data to cross-validate and determine which ensemble method performs best. While the performance benefits aren't necessarily huge at this point, the hope is that these ensembles will generalise better than an individual model.

In [None]:
from sklearn.model_selection import cross_val_score
# The three ensembles built above, paired with the label printed next to each score.
ensemble_classifiers = [
    ('Hard Voting', hard_ensemble),
    ('Soft Voting', soft_ensemble),
    ('Stacking', stacking_ensemble)
]
# NOTE(review): cross_val_score refits each estimator on every fold (the recorded
# [Voting] log lines repeat per fold), so this reports 5-fold accuracy of models
# trained on slices of X_test, not the accuracy of the ensembles already fitted on
# X_train — confirm that is the intended comparison.
for (label, clf) in ensemble_classifiers:
    scores = cross_val_score(clf, X_test, y_test, scoring='accuracy', cv=5)
    # Report the mean and standard deviation of the fold accuracies.
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.98 (+/- 0.00) [Random Forest]
Accuracy: 0.97 (+/- 0.00) [Decision Tree]
Accuracy: 0.97 (+/- 0.00) [K Neighbours]
[Voting] ............ (1 of 3) Processing Random Forest, total=  34.5s
[Voting] ............ (2 of 3) Processing Decision Tree, total=   0.1s
[Voting] ............. (3 of 3) Processing K Neighbours, total=   0.0s
[Voting] ............ (1 of 3) Processing Random Forest, total=  35.4s
[Voting] ............ (2 of 3) Processing Decision Tree, total=   0.1s
[Voting] ............. (3 of 3) Processing K Neighbours, total=   0.0s
[Voting] ............ (1 of 3) Processing Random Forest, total=  34.9s
[Voting] ............ (2 of 3) Processing Decision Tree, total=   0.1s
[Voting] ............. (3 of 3) Processing K Neighbours, total=   0.0s
[Voting] ............ (1 of 3) Processing Random Forest, total=  34.5s
[Voting] ............ (2 of 3) Processing Decision Tree, total=   0.1s
[Voting] ............. (3 of 3) Processing K Neighbours, total=   0.0s
[Voting] ............ (