In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
import pickle
from sklearn import svm, metrics
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline 

In [2]:
filedir = ".\\data\\"
activities = ['cycling', 'driving', 'jogging', 'sleeping', 'walking']
epoch_length = 360

In [3]:
pickle_in = open(filedir+"data\\data.pickle","rb")
data_dict = pickle.load(pickle_in)
pickle_in.close()
X = data_dict["svm_array"]

In [4]:
y = data_dict["label"]

---

## Data Split 90/10

In [5]:
test_size = int(int(X.shape[0] * 0.1) / len(set(y))) * len(set(y))
test_size

810

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=19011954, stratify=y)

## SVM 5

In [7]:
# Create a classifier: a support vector classifier
SVM_classifier = svm.SVC(kernel='rbf', C=1.0, gamma = "auto", random_state=19011954)
#fit to the trainin data
SVM_classifier.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=19011954, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
y_pred_SVM = SVM_classifier.predict(X_test)

In [9]:
accuracy_SVM1 = metrics.accuracy_score(y_test, y_pred_SVM)
accuracy_SVM1

0.8728395061728395

In [10]:
results = open("accuracy_svm5.txt","w")
results.write(str(accuracy_SVM1))
results.close()

In [11]:
print("Classification report for classifier %s:\n%s\n"
      % (SVM_classifier, metrics.classification_report(y_test, y_pred_SVM, digits = 4)))

Classification report for classifier SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=19011954, shrinking=True,
    tol=0.001, verbose=False):
              precision    recall  f1-score   support

     cycling     0.7738    0.8025    0.7879       162
     driving     0.7485    0.7716    0.7599       162
     jogging     0.9701    1.0000    0.9848       162
    sleeping     0.9796    0.8889    0.9320       162
     walking     0.9068    0.9012    0.9040       162

    accuracy                         0.8728       810
   macro avg     0.8758    0.8728    0.8737       810
weighted avg     0.8758    0.8728    0.8737       810




In [12]:
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, y_pred_SVM))

Confusion matrix:
[[130  24   2   0   6]
 [ 24 125   2   3   8]
 [  0   0 162   0   0]
 [ 11   5   1 144   1]
 [  3  13   0   0 146]]


## Optimizing SVM

In [13]:
## https://medium.com/@aneesha/svm-parameter-tuning-in-scikit-learn-using-gridsearchcv-2413c02125a0
from sklearn.model_selection import GridSearchCV
def svc_param_selection(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf', random_state=19011954), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_

In [14]:
svc_param_selection(X_train, y_train, 5)

{'C': 10, 'gamma': 0.1}

---

In [15]:
# Create a classifier: a support vector classifier
SVM_classifier_optimized = svm.SVC(kernel='rbf', C=10, gamma = 0.1, random_state=19011954)
#fit to the trainin data
SVM_classifier_optimized.fit(X_train,y_train)

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=19011954, shrinking=True,
    tol=0.001, verbose=False)

In [16]:
y_pred_SVM_optimized = SVM_classifier_optimized.predict(X_test)

In [17]:
accuracy_SVM_optimized = metrics.accuracy_score(y_test, y_pred_SVM_optimized)
accuracy_SVM_optimized

0.9111111111111111

In [18]:
results = open("accuracy_svm_optimized.txt","w")
results.write(str(accuracy_SVM_optimized))
results.close()

In [19]:
print("Classification report for classifier %s:\n%s\n"
      % (SVM_classifier, metrics.classification_report(y_test, y_pred_SVM_optimized, digits = 4)))

Classification report for classifier SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=19011954, shrinking=True,
    tol=0.001, verbose=False):
              precision    recall  f1-score   support

     cycling     0.8696    0.8642    0.8669       162
     driving     0.8036    0.8333    0.8182       162
     jogging     0.9529    1.0000    0.9759       162
    sleeping     0.9934    0.9321    0.9618       162
     walking     0.9434    0.9259    0.9346       162

    accuracy                         0.9111       810
   macro avg     0.9126    0.9111    0.9115       810
weighted avg     0.9126    0.9111    0.9115       810




In [20]:
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, y_pred_SVM_optimized))

Confusion matrix:
[[140  20   2   0   0]
 [ 15 135   4   1   7]
 [  0   0 162   0   0]
 [  2   5   2 151   2]
 [  4   8   0   0 150]]


## Random Forest 5

In [21]:
# Create a classifier: a random forest classifier
RF_classifier = RandomForestClassifier(n_estimators = 1000, random_state = 19011954)
#fit to the trainin data
RF_classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=19011954,
                       verbose=0, warm_start=False)

In [22]:
y_pred_RF = RF_classifier.predict(X_test)

In [23]:
accuracy_RF1 = metrics.accuracy_score(y_test, y_pred_RF)
accuracy_RF1

0.9567901234567902

In [24]:
results = open("accuracy_rf5.txt","w")
results.write(str(accuracy_RF1))
results.close()

In [25]:
print("Classification report for classifier %s:\n%s\n"
      % (RF_classifier, metrics.classification_report(y_test, y_pred_RF, digits = 4)))

Classification report for classifier RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=19011954,
                       verbose=0, warm_start=False):
              precision    recall  f1-score   support

     cycling     0.9608    0.9074    0.9333       162
     driving     0.8902    0.9506    0.9194       162
     jogging     1.0000    1.0000    1.0000       162
    sleeping     0.9936    0.9630    0.9781       162
     walking     0.9455    0.9630    0.9541       162

    accuracy                         0.9568       810
   macro avg     0.9580    0.9

In [26]:
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, y_pred_RF))

Confusion matrix:
[[147  13   0   1   1]
 [  3 154   0   0   5]
 [  0   0 162   0   0]
 [  1   2   0 156   3]
 [  2   4   0   0 156]]


## Optimizing Random Forest

In [27]:
## https://medium.com/@aneesha/svm-parameter-tuning-in-scikit-learn-using-gridsearchcv-2413c02125a0
from sklearn.model_selection import GridSearchCV
def rf_param_selection(X, y, nfolds):
    param_grid = {'bootstrap': [True],
    'max_depth': [80, 90, 100],
    'max_features': [2, 3],
    'min_samples_leaf': [1, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]}
    grid_search = GridSearchCV(RandomForestClassifier(random_state = 19011954), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_

In [28]:
rf_param_selection(X_train, y_train, 5)

{'bootstrap': True,
 'max_depth': 80,
 'max_features': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}

---

In [29]:
# Create a classifier: a random forest classifier
RF_classifier_optimized = RandomForestClassifier(max_depth = 80, max_features = 3, min_samples_leaf = 1, min_samples_split = 2, n_estimators = 200, random_state = 19011954)
#fit to the trainin data
RF_classifier_optimized.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=80, max_features=3,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=19011954,
                       verbose=0, warm_start=False)

In [30]:
y_pred_RF_optimized = RF_classifier.predict(X_test)

In [31]:
accuracy_RF_optimized = metrics.accuracy_score(y_test, y_pred_RF_optimized)
accuracy_RF_optimized

0.9567901234567902

In [32]:
results = open("accuracy_rf_optimized.txt","w")
results.write(str(accuracy_RF_optimized))
results.close()

In [33]:
print("Classification report for classifier %s:\n%s\n"
      % (RF_classifier, metrics.classification_report(y_test, y_pred_RF_optimized, digits = 4)))

Classification report for classifier RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=19011954,
                       verbose=0, warm_start=False):
              precision    recall  f1-score   support

     cycling     0.9608    0.9074    0.9333       162
     driving     0.8902    0.9506    0.9194       162
     jogging     1.0000    1.0000    1.0000       162
    sleeping     0.9936    0.9630    0.9781       162
     walking     0.9455    0.9630    0.9541       162

    accuracy                         0.9568       810
   macro avg     0.9580    0.9

In [34]:
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, y_pred_RF_optimized))

Confusion matrix:
[[147  13   0   1   1]
 [  3 154   0   0   5]
 [  0   0 162   0   0]
 [  1   2   0 156   3]
 [  2   4   0   0 156]]
