In [14]:
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn import preprocessing
from sklearn import impute
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn import metrics
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt

In [15]:
# Read the raw movement-sensor recordings from disk.
raw_data = pd.read_csv('./data/movementSensorData.csv')

# Split the frame by column position.
labels = raw_data.iloc[:, 1]  # activity column (the prediction target)
time = raw_data.iloc[:, 2]    # time column
data = raw_data.iloc[:, 3:5]  # feature columns (positions 3 and 4)

Pre-processing

In [16]:
#Optional step (currently disabled): impute missing values if the data contains any.
#Uses K Nearest Neighbours (n=5) to fill in NaN entries.
#imputer = impute.KNNImputer(missing_values=np.nan, n_neighbors=5)
#imputed_data = pd.DataFrame(imputer.fit_transform(data))

In [17]:
# Rescale the feature values with scikit-learn's Normalizer and re-wrap the
# resulting array as a DataFrame (row and column labels become positional integers).
# NOTE(review): Normalizer works per sample (row), not per feature (column) —
# confirm this is the intended transform.
normaliser = preprocessing.Normalizer()
data = pd.DataFrame(normaliser.fit_transform(data))

In [18]:
# Standardise the features with StandardScaler (centre on the mean, scale by the
# standard deviation) and re-wrap the result as a DataFrame.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/validation split, so validation rows influence the scaling statistics.
scaler = preprocessing.StandardScaler()
data = pd.DataFrame(scaler.fit_transform(data))

In [19]:
#Optional step (currently disabled): feature selection, dropping zero-variance features.
#selector = VarianceThreshold()
#feature_selected_data = selector.fit_transform(data, labels)

In [20]:
#TODO: time clustering (group together multiple datapoints that fall in the same timeframe)

Validation

In [31]:
# Some functions to nicely present findings
def print_metrics(truth, predictions, average='weighted'):
  """Show a confusion matrix and print accuracy, precision, recall and F1.

  Args:
    truth: True labels.
    predictions: Predicted labels, in the same order as `truth`.
    average: Averaging mode forwarded to precision, recall and F1. Defaults
      to 'weighted', matching the plotting helpers in this notebook, so
      multiclass labels are supported. Pass 'binary' for scikit-learn's
      own default behaviour.
  """
  metrics.ConfusionMatrixDisplay.from_predictions(truth, predictions)
  print('Accuracy', metrics.accuracy_score(truth, predictions))
  print('Precision', metrics.precision_score(truth, predictions, average=average))
  print('Recall', metrics.recall_score(truth, predictions, average=average))
  print('F1 Score', metrics.f1_score(truth, predictions, average=average))

def plot_classifiers_figure(train_data, test_data, train_labels, test_labels, names, classifiers):
  """Fit each classifier and draw a grouped bar chart of its test-set scores.

  Every classifier is fitted on the training split, used to predict the test
  split, and scored with accuracy plus weighted precision, recall and F1.

  Args:
    train_data: Features used for fitting.
    test_data: Features used for prediction.
    train_labels: Labels matching `train_data`.
    test_labels: Labels matching `test_data`.
    names: Display names, one per classifier, in the same order.
    classifiers: Estimators exposing `fit(X, y)` and `predict(X)`; they are
      fitted in place.

  If `names` and `classifiers` differ in length, only the leading pairs are
  evaluated and a warning is issued.
  """
  import warnings

  names = list(names)
  classifiers = list(classifiers)
  if len(names) != len(classifiers):
    warnings.warn(
      'plot_classifiers_figure received %d names for %d classifiers; '
      'only the first %d pairs are evaluated'
      % (len(names), len(classifiers), min(len(names), len(classifiers)))
    )
  pairs = list(zip(names, classifiers))

  fig = plt.figure(figsize=(10,5))
  ax = fig.add_axes([0,0,1,1])
  accuracy, precision, recall, f1 = [],[],[],[]
  for name, clf in pairs:
    clf.fit(train_data, train_labels)
    predictions = clf.predict(test_data)

    accuracy.append(metrics.accuracy_score(test_labels, predictions))
    precision.append(metrics.precision_score(test_labels, predictions, average='weighted'))
    recall.append(metrics.recall_score(test_labels, predictions, average='weighted'))
    f1.append(metrics.f1_score(test_labels, predictions, average='weighted'))

  # One bar group per evaluated classifier. Sizing this from the number of
  # evaluated pairs keeps the x positions and the score lists the same length.
  x = np.arange(len(pairs))
  ax.bar(x-0.30, accuracy, color='r', width=0.20, label='Accuracy')
  ax.bar(x-0.10, precision, color='b', width=0.20, label='Precision')
  ax.bar(x+0.10, recall, color='g', width=0.20, label='Recall')
  ax.bar(x+0.30, f1, color = 'y', width=0.20, label='F1 Score')

  plt.xticks(x, [name for name, _ in pairs], rotation=15)
  ax.set_yticks(np.arange(0, 1.05, 0.05))
  ax.set_ylabel('Score')
  ax.set_title('Classifier comparison')
  ax.legend(title="Metrics", loc='upper left')
  ax.grid(alpha=0.5, axis='y')
  plt.show()

def plot_data_figure(train_data, test_data, train_labels, test_labels, classifier,
                     train_sizes=range(500, 3001, 500), test_step=200):
  """Plot how one classifier's scores change as the training subset grows.

  For the k-th entry of `train_sizes` (k starting at 1) the classifier is
  refitted on the first `train_sizes[k-1]` training rows and scored on the
  first `k * test_step` test rows, using accuracy plus weighted precision,
  recall and F1. The scores are drawn as one bar group per training size.

  Args:
    train_data: Features used for fitting; must support `[:n]` slicing.
    test_data: Features used for prediction; must support `[:n]` slicing.
    train_labels: Labels matching `train_data`.
    test_labels: Labels matching `test_data`.
    classifier: Estimator exposing `fit(X, y)` and `predict(X)`; it is
      refitted in place on every step.
    train_sizes: Training-subset sizes to evaluate, in plotting order.
      Defaults to 500, 1000, ..., 3000.
    test_step: Number of test rows added to the evaluation slice at every
      step. Defaults to 200.
  """
  train_sizes = list(train_sizes)

  fig = plt.figure()
  ax = fig.add_axes([0,0,1,1])
  accuracy, precision, recall, f1 = [],[],[],[]
  for step, n_train in enumerate(train_sizes, start=1):
    # The evaluation slice grows together with the training slice.
    n_test = step * test_step
    classifier.fit(train_data[:n_train], train_labels[:n_train])
    predictions = classifier.predict(test_data[:n_test])
    truth = test_labels[:n_test]

    accuracy.append(metrics.accuracy_score(truth, predictions))
    precision.append(metrics.precision_score(truth, predictions, average='weighted'))
    recall.append(metrics.recall_score(truth, predictions, average='weighted'))
    f1.append(metrics.f1_score(truth, predictions, average='weighted'))

  x = np.arange(len(train_sizes))
  ax.bar(x-0.30, accuracy, color='r', width=0.20, label='Accuracy')
  ax.bar(x-0.10, precision, color='b', width=0.20, label='Precision')
  ax.bar(x+0.10, recall, color='g', width=0.20, label='Recall')
  ax.bar(x+0.30, f1, color = 'y', width=0.20, label='F1 Score')

  # Tick labels come from the same sequence that drove the loop.
  plt.xticks(x, train_sizes, rotation=15)
  ax.set_yticks(np.arange(0, 1.05, 0.05))
  ax.set_xlabel('Training samples')
  ax.set_ylabel('Score')
  ax.legend(title="Metrics", loc='upper left')
  ax.grid(alpha=0.5, axis='y')
  plt.show()

In [22]:
# Hold out 20% of the rows for validation; the fixed seed keeps the shuffled split repeatable.
tdata, vdata, tlabels, vlabels = train_test_split(
  data,
  labels,
  test_size=0.2,
  shuffle=True,
  random_state=1452,
)

In [25]:
# Display names and estimators are listed in the same order, one name per
# estimator, because they are paired up positionally when plotting.
names = [
  'Multi Layer Perceptron',
  'Tuned Multi Layer Perceptron',
  'Logistic Regression',
  'Tuned Logistic Regression',
  'Support Vector Classifier',
  'Tuned Support Vector Classifier',
  'Random Forest',
]
# The stochastic estimators are seeded (same seed as the train/test split) so
# that re-running the notebook reproduces the same scores.
# LinearRegression is intentionally not listed: it is a regressor, and its
# continuous predictions cannot be scored with the classification metrics.
classifiers = [
  MLPClassifier(random_state=1452),
  MLPClassifier(hidden_layer_sizes=(25,), activation='relu', solver='adam', learning_rate='adaptive',
                alpha=0.001, batch_size=100, random_state=1452),
  LogisticRegression(max_iter=1000),  # the default max_iter did not converge
  LogisticRegression(solver='newton-cg', fit_intercept=False),
  SVC(),
  SVC(kernel='sigmoid'),
  RandomForestClassifier(random_state=1452),
]
assert len(names) == len(classifiers), 'every classifier needs exactly one display name'

In [32]:
# Fit every selected classifier on the training split and plot a side-by-side
# comparison of its scores on the validation split.
plot_classifiers_figure(tdata, vdata, tlabels, vlabels, names, classifiers)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#TODO: export the final predictions once `final_pred` and `filepath` are defined
#final_pred.to_csv(filepath + 'predictions.csv', index=False)