Lifting coach preliminary analysis.

In [1]:
# Import basic libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import sklearn
import statistics
from scipy import stats

# Import machine learning libraries
from sklearn.metrics import accuracy_score, cohen_kappa_score, precision_score, recall_score, roc_auc_score, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the data into python 
Lift_Data = pd.read_csv("D:/Datasets/LiftingCoach/DataTest.csv")
# Sample rate 120 hz

In [3]:
Lift_Data.head()

Unnamed: 0,EventActNum,FP34ok,LoadOnly,aCOM_TNH_x,aCOM_TNH_y,aCOM_TNH_z,aLHAND_x,aLHAND_y,aLHAND_z,aRHAND_x,aRHAND_y,aRHAND_z,aLANK_x,aLANK_y,aLANK_z,aRANK_x,aRANK_y,aRANK_z
0,1,0.52685,-978.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,1,0.48161,-978.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,1,0.44164,-978.45,-0.073276,0.22819,-0.054484,0.936,-0.7884,-0.3744,2.7936,-2.2212,1.44,-2.4552,0.9108,-6.1632,0,0,0
3,1,0.40773,-978.46,-0.07453,0.20628,-0.059563,0.8532,-0.4464,-0.4284,1.2024,-2.1096,0.4572,2.9016,-0.702,5.5296,0,0,0
4,1,0.38021,-978.46,-0.076338,0.19005,-0.063538,0.3636,0.054,-0.0396,-0.1836,-0.162,-0.45,1.1664,-0.576,2.6388,0,0,0


For the purpose of this preliminary data, I will focus on the variables that can be obtained from the wearable sensors. These variables include LoadOnly, aCOM_TNH_x, aCOM_TNH_y, aCOM_TNH_z, aLHAND_x, aLHAND_y, aLHAND_z, aLANK_x, aLANK_y and aLANK_z.

In [4]:
"""
The sliding_window function uses the inputs data, events, an epoch length (seconds), overlap (percentage of overlap) and 
sample rate as a way to generate a dataframe for machine learning.
"""
def sliding_Window(data, events, epoch_Length, overlap, sample_Rate):
    
    # Trial information
    data_Header = data.columns
    number_Of_Samples = len(events) 
    trial_Length = len(events)/sample_Rate
    slide = epoch_Length - (epoch_Length*overlap)
    
    # Define Start and Stop
    n = round(((number_Of_Samples - epoch_Length*sample_Rate)/(slide*sample_Rate)),0)-1
    start = [round(x*slide, 2) for x in range(int(n))]
    stop = [round(x*slide+epoch_Length, 2) for x in range(int(n))]
    
    # Define Mean and Stardard Deviation of data during an epoch
    mean_Data = [[] for j in range(len(data.columns))]
    mean_Names = ['mean_' + x for x in data_Header]
    std_Data = [[] for k in range(len(data.columns))]
    std_Names = ['std_' + x for x in data_Header]
    
    for item in range(len(data.columns)):
        for i in range(len(start)):
            current_Mean = data.iloc[int(start[i]*sample_Rate):int(stop[i]*sample_Rate), item].mean()
            current_STD = data.iloc[int(start[i]*sample_Rate):int(stop[i]*sample_Rate), item].std()
            mean_Data[item].append(current_Mean)
            std_Data[item].append(current_STD)
    
    # Create dataframes for the data
    start_D = pd.DataFrame(start)
    start_D.columns = ['Start']
    stop_D = pd.DataFrame(stop)
    stop_D.columns = ['Stop']
    
    mean_D = pd.DataFrame(mean_Data)
    mean_D = mean_D.transpose()
    mean_D.columns = mean_Names

    std_D = pd.DataFrame(std_Data)
    std_D = std_D.transpose()
    std_D.columns = std_Names 

    revised_Data = pd.concat([start_D, stop_D, mean_D, std_D], axis=1)
    return revised_Data

#j = sliding_Window(Lift_Data, events, 1.5, 0.9, 120)

In [5]:
def sliding_Window_Events(events, epoch_Length, overlap, sample_Rate):
    number_Of_Samples = len(events) 
    trial_Length = len(events)/sample_Rate
    slide = epoch_Length - (epoch_Length*overlap)
    
    # Define Start and Stop
    n = round(((number_Of_Samples - epoch_Length*sample_Rate)/(slide*sample_Rate)),0)-1
    start = [round(x*slide, 2) for x in range(int(n))]
    stop = [round(x*slide+epoch_Length, 2) for x in range(int(n))]
    
    revised_Events = []
    for i in range(len(start)):
        current_Epoch = events.iloc[int(start[i]*sample_Rate):int(stop[i]*sample_Rate)]
        current_Event = np.bincount(current_Epoch).argmax()
        revised_Events.append(current_Event)
    
    
    event_Data = pd.DataFrame(revised_Events)
    event_Data.columns = ['Events']
    return event_Data
    
#k = sliding_Window_Events(events, 1.5, 0.9, 120)

In [6]:
def dummy_Events(events):
    # 1-Stand-Free, 3-Lift-Squat, 5-Lower-Squat
    event_Data = pd.get_dummies(events['Events'])
    event_Data.columns = ['Stand_Free', 'Lift_Squat', 'Lower_Squat']
    return event_Data

In [7]:
# Load the data into python 
Lift_Data = pd.read_csv("D:/Datasets/LiftingCoach/DataTest.csv")
# Sample rate 120 hz
data = Lift_Data[Lift_Data.columns[1:]]
events = Lift_Data['EventActNum']

In [11]:
def initialize_Event_Array(lengths, overlaps, list_Events, algorithms, sample_Rate):
    # Define the matrix dimensions
    maxO = max(overlaps)
    minL = min(test_Lengths)
    slide = minL - (minL*maxO)
    
    # Define Start
    number_Of_Samples = len(events)
    n = round(((number_Of_Samples - minL*sample_Rate)/(slide*sample_Rate)),0)-1
    start = [round(x*slide, 2) for x in range(int(n))]
    
    generated_List = []
    # Create an array of zeros
    for current_Lengths in lengths:
        for current_Overlap in overlaps:
            for current_List in list_Events:
                for current_Algorithms in algorithms:
                    name = [current_List, current_Algorithms, str(current_Lengths), str(current_Overlap)]
                    current_Column = '_'.join(name)
                    generated_List.append(current_Column)
    feature_List = ['Start','Tested_Epochs', list_Events[0], list_Events[1], list_Events[2]] + generated_List
    building_Array = pd.DataFrame(0, index=np.arange(len(start)), columns=feature_List)
    building_Array['Start'] = start
    return building_Array

# test_Lengths = [1.5, 2.0, 2.5, 3.0]
# overlaps = [0.5, 0.9]
# list_Events = ['Stand_Free', 'Lift_Squat', 'Lower_Squat']
# algorithms = ['LDA', 'KNN', 'NB', 'QDA', 'LR']
# tt = initialize_Event_Array(test_Lengths, overlaps, list_Events, algorithms, 120)

In [12]:
def change_To_Time(index, my_Epoch_Length, my_Overlap, events):
    number_Of_Samples = len(events) 
    trial_Length = len(events)/sample_Rate
    slide = my_Epoch_Length - (my_Epoch_Length*my_Overlap)
    
    # Define Start and Stop
    n = round(((number_Of_Samples - my_Epoch_Length*sample_Rate)/(slide*sample_Rate)),0)-1
    start = [round(x*slide, 2) for x in range(int(n))]

In [31]:
# Create empty lists to append to.
names = []
Event = []
Chance = []
Overlap = []
Lengths = []
Accuracy = []
TestAcc = []
Precision = []
Recall = []
Kappa = []

# Create test conditions to go into the loop
#test_Lengths = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
#overlaps = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

test_Lengths = [2.0]
overlaps = [0.5, 0.9]
list_Events = ['Stand_Free', 'Lift_Squat', 'Lower_Squat']
algorithms = ['LDA', 'KNN', 'NB', 'QDA', 'LR']
cv = ShuffleSplit(5, test_size=0.2, random_state=42)

events_Array = initialize_Event_Array(test_Lengths, overlaps, list_Events, algorithms, 120)

for my_Event in list_Events:
    for my_Epoch_Length in test_Lengths:
        for my_Overlap in overlaps:

            # Create dataset for machine learning
            labels = sliding_Window_Events(events, my_Epoch_Length, my_Overlap, 120)
            event = dummy_Events(labels)
            chosen_Event = event[my_Event] # 1-Stand_Free, 3-Lift_Squat, 5-Lower_Squat
            current_Event = my_Event
            #n = round(((number_Of_Samples - minL*sample_Rate)/(slide*sample_Rate)),0)-1
            data_Frame = pd.DataFrame(sliding_Window(data, events, my_Epoch_Length, my_Overlap, 120))
            data_Only = data_Frame[data_Frame.columns[2:]]

            # Define the parameters
            svcParameters = [{'gamma': 10.0 ** np.arange(-5, 4), 'C': 10.0 ** np.arange(-2, 7)}]
            ldaParameters = [{'n_components' : [10, 20, 30, 40], 'solver': ['svd', 'lsqr', 'eigen']}]
            knnParameters = [{'n_neighbors' : [5, 10, 15, 20]}]
            #artParameters = [{'max_features' : [2, 4, 6, 8, 10]}]
            #fParameters = [{'max_features': ["sqrt", "log2", None, 1], 'max_depth':[None, 15, 30, 50], 'min_samples_leaf': [2, 5, 10]}]

            # Assemble default classifiers
            svc = SVC(probability=True)
            lda = LinearDiscriminantAnalysis()
            knn = KNeighborsClassifier()
            #cart = DecisionTreeClassifier()
            nb = GaussianNB()
            qda = QuadraticDiscriminantAnalysis()
            lr = LogisticRegression()
            #rf = RandomForestClassifier()

            # Assemble grid parameter classifiers
            svc2 = GridSearchCV(SVC(probability=True), svcParameters, cv=cv, scoring='accuracy')
            lda2 = GridSearchCV(LinearDiscriminantAnalysis(), ldaParameters, cv=cv, scoring='accuracy')
            knn2 = GridSearchCV(KNeighborsClassifier(), knnParameters, cv=cv, scoring='accuracy')
            #cart2 = GridSearchCV(DecisionTreeClassifier(), cartParameters, cv=cv, scoring='accuracy')
            #nb2 = GridSearchCV(GaussianNB(), nbParameters, cv=cv, scoring='accuracy')
            #qda2 = GridSearchCV(QuadraticDiscriminantAnalysis(), qdaParameters, cv=cv, scoring='accuracy')
            #rf2 = GridSearchCV(RandomForestClassifier(), rfParameters, cv=cv, scoring='accuracy')
            
            # Split the test and train datasets
            for train_idx, test_idx in cv.split(labels):
                y_train, y_test = chosen_Event.loc[train_idx], chosen_Event.loc[test_idx]
                X_train, X_test = data_Only.loc[train_idx,:], data_Only.loc[test_idx,:]
                
            # Perform the machine learning
            currentName = algorithms
            #currentName = ['LDA']
            #classifier = [ Pipeline([('KNN', knn2)]), Pipeline([('QDA', qda)]), Pipeline([('LR', lr)])]
            classifier = [Pipeline([('LDA', lda)]), Pipeline([('KNN', knn)]), Pipeline([('NB', nb)]), Pipeline([('QDA', qda)]), Pipeline([('LR', lr)])]
            #classifier = [Pipeline([('LDA', lda)])]
            count = 0
            for my_Classifier in classifier:
                sAccuracy = cross_val_score(my_Classifier, X_train, y_train, scoring = 'accuracy', cv=cv, n_jobs=1)

                # Printing the results
                labels = chosen_Event
                class_balance = np.mean(labels == labels[0])
                class_balance = np.mean(labels)
                class_balance = max(class_balance, 1. - class_balance)
                y_pred = my_Classifier.fit(X_train, y_train).predict(X_test)
                y_pred_prob = my_Classifier.predict_proba(X_test)
                #y_pred_class = binarize(y_pred_prob, 0.5)[:,0] # Predict events labeled as 1 not 2 which in this case is zero.

                rocTest = (1-y_test)+1
                #auc = roc_auc_score(rocTest, y_pred_class)
                tacc = accuracy_score(y_test, y_pred)
                prec = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)
                kappa = cohen_kappa_score(y_test, y_pred)

                names.append(currentName[count])
                Event.append(current_Event)
                Lengths.append(my_Epoch_Length)
                Chance.append(class_balance)
                Overlap.append(my_Overlap)
                Accuracy.append(np.mean(sAccuracy))
                TestAcc.append(tacc)
                Precision.append(prec)
                Recall.append(recall)
                #AUC.append(auc)
                Kappa.append(kappa)
                count = count + 1
                
                header_Name = [my_Event, list(my_Classifier.named_steps.keys())[0], str(my_Epoch_Length), str(my_Overlap)]
                current_Column = '_'.join(header_Name)
                
                
                # Fix This
                #events_Array['tested_Column'][[test_idx]] = 1
                #events_Array[current_Column][[test_idx]] = y_pred

#myData = {'Name' : names, 'Event': Event, 'Epochs' : Lengths, 'Chance' : Chance, 'Accuracy' : Accuracy, 'TestAcc' : TestAcc, 'Precision': Precision, 'Recall': Recall, 'AUC' : AUC, 'Kappa': Kappa}
myData = {'Name' : names, 'Event': Event, 'Epochs' : Lengths, 'Overlap' : Overlap,  'Accuracy' : Accuracy, 'TestAcc' : TestAcc, 'Precision': Precision, 'Recall': Recall,  'Kappa': Kappa}

test5 = pd.DataFrame(myData, columns = ['Name', 'Event', 'Epochs', 'Overlap', 'Accuracy', 'TestAcc', 'Precision', 'Recall', 'Kappa'])
#test5.to_csv('testing.csv')





In [32]:
test5

Unnamed: 0,Name,Event,Epochs,Overlap,Accuracy,TestAcc,Precision,Recall,Kappa
0,LDA,Stand_Free,2.0,0.5,0.861818,0.955882,0.959184,0.979167,0.892178
1,KNN,Stand_Free,2.0,0.5,0.818182,0.779412,0.866667,0.8125,0.491018
2,NB,Stand_Free,2.0,0.5,0.843636,0.823529,0.909091,0.833333,0.598425
3,QDA,Stand_Free,2.0,0.5,0.858182,0.897059,1.0,0.854167,0.775047
4,LR,Stand_Free,2.0,0.5,0.912727,0.897059,0.918367,0.9375,0.748414
5,LDA,Stand_Free,2.0,0.9,0.903676,0.88563,0.877729,0.948113,0.750418
6,KNN,Stand_Free,2.0,0.9,0.909559,0.926686,0.934884,0.948113,0.843426
7,NB,Stand_Free,2.0,0.9,0.85,0.826979,0.927374,0.783019,0.649708
8,QDA,Stand_Free,2.0,0.9,0.899265,0.88563,0.98324,0.830189,0.768451
9,LR,Stand_Free,2.0,0.9,0.951471,0.953079,0.957944,0.966981,0.899945


In [98]:
test5['Epochs'][[2,3,4,5]]=[1, 2, 3, 4]
test5['Epochs'][[2,3,4,5]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


2    1.0
3    2.0
4    3.0
5    4.0
Name: Epochs, dtype: float64

In [198]:
classifier = [Pipeline([('LDA', lda)]), Pipeline([('QDA', qda)])]

In [199]:
len(classifier)

2

In [205]:
classifier[1]

TypeError: 'Pipeline' object is not subscriptable

In [223]:
jupyter nbconvert --to html Lifiting%20Coach%20Preliminary%20Analysis.ipynb

SyntaxError: invalid syntax (<ipython-input-223-781f60392dc7>, line 1)