<a href="https://colab.research.google.com/github/dwittaker/CSE535/blob/master/KERAS_Test_CSE535_Project_2_post_submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Header

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 14 08:38:55 2020

This script makes use of JSON file based data (with posenet output) for the classification of sign language videos.
It commences with pre-processing of data including extraction of the data into a useful format, 
followed by the development of features including transformation into various formats.
Subsequently, the data is fed into various classification algorithms where we try to
classify the sign language portrayed in the video.
"""

import math
from os import walk
from os import path
import json
import pandas as pd
import numpy as np
import datetime
import pywt
from scipy.fftpack import fft
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import sklearn.preprocessing as skprep
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pickle
import keras
import keras.utils
from keras.optimizers import Adam
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# Console display limits for pandas so that long/wide tables print in full.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('expand_frame_repr', True)

# Input and scratch folders (the paths look like a Colab Google Drive mount).
path_testdata = '/content/drive/My Drive/poseclassifier/test'
#path_trainingdata = '..\\..\\flaskr\\uploads' #original posenet upload directory. no longer used
path_trainingdata2 = '/content/drive/My Drive/poseclassifier/training'
path_tempfiles = '/content/drive/My Drive/CSE535Temp/'
#path_classifydata = '..\\..\\flaskr\\uploads\\classify' #original posenet upload directory. no longer used

# Body parts skipped while the long-format table is built (currently none).
#list_excludedparts = ['leftKnee', 'rightKnee', 'leftAnkle', 'rightAnkle']
list_excludedparts = []

# Body-part names considered by the per-part normalisation.
listpartsinuse = [
    'nose',
    'leftEye', 'rightEye',
    'leftEar', 'rightEar',
    'leftShoulder', 'rightShoulder',
    'leftHip', 'rightHip',
    'leftKnee', 'rightKnee',
    'leftAnkle', 'rightAnkle',
    'leftElbow', 'rightElbow',
    'leftWrist', 'rightWrist',
]

# Gesture labels; a gesture's position in this list is its numeric class id.
list_gesture = ['BUY', 'COMMUNICATE', 'FUN', 'HOPE', 'MOTHER', 'REALLY']
#list_gesture = ['BUY', 'HOUSE', 'FUN', 'HOPE', 'ARRIVE', 'REALLY', 'READ', 'LIP', 'MOUTH', 'SOME', 'COMMUNICATE', 'WRITE', 'CREATE', 'PRETEND', 'SISTER', 'MAN', 'ONE', 'DRIVE', 'PERFECT', 'MOTHER']
#list_gestureindex = [i.key for i in enumerate(list_gesture)]

# Module-level scratch containers filled while the JSON folders are scanned.
ftst, ftrn, ftrn2 = [], [], []
dict_tst, dict_trn, dict_trn2 = {}, {}, {}


# Feature Development

In [1]:
def plainfunc(lst):
    """Pass the sequence through untransformed.

    Returns a two-item list: the input itself and a plotting hint
    (['plot', '']).
    """
    plot_hint = ['plot', '']
    return [lst, plot_hint]

def fftfunc(lst):
    """Return the magnitudes of the upper half of the sequence's FFT.

    The result is [magnitudes, ['stem', '']], where magnitudes covers the
    bins from index len // 2 to the end of the transform.
    """
    magnitudes = np.abs(fft(lst))
    upper_half = magnitudes[len(magnitudes) // 2:]
    return [upper_half, ['stem', '']]

def meanfunc(lst):
    """Return the arithmetic mean of the sequence as [[mean], ['plot', 'o']]."""
    total = sum(lst)
    count = float(len(lst))
    return [[total / count], ['plot', 'o']]

def stdfunc(lst):
    """Return the standard deviation (numpy.std) as [[std], ['plot', 'o']]."""
    spread = np.std(lst)
    return [[spread], ['plot', 'o']]

def varfunc(lst):
    """Return the variance (numpy.var) as [[variance], ['plot', 'o']]."""
    variance = np.var(lst)
    return [[variance], ['plot', 'o']]

def dwtmultifunc(lst):
    """Return the coarsest approximation of a multi-level 'db1' wavelet decomposition.

    The decomposition depth is 4, or the maximum level pywt allows for the
    sequence length when that is smaller. Only the first coefficient array
    returned by pywt.wavedec is kept; the plotting hint is ['plot', ''].
    """
    wavelet = pywt.Wavelet('db1')
    depth = min(4, pywt.dwt_max_level(len(lst), wavelet))
    coefficients = pywt.wavedec(lst, wavelet, level=depth)
    return [coefficients[0], ['plot', '']]

def dwtbasicfunc(lst):
    """Return the approximation coefficients of a single-level 'db2' wavelet transform.

    pywt.dwt returns a pair of coefficient arrays; only the first one is
    kept. The plotting hint is ['plot', ''].
    """
    wavelet = pywt.Wavelet('db2')
    approximation = pywt.dwt(lst, wavelet)[0]
    return [approximation, ['plot', '']]

def rmsfunc(lst):
    """Return the root mean square of the sequence as [[rms], ['plot', 'o']]."""
    squares = np.square(lst)
    mean_square = np.mean(squares)
    return [[np.sqrt(mean_square)], ['plot', 'o']]

def maxfunc(lst):
    """Return the largest value of the sequence as [[max], ['plot', 'o']]."""
    largest = np.max(lst)
    return [[largest], ['plot', 'o']]

def minfunc(lst):
    """Return the smallest value of the sequence as [[min], ['plot', 'o']]."""
    smallest = np.min(lst)
    return [[smallest], ['plot', 'o']]

def energyfunc(lst):
    """Return the signal energy (sum of squared magnitudes) as [[energy], ['plot', 'o']]."""
    squared_magnitudes = np.square(np.absolute(lst))
    # The built-in sum is kept so the accumulation order matches element order.
    energy = sum(squared_magnitudes)
    return [[energy], ['plot', 'o']]

def dNRD(lst):
    """Return the first differences of the sequence.

    Thin wrapper: delegates to zCRMaxDfunction with the 'DNRD' selector.
    """
    selector = 'DNRD'
    return zCRMaxDfunction(lst, selector)

def zCR(lst):
    """Return the positions where the sequence's slope changes sign.

    Thin wrapper: delegates to zCRMaxDfunction with the 'ZCR' selector.
    """
    selector = 'ZCR'
    return zCRMaxDfunction(lst, selector)

def mDA(lst):
    """Return the largest slope swings found around slope sign changes.

    Thin wrapper: delegates to zCRMaxDfunction with the 'MDA' selector.
    """
    selector = 'MDA'
    return zCRMaxDfunction(lst, selector)

def ZMD(lst):
    """Return the DNRD, ZCR and MDA features together.

    Thin wrapper: delegates to zCRMaxDfunction with the 'ALL' selector.
    """
    selector = 'ALL'
    return zCRMaxDfunction(lst, selector)


def zCRMaxDfunction(lst, func):
    """Compute slope-based features of a sequence.

    Three quantities are derived from the first differences of ``lst``:

    * DNRD - the first differences themselves;
    * ZCR  - indices (into the difference array) where the sign of the
      difference flips; a difference counts as positive only when it is
      strictly greater than zero;
    * MDA  - for every such flip, the max-minus-min of the differences in
      a window of 5 positions either side of the flip.

    ZCR and MDA are limited to the five flips with the largest MDA value,
    ordered by descending MDA.

    Args:
        lst: one-dimensional numeric sequence.
        func: 'DNRD', 'ZCR', 'MDA' or 'ALL'; parts that are not selected are
            returned as empty lists.

    Returns:
        [[dnrd, zcr, mda], ['plot', 'o']]. A selected ZCR/MDA part is the
        scalar 0.0 (not a list) when no sign flip was found. Sequences with
        fewer than two values produce no differences and therefore no flips
        instead of raising IndexError.
    """
    diffNormRawData = np.diff(lst)
    numDiffs = len(diffNormRawData)
    windowSize = 5

    # Collected in plain lists; growing a NumPy array with np.append inside
    # the loop copies the whole array on every flip.
    crossings = []
    swings = []

    if numDiffs > 0:
        currentSign = 1 if diffNormRawData[0] > 0 else 0
        for x in range(1, numDiffs):
            newSign = 1 if diffNormRawData[x] > 0 else 0
            if newSign == currentSign:
                continue
            currentSign = newSign
            crossings.append(x)

            # Window of differences around the flip, clamped to the array.
            lower = max(0, x - windowSize)
            upper = min(numDiffs, x + windowSize)
            window = diffNormRawData[lower:upper]
            swings.append(np.amax(window) - np.amin(window))

    zeroCrossingArray = np.array(crossings, dtype=float)
    maxDiffArray = np.array(swings, dtype=float)

    # Indices of the flips ordered by descending swing; keep the top five.
    top = np.argsort(-maxDiffArray)[0:5]

    #Based on the parameter provided, return the appropriately transformed data
    itm1 = list(diffNormRawData) if func in ['DNRD', 'ALL'] else []
    itm2 = list(zeroCrossingArray[top]) if len(zeroCrossingArray) > 0 else 0.0
    itm2 = itm2 if func in ['ZCR', 'ALL'] else []
    itm3 = list(maxDiffArray[top]) if len(maxDiffArray) > 0 else 0.0
    itm3 = itm3 if func in ['MDA', 'ALL'] else []

    return [[itm1, itm2, itm3], ['plot', 'o']]

def clip(l, limit):
    """Trim a sequence down to its central ``limit`` elements.

    When the number of surplus elements is odd, the extra element is
    removed from the front. A sequence that already has ``limit`` or fewer
    elements is returned unchanged.

    Args:
        l: any sliceable sequence (list, NumPy array, ...).
        limit: number of central elements to keep.

    Returns:
        The central slice of ``l`` (or ``l`` itself when nothing is cut).
    """
    surplus = len(l) - limit
    if surplus <= 0:
        return l
    lo = math.ceil(surplus / 2)
    hi = math.floor(surplus / 2)
    # An explicit end index is required: l[lo:-hi] collapses to an empty
    # slice whenever hi is 0.
    return l[lo:len(l) - hi]

def developfeaturematrix(features, dataset, aggregate, limit):
    """Build one feature row per (gesture, sample) pair.

    Every row starts with ``[gesture index, sample index]`` and is followed
    by the generated features.

    * ``aggregate`` false: every data column of the sample is clipped to its
      central ``limit`` frames and passed through each transform whose key
      appears in ``features``; the flattened results are concatenated.
    * ``aggregate`` true: the row holds, for every frame of the sample, the
      mean across all data columns.

    Args:
        features: container of transform keys ('plain', 'energy', 'fft',
            'mean', 'std', 'var', 'dwt', 'dwtMulti', 'rms', 'max', 'min',
            'zcr', 'mda', 'dnrd', 'zmd'). Membership is tested with ``in``.
        dataset: DataFrame with 'Gesture', 'Sample' and 'Frame' columns plus
            the data columns.
        aggregate: selects the aggregate mode described above.
        limit: number of central frames kept per column (non-aggregate mode).

    Returns:
        ``np.array`` built from the list of rows.

    Samples are looked up by the indices 0..n-1, where n is the number of
    distinct 'Sample' values of the gesture, so sample ids are expected to
    be numbered that way.
    """
    #names of available functions as listed above
    ftroptions = {
        'plain': plainfunc,
        'energy': energyfunc,
        'fft': fftfunc,
        'mean': meanfunc,
        'std': stdfunc,
        'var': varfunc,
        'dwt': dwtbasicfunc,
        'dwtMulti': dwtmultifunc,
        'rms': rmsfunc,
        'max': maxfunc,
        'min': minfunc,
        'zcr': zCR,
        'mda': mDA,
        'dnrd': dNRD,
        'zmd': ZMD,
    }

    listcolumns = [col for col in dataset.columns if col not in ['Gesture', 'Sample', 'Frame']]

    if aggregate:
        print("aggregate style")

    featmatrix = []
    for i_gesture in range(len(list_gesture)):
        print("Feature Creation for Gesture : " + str(i_gesture))
        # Filter the gesture once instead of once per sample and column.
        gesture_rows = dataset[dataset.Gesture == i_gesture]
        totsam = len(gesture_rows.Sample.unique())
        for x_sample in range(totsam):
            sample_rows = gesture_rows.loc[gesture_rows.Sample == x_sample, listcolumns]
            onerow = [i_gesture, x_sample]

            if aggregate:
                # 'Frame' is not among the selected columns, so only the
                # per-frame mean of the data columns is taken here.
                onerow.extend(sample_rows.mean(axis=1, skipna=True))
            else:
                for col in listcolumns:
                    lst = clip(sample_rows[col].values, limit)
                    for key, transform in ftroptions.items():
                        if key in features:
                            # Each transform returns [values, plot hint].
                            onerow.extend(flatten2list(transform(lst)[0]))

            featmatrix.append(flatten2list(onerow))

    return np.array(featmatrix)

#https://symbiosisacademy.org/tutorial-index/python-flatten-nested-lists-tuples-sets/
def flatten2list(object):
    """Flatten arbitrarily nested lists, tuples and sets into one flat list.

    Items of any other type (including strings and NumPy arrays) are copied
    to the result as they are.
    """
    flat = []
    for element in object:
        if not isinstance(element, (list, tuple, set)):
            flat.append(element)
            continue
        flat += flatten2list(element)
    return flat
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

def runpca(matrix, perc):
    """Reduce the feature columns with Principal Component Analysis.

    The first column of ``matrix`` is treated as the class label; all other
    columns are standardised and projected onto the smallest set of leading
    principal components whose cumulative explained variance exceeds
    ``perc`` percent. If the threshold is never exceeded, all components
    are kept.

    Args:
        matrix: sequence of rows, class label first.
        perc: explained-variance threshold in percent (0-100).

    Returns:
        Array holding the projected data with the class label appended as
        the LAST column.
    """
    # NOTE(review): createClassifierData in this file reads the label from
    # column 0, while this function returns it in the last column - confirm
    # how callers consume this output before re-enabling PCA.
    classcol = np.array([row[0] for row in matrix])
    pcamatrix = np.array([row[1:] for row in matrix])

    X_std = skprep.StandardScaler().fit_transform(pcamatrix)

    # atleast_2d keeps the single-feature case (0-d covariance) usable.
    cov_mat = np.atleast_2d(np.cov(X_std.T))

    # The covariance matrix is symmetric, so eigh applies: it returns real
    # eigenvalues, which keeps the ordering below well defined.
    eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

    # Order the eigenpairs from largest to smallest eigenvalue.
    order = np.argsort(eig_vals)[::-1]
    eig_vals = eig_vals[order]
    eig_vecs = eig_vecs[:, order]

    exp_var_percentage = perc  # Threshold of explained variance
    cum_var_exp = np.cumsum(eig_vals / eig_vals.sum() * 100)

    # Default to every component; shrink to the first count that crosses
    # the threshold. Crossing at position `index` needs index + 1 vectors.
    num_vec_to_keep = len(eig_vals)
    for index, percentage in enumerate(cum_var_exp):
        if percentage > exp_var_percentage:
            num_vec_to_keep = index + 1
            break
    print("number of vec to keep=%s" % num_vec_to_keep)

    # Projection matrix made of the leading eigenvectors (one per column).
    proj_mat = eig_vecs[:, :num_vec_to_keep]

    # Project the data and put the class label back as the last column.
    pca_data = X_std.dot(proj_mat)
    pca_data = np.append(pca_data, classcol.reshape(-1, 1), 1)

    print("PCA Done - New Feature Matrix created")
    return pca_data


# Data preprocessing

In [None]:

def printcharts(dataset, activity):
    """Plot every data column of every sample, one panel per (gesture, column).

    All samples of a gesture are overlaid in the same panel. The figure is
    shown and then saved as "Chart<activity>.png" in the working directory;
    finally the list of plotted columns is printed.

    Args:
        dataset: DataFrame with 'Gesture', 'Sample' and 'Frame' columns plus
            the data columns to plot.
        activity: text inserted into the output file name.
    """
    #Used to visually review the data for possible common patterns amongst different feature sets
    fontsize = 10
    pltsize = 18
    x = 0
    
    listcolumns = [col for col in dataset.columns if col not in ['Gesture', 'Sample', 'Frame']]
    
    list_gest = list_gesture#[:1]
    # NOTE(review): the grid is fixed at 24 rows x 2 columns, while rowcnt
    # below is never reset between gestures. More than 48 (gesture, column)
    # panels would index past the grid - confirm whether the "[:1]" slice
    # above is meant to be active when this is used.
    fig, axes = plt.subplots(24, 2, figsize=(pltsize, 24*4*4*2))
    rowcnt = 0
    for i_gesture in range(len(list_gest)):
            for i, col in enumerate(sorted(listcolumns)):
                # Move to the next grid row after the right-hand panel
                # (x == 1) of the previous iteration has been used.
                rowcnt = rowcnt + 1 if x == 1 else rowcnt
                x = i % 2

                for x_sample in range(len(dataset[(dataset.Gesture == i_gesture)].Sample.unique())):
                    yvalues = dataset.loc[(dataset.Gesture == i_gesture) & (dataset.Sample == x_sample), col].values
                    xvalues = [i for i in range(len(yvalues))]
                    axes[rowcnt, x].set_title("Gesture"+ str(i_gesture) + " - " + col, loc='left', fontsize=8, position=(0.0,0.1))
                    
                    axes[rowcnt, x].grid(True)
                    axes[rowcnt, x].yaxis.set_ticks([])
                    # NOTE(review): plt.yticks acts on pyplot's *current*
                    # axes, not on axes[rowcnt, x] - confirm this is intended.
                    plt.yticks(xvalues, "")
                    lines, = axes[rowcnt, x].plot(xvalues, yvalues)
                    plt.setp(lines, linewidth=1.0)
                

    plt.show()

    fig.savefig("Chart"+activity+".png")
    print(listcolumns)

def createtable(data):
    """Convert the loaded key-point dictionaries into a long-format table.

    Keys containing 'PRACTICE' are split on '_' (gesture name first, sample
    number third); all other keys are split on '-' (gesture name first,
    sample number second). Each value is iterated as a list of frames whose
    'keypoints' entries carry a 'part' name and a 'position' with 'x'/'y'.

    Returns:
        DataFrame with one row per kept key point and the columns Gesture
        (index into list_gesture), Sample, Frame (0-based within the key),
        Part ('NN-name', NN counting the kept parts of the frame), X and Y.
        Parts listed in list_excludedparts are skipped.
    """
    rows = []
    for gesturekey, gesturevalues in data.items():
        if 'PRACTICE' in gesturekey:
            pieces = gesturekey.split('_')
            samplename = int(pieces[2])
        else:
            pieces = gesturekey.split('-')
            samplename = int(pieces[1])
        gesturename = pieces[0]

        for frameno, frame in enumerate(gesturevalues):
            kept = (pt for pt in frame['keypoints'] if pt['part'] not in list_excludedparts)
            for partno, point in enumerate(kept):
                label = '{:02d}'.format(partno) + '-' + point['part']
                position = point['position']
                rows.append([
                    list_gesture.index(gesturename),
                    samplename,
                    frameno,
                    str(label),
                    position['x'],
                    position['y'],
                ])

    return pd.DataFrame(rows, columns=['Gesture', 'Sample', 'Frame', 'Part', 'X', 'Y'])

def fixcolumn(choice, txt):
    """Turn a stringified two-item column label into a flat column name.

    Labels without a '-' are returned unchanged. Otherwise the characters
    ( ) and ' are removed and the text is split on ', ':

    * choice == "partswide": "<second>_<first>"
    * any other choice:      "<first>_Frm_<second as 3-digit integer>"
    """
    if '-' not in txt:
        return txt

    stripped = txt.translate(str.maketrans('', '', "()'"))
    pieces = stripped.split(', ')

    if choice == "partswide":
        return pieces[1] + "_" + pieces[0]
    return pieces[0] + "_Frm_" + '{:03d}'.format(int(pieces[1]))

def sortcolumns(numbasic, lst):
    """Keep the first ``numbasic`` items in place and sort the remainder.

    Args:
        numbasic: number of leading items to leave untouched (converted
            with int()).
        lst: indexable sequence of column names.

    Returns:
        New list: the leading items in their original order followed by the
        remaining items in ascending order.
    """
    # A non-positive count means that nothing is pinned to the front.
    keep = max(int(numbasic), 0)
    items = [lst[pos] for pos in range(len(lst))]
    return items[:keep] + sorted(items[keep:])

def normalize_data(partswide_datatbl, activity):
    """Mean-centre and range-scale the coordinates of every body-part track.

    For each (Gesture, Sample, Part) group and each value column (every
    column except 'Part', 'Gesture', 'Sample' and 'Frame') the values are
    replaced by ``(v - mean) / (max - min + 1e-10)`` computed within the
    group. Only parts whose name is listed in ``listpartsinuse`` are
    touched.

    The 'Part' labels written by createtable carry a numeric 'NN-' prefix
    (for example '00-nose'), while ``listpartsinuse`` holds bare names, so
    the prefix is stripped before the two are compared. Comparing the raw
    labels never matches and leaves the table unchanged.

    Args:
        partswide_datatbl: long-format DataFrame with 'Gesture', 'Sample',
            'Frame', 'Part' and the value columns. Modified in place.
        activity: label of the data set being processed; not used.

    Returns:
        The same DataFrame object, with normalised value columns.
    """
    listcolumns = [col for col in partswide_datatbl.columns if col not in ['Part', 'Gesture', 'Sample', 'Frame']]

    print("Normalizing Data")
    print(partswide_datatbl.head())

    # Strip the 'NN-' index prefix so labels are comparable to listpartsinuse.
    bare_part = partswide_datatbl.Part.astype(str).str.replace(r'^\d+-', '', regex=True)
    in_use = bare_part.isin(listpartsinuse)
    if not listcolumns or not in_use.any():
        return partswide_datatbl

    def _scale(values):
        # The small constant avoids a division by zero for constant tracks.
        centered = values - values.mean()
        return centered / ((centered.max() - centered.min()) + 0.0000000001)

    selected = partswide_datatbl.loc[in_use]
    scaled = selected.groupby(['Gesture', 'Sample', 'Part'])[listcolumns].transform(_scale)
    # transform() keeps the row order of `selected`, so a positional
    # assignment is safe even if the index holds duplicate labels.
    partswide_datatbl.loc[in_use, listcolumns] = scaled.to_numpy()

    return partswide_datatbl

def distancecalc(X1, Y1, X2, Y2):
    """Return the Euclidean distance between the points (X1, Y1) and (X2, Y2)."""
    # Each point must be passed to np.array as ONE sequence: the second
    # positional argument of np.array is the dtype, not another coordinate.
    first = np.array([X1, Y1])
    second = np.array([X2, Y2])
    return np.linalg.norm(first - second)

def originshiftcalc(dyn, stat1, stat2):
    """Express ``dyn`` relative to two anchor values.

    The first element of ``stat1.values`` is the origin and the distance to
    the first element of ``stat2.values`` is the unit length, so the result
    is ``(origin - dyn) / (origin - other anchor)``.
    """
    origin = stat1.values[0]
    offset = origin - dyn
    unit = origin - stat2.values[0]
    return offset / unit

def normalize_to_nose(datatbl2, activity):
    """Re-express every key point relative to the nose and the left hip.

    Within each frame the X and Y coordinates of all parts are replaced by
    ``(nose - value) / (nose - left hip)``, using the rows labelled
    '00-nose' and '11-leftHip' of that frame as anchors.

    Args:
        datatbl2: long-format DataFrame with the columns 'Gesture',
            'Sample', 'Frame', 'Part', 'X' and 'Y'. Modified in place.
        activity: label of the data set being processed; not used.

    Returns:
        ``datatbl2`` itself, holding the normalised coordinates.

    Raises:
        IndexError: a frame has no '00-nose' or no '11-leftHip' row.

    Samples are addressed as 0..n-1 per gesture and frames as every integer
    between the smallest and largest 'Frame' value of the sample.
    """
    for i_gesture in range(len(list_gesture)):
        gesture_mask = datatbl2.Gesture == i_gesture
        num_samples = datatbl2.loc[gesture_mask, 'Sample'].nunique()
        for x_sample in range(num_samples):

            print(i_gesture, x_sample)
            sample_mask = gesture_mask & (datatbl2.Sample == x_sample)
            frames = datatbl2.loc[sample_mask, 'Frame']
            framestart = int(frames.min())
            frameend = int(frames.max())
            for i in range(framestart, frameend + 1):
                frame_mask = sample_mask & (datatbl2.Frame == i)
                # Snapshot of the frame: both axes are computed from the
                # original values, whatever has been written back so far.
                workdata = datatbl2.loc[frame_mask]
                nosedata = workdata.loc[workdata.Part == '00-nose']
                hipdata = workdata.loc[workdata.Part == '11-leftHip']

                for axis in ('X', 'Y'):
                    nose = nosedata[axis].iloc[0]
                    hip = hipdata[axis].iloc[0]
                    updatedcol = (nose - workdata[axis].to_numpy()) / (nose - hip)
                    datatbl2.loc[frame_mask, axis] = updatedcol

    # Return the table that was actually normalised, not a raw copy of it.
    return datatbl2

def createtable_partswide(basicdata):
    """Pivot the long table so that every body part becomes its own column pair.

    Rows are keyed by (Gesture, Sample, Frame); the 'X' and 'Y' values of
    each 'Part' are spread into separate columns (missing cells become 0).
    The pivoted column labels are flattened with fixcolumn('partswide', ...)
    and ordered with sortcolumns(4, ...).
    """
    row_keys = ['Gesture', 'Sample', 'Frame']
    pivoted = basicdata.pivot_table(values=['X', 'Y'], index=row_keys, columns='Part', fill_value=0)

    wide = pd.DataFrame(pivoted.to_records())
    wide.columns = [fixcolumn('partswide', label) for label in wide.columns]

    ordered = sortcolumns(4, wide.columns)
    return wide.reindex(ordered, axis=1)

def createlist_frameswide(maxframecnt, partswide):
    """Pad every sample to ``maxframecnt`` frames and spread frames into columns.

    Samples with fewer than ``maxframecnt`` rows receive extra all-zero
    rows whose 'Frame' numbers continue after the existing row count. The
    table is then pivoted to one row per (Gesture, Sample) with one column
    per (data column, frame); labels are flattened with
    fixcolumn('frameswide', ...) and ordered with sortcolumns(3, ...).

    Args:
        maxframecnt: target number of frames per sample.
        partswide: DataFrame with 'Gesture', 'Sample', 'Frame' and any
            number of data columns.

    Returns:
        The frames-wide DataFrame.
    """
    keycolumns = ['Gesture', 'Sample', 'Frame']
    listcolumns = [col for col in partswide.columns if col not in keycolumns]
    # One zero per data column, whatever the number of parts in use.
    zero_features = [0] * len(listcolumns)

    padrows = []
    for i_gesture in range(len(list_gesture)):
        gesture_rows = partswide[partswide.Gesture == i_gesture]
        totsam = len(gesture_rows.Sample.unique())
        for x_sample in range(totsam):
            currlen = len(gesture_rows[gesture_rows.Sample == x_sample])
            for i_pad in range(maxframecnt - currlen):
                padrows.append([i_gesture, x_sample, currlen + i_pad] + zero_features)

    print("Just before sort/add - time is {}".format(datetime.datetime.now().strftime("%d-%b-%Y %H:%M:%S.%f")))
    if padrows:
        padding = pd.DataFrame(padrows, columns=keycolumns + listcolumns)
        # pd.concat aligns on column names; DataFrame.append no longer
        # exists in pandas 2.x.
        partswide = pd.concat([partswide, padding], ignore_index=True, sort=False)
        partswide = partswide.sort_values(by=keycolumns).reset_index(drop=True)
    print("Midway through frameswide. Pivoting time is {}".format(datetime.datetime.now().strftime("%d-%b-%Y %H:%M:%S.%f")))

    pivoteddataset = partswide.pivot_table(listcolumns, ['Gesture', 'Sample'], 'Frame', fill_value=0)
    flattened = pd.DataFrame(pivoteddataset.to_records())
    flattened.columns = [fixcolumn('frameswide', col) for col in flattened.columns]
    flattened = flattened.reindex(sortcolumns(3, flattened.columns), axis=1)

    return flattened

                
def padlengths(featuredatasets):
    """Right-pad every row of every data set with zeros to one common length.

    The common length is the longest row found in any of the data sets.

    Args:
        featuredatasets: iterable of data sets; each data set is a sequence
            of rows (lists or NumPy arrays).

    Returns:
        List with one NumPy array per input data set, in the same order.
    """
    maxlencnt = 0
    for arr in featuredatasets:
        if len(arr) > 0:
            maxlencnt = max(maxlencnt, max(map(len, arr)))

    new_featuredatasets = []
    for arr2 in featuredatasets:
        # list(xi) matters: when a row is a NumPy array, "xi + [0] * k" is
        # an element-wise addition, not a concatenation.
        padded = [list(xi) + [0] * (maxlencnt - len(xi)) for xi in arr2]
        new_featuredatasets.append(np.array(padded))

    print("Max Len count" + str(maxlencnt))
    return new_featuredatasets

### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

def checkdatafile(filename):
    """Tell whether "<path_tempfiles><filename>_export.csv" exists."""
    target = ''.join([path_tempfiles, filename, '_export.csv'])
    return path.exists(target)

def savedatafile(pd, filename):
    """Write a DataFrame to "<path_tempfiles><filename>_export.csv".

    The index and the header row are both written. Note that the first
    parameter (the DataFrame) is named ``pd`` and hides the pandas module
    inside this function.
    """
    target = ''.join([path_tempfiles, filename, '_export.csv'])
    pd.to_csv(target, index=True, header=True)

def readdatafile(filename):
    """Load "<path_tempfiles><filename>_export.csv" into a DataFrame.

    The first column is used as the index and the first row as the header.
    """
    source = ''.join([path_tempfiles, filename, '_export.csv'])
    return pd.read_csv(source, index_col=0, header=0)

def savenumpydata(npdata, filename):
    """Write an array as comma-separated text to "<path_tempfiles><filename>_export.csv"."""
    target = ''.join([path_tempfiles, filename, '_export.csv'])
    np.savetxt(target, npdata, delimiter=',')

def readnumpydata(filename):
    """Load the comma-separated array stored in "<path_tempfiles><filename>_export.csv"."""
    source = ''.join([path_tempfiles, filename, '_export.csv'])
    return np.loadtxt(source, delimiter=',')

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def createClassifierData(test_data, train_data):
    """Split feature rows into labels (first column) and features (the rest).

    Args:
        test_data: sequence of rows for the test set.
        train_data: sequence of rows for the training set.

    Returns:
        Tuple ``(X_test, Y_test, X_train, Y_train)``; the Y values are NumPy
        arrays and the X values are lists of rows.
    """
    # NOTE(review): rows produced by developfeaturematrix in this file start
    # with [gesture, sample index, ...], so the sample index ends up among
    # the X features here - confirm that this is intended.
    def _split(rows):
        labels = np.array(tuple(zip(*rows))[0])
        remainder = [row[1 - len(row):] for row in rows]
        return remainder, labels

    X_train, Y_train = _split(train_data)
    X_test, Y_test = _split(test_data)

    return X_test, Y_test, X_train, Y_train

def _timestamp():
    """Return the current local time in the format used by the progress messages."""
    return datetime.datetime.now().strftime("%d-%b-%Y %H:%M:%S.%f")


def _load_json(filepath):
    """Parse one JSON file, closing its handle before returning."""
    with open(filepath) as handle:
        return json.load(handle)


def preparedata(features, forcenewfile, forcenewfeatures, limit):
    """Load the pose JSON files and turn them into test/train feature matrices.

    Stages, each cached as "<name>_vpost1_export.csv" under path_tempfiles:

    1. scan path_testdata / path_trainingdata2 and load the JSON files;
    2. build the long table and normalise it (normalize_data);
    3. pivot to the parts-wide layout;
    4. build the feature matrices and pad them to a common width.

    Args:
        features: transform keys forwarded to developfeaturematrix.
        forcenewfile: when False, cached normalised / parts-wide files are
            reused if present; any other value rebuilds them.
        forcenewfeatures: when False, cached feature files are reused if
            present; any other value rebuilds them.
        limit: number of central frames kept per column
            (see developfeaturematrix).

    Returns:
        Tuple ``(test feature matrix, training feature matrix)``.
    """
    print("Data Preparation starting time is {}".format(_timestamp()))
    versionnum = "_vpost1"
    # Explicit comparisons are kept on purpose: only False (or 0) enables
    # the caches, exactly as the flags have always been interpreted.
    reuse_files = forcenewfile == False
    reuse_features = forcenewfeatures == False

    #Based on parameters, run through the directory to open the json files and setup the training and test datasets
    if not (checkdatafile("test_original"+versionnum) and reuse_files):
        # Start from a clean slate so that a repeated call does not index
        # into listings or samples left over from an earlier scan.
        for stale in (ftst, ftrn2, dict_tst, dict_trn2):
            stale.clear()

        for (dirpath, dirnames, filenames) in walk(path_testdata):
            ftst.append([dirpath, dirnames, filenames])

        for (dirpath, dirnames, filenames) in walk(path_trainingdata2):
            ftrn2.append([dirpath, dirnames, filenames])

        # NOTE(review): ftst[i + 1] is taken to be the i-th sub-folder of
        # the test folder, which presumes that walk() visits the
        # sub-folders in listing order and that they contain no folders of
        # their own - confirm against the data layout.
        for i, dirname in enumerate(ftst[0][1]):
            gesture = str(dirname).split('-')[0]
            if gesture in list_gesture:
                dict_tst[gesture + "-0"] = _load_json(ftst[i + 1][0]+'/key_points.json')

        # NOTE(review): sample keys are "<GESTURE>-<file position>", so when
        # several visited folders contain the same gesture name the later
        # folder overwrites the earlier one's entries - confirm there is
        # exactly one folder per gesture.
        for dirname in ftrn2[0][1]:
            gesture = str(dirname.upper().split('_')[0])
            if gesture not in list_gesture:
                continue
            for dirpath, _subdirs, filenames in ftrn2[1:]:
                if gesture in dirpath.upper():
                    for cnt, filename in enumerate(filenames):
                        dict_trn2[gesture + "-" + str(cnt)] = _load_json(dirpath + "/" + filename)

    #Based on the parameters, check if we are to recreate or re-use existing data
    #Normalize to anchor points (normalize_to_nose) or based on mean (normalize_data, in use)
    print("Data Normalization start time is {}".format(_timestamp()))
    if checkdatafile("test_normalized"+versionnum) and reuse_files:
        pd_tst = readdatafile("test_normalized"+versionnum)
    else:
        pd_tst = normalize_data(createtable(dict_tst), "TEST")
        savedatafile(pd_tst, "test_normalized"+versionnum)

    if checkdatafile("train_normalized"+versionnum) and reuse_files:
        pd_trn = readdatafile("train_normalized"+versionnum)
    else:
        pd_trn = normalize_data(createtable(dict_trn2), "TRAIN")
        savedatafile(pd_trn, "train_normalized"+versionnum)

    #Re-shape the data as necessary to formulate featuresets
    print("Data Re-Shaping start time is {}".format(_timestamp()))
    if checkdatafile("test_partswide"+versionnum) and reuse_files:
        pd_tst_parts = readdatafile("test_partswide"+versionnum)
    else:
        pd_tst_parts = createtable_partswide(pd_tst)
        savedatafile(pd_tst_parts, "test_partswide"+versionnum)

    if checkdatafile("train_partswide"+versionnum) and reuse_files:
        pd_trn_parts = readdatafile("train_partswide"+versionnum)
    else:
        pd_trn_parts = createtable_partswide(pd_trn)
        savedatafile(pd_trn_parts, "train_partswide"+versionnum)

    #Basic review of the data visually to understand and determine existence of patterns
    #printcharts(pd_tst_parts, "test")
    #printcharts(pd_trn_parts, "train")

    #Develop featuresets based on function and other parameters chosen
    # Both cached files are read below, so both must be present.
    features_cached = (checkdatafile("train_features"+versionnum)
                       and checkdatafile("test_features"+versionnum))
    if features_cached and reuse_features:
        np_tst_parts_features = readnumpydata("test_features"+versionnum)
        np_trn_parts_features = readnumpydata("train_features"+versionnum)
        print("Pulled Re-shaped Data ending time at {}".format(_timestamp()))
    else:
        print("Data 1 Re-shaping Test Frames Wide starting time is {}".format(_timestamp()))
        np_tst_parts_features = developfeaturematrix(features, pd_tst_parts, False, limit)
        print("Data 2 Re-shaping Train Frames Wide starting time is {}".format(_timestamp()))
        np_trn_parts_features = developfeaturematrix(features, pd_trn_parts, False, limit)

        #Further Padding of overall datasets to ensure consistent lengths for ML Classifiers
        print("Padding Lengths starting time is {}".format(_timestamp()))
        np_tst_parts_features, np_trn_parts_features = padlengths([np_tst_parts_features, np_trn_parts_features])
        savenumpydata(np_tst_parts_features, "test_features"+versionnum)
        savenumpydata(np_trn_parts_features, "train_features"+versionnum)

    #PCA disabled for the moment - 2020/03/01
    #np_tst_parts_features = runpca(np_tst_parts_features, 60)
    #np_trn_parts_features = runpca(np_trn_parts_features, 60)

    return np_tst_parts_features, np_trn_parts_features




# Classifiers

In [None]:
def classify_ann(X_test, Y_test, X_train, Y_train):
    """Train a multi-layer perceptron with a grid search and report test results.

    The grid covers layer layouts derived from half the number of input
    features, three activations, two solvers, two learning-rate schedules
    and three initial learning rates, scored with 2-fold cross-validation.
    The best parameters, every grid score, a classification report and the
    test accuracy are printed. The fitted best estimator is pickled to
    "model_ANN.pkl" in the working directory.

    Args:
        X_test, Y_test: test features and labels.
        X_train, Y_train: training features and labels.
    """
    #https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    from sklearn.neural_network import MLPClassifier

    num_hidden = int(len(X_test[0])/2)

    parameter_space = {
        'hidden_layer_sizes': [
            (num_hidden, round(num_hidden/2), round(num_hidden/4)),
            (num_hidden, num_hidden*2, num_hidden),
            (num_hidden,),
        ],
        'activation': ['tanh', 'relu', 'logistic'],
        'solver': ['sgd', 'adam'],
        #'alpha': [0.0001, 0.05],
        'learning_rate': ['constant', 'adaptive'],
        'learning_rate_init': [0.0001, 0.0005, 0.001],
    }

    mlp = MLPClassifier(max_iter=800, verbose=True, tol=0.000000100, momentum=0.9, early_stopping=False)

    clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=2, verbose=1)
    clf.fit(X_train, Y_train)

    # Best parameters set
    print('Best parameters found:\n', clf.best_params_)

    # All results
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    predictions = clf.predict(X_test)

    # `mlp` is only the template handed to the search and is never fitted;
    # the trained model is the search's refitted best estimator.
    with open("model_ANN.pkl", 'wb') as model_file:
        pickle.dump(clf.best_estimator_, model_file)

    print(classification_report(Y_test, predictions, target_names=list_gesture))
    acc = accuracy_score(Y_test, predictions)
    print("Accuracy: "+str(acc))
    
def classify_NB(X_test, Y_test, X_train, Y_train):
    """Fit a Gaussian Naive Bayes classifier and report its test results.

    Prints a classification report and the accuracy on the test set. The
    fitted classifier is pickled to "model_NB.pkl" in the working
    directory, so that the file holds a model like the one written by
    classify_ann rather than an array of predictions.

    Args:
        X_test, Y_test: test features and labels.
        X_train, Y_train: training features and labels.
    """
    from sklearn.naive_bayes import GaussianNB

    gnb = GaussianNB().fit(X_train, Y_train)
    gnb_predictions = gnb.predict(X_test)

    with open("model_NB.pkl", 'wb') as model_file:
        pickle.dump(gnb, model_file)

    print(classification_report(Y_test, gnb_predictions, target_names=list_gesture))
    acc = accuracy_score(Y_test, gnb_predictions)
    print("Accuracy: "+str(acc))


def classify_KNN(X_test, Y_test, X_train, Y_train):
    """Fit a 2-nearest-neighbour classifier and report its test results.

    Prints a classification report and the accuracy on the test set. The
    fitted classifier is pickled to "model_KNN.pkl" in the working
    directory, so that the file holds a model like the one written by
    classify_ann rather than an array of predictions.

    Args:
        X_test, Y_test: test features and labels.
        X_train, Y_train: training features and labels.
    """
    from sklearn import neighbors

    clf = neighbors.KNeighborsClassifier(n_neighbors=2)
    clf.fit(X_train, Y_train)
    knn_predictions = clf.predict(X_test)

    with open("model_KNN.pkl", 'wb') as model_file:
        pickle.dump(clf, model_file)

    print(classification_report(Y_test, knn_predictions, target_names=list_gesture))
    acc = accuracy_score(Y_test, knn_predictions)
    print("Accuracy: "+str(acc))

def classify_LSTMCNN(X_test, Y_test, X_train, Y_train):
    """Train a Keras LSTM + dense network and print its test metrics.

    Each sample is padded/truncated to ``max_seq_length`` values and reshaped
    to ``(max_seq_length, 1)``, i.e. one feature per time step. Labels are
    one-hot encoded for a 6-way softmax output. After training, accuracy and
    micro-averaged precision, recall and F1 on the test split are printed.

    This model is still experimental (it was adapted from a CNN; the
    Conv1D/MaxPooling1D front end is currently not used).

    Args:
        X_test: Feature rows of the test split.
        Y_test: Labels of the test split; assumed to be integer class ids in
            ``[0, 6)`` as required by ``to_categorical`` - TODO confirm.
        X_train: Feature rows of the training split.
        Y_train: Labels of the training split (same assumption as Y_test).

    Returns:
        None. Results are printed.
    """
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.metrics import f1_score

    # fix random seed for reproducibility
    np.random.seed(7)

    max_seq_length = 900
    num_classes = 6

    # pad_sequences defaults to dtype='int32', which would silently truncate
    # fractional feature values; keep the features as floats.
    X_train = sequence.pad_sequences(X_train, maxlen=max_seq_length, dtype='float32')
    X_test = sequence.pad_sequences(X_test, maxlen=max_seq_length, dtype='float32')
    # Recurrent layers expect (samples, time steps, features).
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
    print(X_train.shape)
    print(X_test.shape)
    print(Y_train.shape)
    print(Y_test.shape)

    # Keep the integer labels for the sklearn metrics below; the network
    # itself is trained against the one-hot encoding.
    Y_test_labels = Y_test
    Y_train = keras.utils.to_categorical(Y_train, num_classes=num_classes)
    Y_test = keras.utils.to_categorical(Y_test, num_classes=num_classes)

    model = Sequential()
    # The LSTM consumes the 3-D sequence directly. A Flatten layer in front
    # of it would hand it 2-D input, which recurrent layers reject.
    model.add(LSTM(900))
    model.add(Dense(900, activation='relu'))
    model.add(Dense(450, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(200, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    optim = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(loss='categorical_crossentropy', optimizer=optim, metrics=['accuracy'])
    # Leave the batch dimension open so the model is not tied to the number
    # of training samples.
    model.build((None,) + tuple(X_train.shape[1:]))
    # summary() prints on its own and returns None, so it is not wrapped in print().
    model.summary()
    print(model.optimizer.get_config())

    model.fit(X_train, Y_train, epochs=300, batch_size=415, verbose=0)
    yhat_probs = model.predict(X_test, verbose=0)
    # Most probable class per sample; equivalent to the Sequential
    # predict_classes helper, which newer Keras releases no longer provide.
    yhat_classes = np.argmax(yhat_probs, axis=1)

    print(Y_test_labels)
    print(yhat_classes)
    accuracy = accuracy_score(Y_test_labels, yhat_classes)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(Y_test_labels, yhat_classes, average='micro')
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(Y_test_labels, yhat_classes, average='micro')
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(Y_test_labels, yhat_classes, average='micro')
    print('F1 score: %f' % f1)
    

# Main

In [None]:
##############################################################
# Driver cell: pick a feature set, build the datasets and run the selected
# classifier(s). Several feature sets are being experimented with to find
# the best combination for prediction accuracy.
#
# Full list of feature names:
#   ['plain', 'fft', 'mean', 'std', 'var', 'dwt', 'dwtMulti', 'rms', 'max',
#    'min', 'energy', 'zcr', 'mda', 'dnrd', 'zmd']
#
# Combinations tried so far (with the figures noted at the time):
#   ['plain','zCRMD','fft','dwt','mean','std','var','dwtMulti','rms','max','min','energy']  -> 36%
#   ['plain', 'fft', 'dwt']       -> 37%
#   ['fft', 'dwt']                -> 45%
#   ['zcr', 'mda', 'energy']      -> 61 on knn
#
# The cell body is kept at module level rather than inside a main() function.
features = ['plain']

# Extract and prepare the data.
dataarrtst, dataarrtrn = preparedata(
    features,
    forcenewfile=True,
    forcenewfeatures=True,
    limit=30,
)

# Split both the test and the training dataset into X and Y sets.
X_test, Y_test, X_train, Y_train = createClassifierData(dataarrtst, dataarrtrn)

# Several classification algorithms are available so their results can be
# compared to determine the best model. Uncomment the ones to run; the LSTM
# network is still being experimented with.
# print("Classification - ANN starting time is {}".format(datetime.datetime.now().strftime("%d-%b-%Y %H:%M:%S.%f")))
# classify_ann(X_test, Y_test, X_train, Y_train)
# print("Classification - NB starting time is {}".format(datetime.datetime.now().strftime("%d-%b-%Y %H:%M:%S.%f")))
# classify_NB(X_test, Y_test, X_train, Y_train)
# print("Classification - KNN starting time is {}".format(datetime.datetime.now().strftime("%d-%b-%Y %H:%M:%S.%f")))
# classify_KNN(X_test, Y_test, X_train, Y_train)
print(
    "Classification - LSTMCNN starting time is {}".format(
        datetime.datetime.now().strftime("%d-%b-%Y %H:%M:%S.%f")
    )
)
classify_LSTMCNN(X_test, Y_test, X_train, Y_train)
print("Finished")

In [None]:
# Mount Google Drive at /content/drive so the notebook can load the JSON
# files and read/write the transformed model data and model parameters.
# NOTE(review): path_testdata and path_trainingdata2 point under
# /content/drive, so this cell presumably has to be run before the
# data-loading cell even though it appears after it - confirm.
from google.colab import drive
drive.mount('/content/drive')


# Bottom