In [90]:
# set of imports
import numpy as np
import os 
import zipfile
import json
import re #regular expressions
import operator

import pandas as pd

from pandas.io.json import json_normalize  

from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

In [91]:
# NOTE(review): `requests`, `StringIO` and `zip_file_url` are not imported or defined in
# any cell visible in this notebook, so the next two lines raise NameError on a fresh
# kernel unless an unseen cell provides them -- TODO confirm / add the missing setup.
# Download the archive located at `zip_file_url` and open it as an in-memory zip file.
r = requests.get(zip_file_url, stream=True)
# NOTE(review): the archive handle opened here does not appear to be read by any later
# visible cell -- confirm whether the download is still needed.
z = zipfile.ZipFile(StringIO.StringIO(r.content))
# Paths of the local session archives (zip files) that the loading cell iterates over.
sessions = ['OKDDM1_2018-7-24-11H57M49S371_annotated.zip'] 

In [92]:
def _numeric_where_possible(column):
    """Return `column` converted by `pd.to_numeric`, or unchanged if it cannot be converted.

    Values read from JSON arrive as `object` columns; this converts the numeric ones and
    leaves non-numeric ones (e.g. time strings) untouched. It spells out what
    `pd.to_numeric(..., errors='ignore')` did, without relying on that deprecated mode.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def _load_annotations(data, time_offset):
    """Build the annotation table from a parsed JSON document that has an 'intervals' key.

    Parameters
    ----------
    data : dict
        Parsed JSON content; `data['intervals']` is normalised into columns.
    time_offset : pandas.Timestamp
        Absolute time added to the relative 'start' / 'end' values.

    Returns
    -------
    pandas.DataFrame
        One row per interval, with absolute 'start' / 'end' and 'duration' in seconds.
    """
    # Top-level fields side by side with the flattened intervals; the raw list is dropped.
    df = pd.concat([pd.DataFrame(data), json_normalize(data['intervals'])],
                   axis=1).drop('intervals', axis=1)
    df = df.apply(_numeric_where_possible)
    # Strip the literal prefix 'annotations.' (plain-text match, not a regex).
    df.columns = df.columns.str.replace("annotations.", "", regex=False)
    # 'start' / 'end' are parsed as time deltas and shifted to absolute time.
    df['start'] = pd.to_timedelta(df['start']) + time_offset
    df['end'] = pd.to_timedelta(df['end']) + time_offset
    df['duration'] = (df['end'] - df['start']) / np.timedelta64(1, 's')
    return df


def _load_sensor_frames(data, time_offset):
    """Build a time-indexed numeric table from a parsed JSON document that has a 'frames' key.

    Parameters
    ----------
    data : dict
        Parsed JSON content; `data['frames']` is normalised into columns.
    time_offset : pandas.Timestamp
        Absolute time added to the relative 'frameStamp' values.

    Returns
    -------
    pandas.DataFrame
        Indexed by absolute 'frameStamp'; only float64/int64 columns whose sum is non-zero.
    """
    df = pd.concat([pd.DataFrame(data), json_normalize(data['frames'])],
                   axis=1).drop('frames', axis=1)
    # Remove underscores from column names, e.g. 3_Ankle_Left_X becomes 3AnkleLeftX.
    df.columns = df.columns.str.replace("_", "", regex=False)
    df['frameStamp'] = pd.to_timedelta(df['frameStamp']) + time_offset
    # Application name of this sensor file, taken from the first row.
    # NOTE(review): assumes every row carries the same applicationName -- confirm.
    app_name = df['applicationName'].iloc[0]
    # Replace the 'frameAttributes' part of the column names with the application name.
    df.columns = df.columns.str.replace("frameAttributes", app_name, regex=False)
    # Index by timestamp, then drop the first two remaining columns by position.
    # NOTE(review): presumably these are per-file metadata columns -- TODO confirm.
    df = df.set_index('frameStamp').iloc[:, 2:]
    # On duplicated timestamps keep the first occurrence only.
    df = df[~df.index.duplicated(keep='first')]
    df = df.apply(_numeric_where_possible)
    # Keep the numeric types only (categorical data are not supported now).
    df = df.select_dtypes(include=['float64', 'int64'])
    # Remove columns whose values sum to 0.
    df = df.loc[:, (df.sum(axis=0) != 0)]
    # The Kinect reader prefixes attributes with a digit (1ShoulderLeftX, 3AnkleRightY).
    # The digit is dropped assuming a single user; revisit for multiple users.
    df = df.rename(columns=lambda name: re.sub(r'Kinect\.\d', 'Kinect.', name))
    return df


annotation_frames = []  # one annotation table per annotation file
sensor_frames = []      # one sensor table per sensor file

# for each session in the list of sessions
for s in sessions:
    # The archive's modification time is the absolute reference to which the
    # relative time deltas stored in the files are added.
    current_time_offset = pd.to_datetime(os.path.getmtime(s), unit='s')

    with zipfile.ZipFile(s) as z:
        for filename in z.namelist():
            # Directory entries inside a zip end with '/'; only JSON members are read.
            if filename.endswith('/') or '.json' not in filename:
                continue

            with z.open(filename) as f:
                data = json.load(f)

            if 'intervals' in data:
                # Annotation file; skip it when it holds no intervals.
                if data['intervals']:
                    annotation_frames.append(_load_annotations(data, current_time_offset))
            elif 'frames' in data:
                # Sensor file; skip it when it holds no frames.
                if data['frames']:
                    sensor_frames.append(_load_sensor_frames(data, current_time_offset))

# Dataframe containing the annotations (concatenated once instead of grown in the loop)
dfAnn = pd.concat(annotation_frames, sort=False) if annotation_frames else pd.DataFrame()

# Dataframe with all sensor data, ordered by timestamp
if sensor_frames:
    dfALL = pd.concat(sensor_frames, ignore_index=False, sort=False).sort_index()
else:
    dfALL = pd.DataFrame()

# Rows coming from different sensors have gaps in each other's columns: back-fill them.
dfALL = dfALL.apply(pd.to_numeric).bfill()
print(dfALL)

                               Kinect.AnkleLeftX  Kinect.AnkleLeftY  \
frameStamp                                                            
2018-07-24 20:37:10.650434264                0.0                0.0   
2018-07-24 20:37:10.653936264                0.0                0.0   
2018-07-24 20:37:10.655450364                0.0                0.0   
2018-07-24 20:37:10.656437764                0.0                0.0   
2018-07-24 20:37:10.656938164                0.0                0.0   
2018-07-24 20:37:10.685958064                0.0                0.0   
2018-07-24 20:37:10.721497564                0.0                0.0   
2018-07-24 20:37:10.752504864                0.0                0.0   
2018-07-24 20:37:10.785027564                0.0                0.0   
2018-07-24 20:37:10.820054264                0.0                0.0   
2018-07-24 20:37:10.853076164                0.0                0.0   
2018-07-24 20:37:10.885100064                0.0                0.0   
2018-0

In [93]:
# Exclude features: any sensor column whose name contains one of these substrings
to_exclude = ['Ankle']
# Start from the full sensor table (dfALL) so the cell runs on a fresh kernel
df1 = dfALL[[col for col in dfALL.columns
             if not any(el in col for el in to_exclude)]]

# Feature aggregators, in the order in which the feature columns are created
aggregations = ['max','min','std','mean','var','median']

# Aggregation name -> reducer applied to one column of one annotated interval.
# NOTE(review): NumPy reducers called on a pandas Series may delegate to the Series
# method (e.g. sample std/var rather than population) -- confirm the intended estimator.
aggregators = {
    'max': np.max,
    'min': np.min,
    'std': np.std,
    'mean': np.mean,
    'var': np.var,
    'median': np.median,
}

# The masked_df allows selecting intervals based on the annotation:
# one slice of df1 per annotation, with start <= timestamp <= end
masked_df = [
    df1[(df2_start <= df1.index) & (df1.index <= df2_end)]
    for df2_start, df2_end in zip(dfAnn['start'], dfAnn['end'])
]

# Names of all generated feature columns, as '<attribute>.<aggregation>'
features = []

# For each attribute
for key in df1.columns.values:
    # For each aggregation function
    for a in aggregations:
        fname = key+'.'+a
        features.append(fname)
        reducer = aggregators[a]
        # One value per annotation; None when no sensor row falls inside the interval
        dfAnn[fname] = [reducer(dt[key]) if not dt.empty else None for dt in masked_df]

In [None]:
# The target features set
target_features = ['classRate','classDepth','classRelease']
# Single seed shared by the feature ranking and the train/test split
random_seed = 72

# For each target feature in the feature set
for target in target_features:

    # Intervals without any sensor row produce missing feature values, which the
    # estimators below cannot handle; only complete rows are used.
    samples = dfAnn.dropna(subset=features + [target])

    # Range (target feature)
    y = samples[target].values.ravel()

    # Scaling features to [0, 1]
    # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
    min_max_scaler = preprocessing.MinMaxScaler()
    X = min_max_scaler.fit_transform(samples[features].values)

    # ExtraTreesClassifier for feature importance (seeded so the ranking is reproducible)
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    # NOTE(review): the ranking is computed on all rows, including those later held out
    # for testing -- consider ranking on the training split only.
    ETF = ExtraTreesClassifier(random_state=random_seed)
    ETF.fit(X, y)

    # (feature name, importance) pairs, from the highest to the lowest importance
    importance = sorted(zip(features, ETF.feature_importances_),
                        key=operator.itemgetter(1), reverse=True)
    ranked_features = [name for name, _ in importance]

    # Classification with SVM, linear kernel
    svc = SVC(kernel="linear", C=1)
    accuracies = []

    # Try the top 1, 2, ..., n_features features (inclusive of the full set)
    feature_counts = list(range(1, len(ranked_features) + 1))
    for n in feature_counts:

        # Raw values of the n most important features
        X = samples[ranked_features[:n]].values

        # Hold out 33 % of the rows for testing, with a fixed random seed
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=random_seed)

        # Fit the scaler on the training rows only, then apply it to the test rows,
        # so that no information from the test set leaks into the model
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train = min_max_scaler.fit_transform(X_train)
        X_test = min_max_scaler.transform(X_test)

        svc.fit(X_train, y_train)
        # Prediction
        y_pred = svc.predict(X_test)
        # Append to the accuracy list
        accuracies.append(accuracy_score(y_test, y_pred))

    # Plot the accuracy against the number of features actually used
    fig, ax = plt.subplots()
    ax.plot(feature_counts, accuracies)
    fig.suptitle('SVM '+target+' N='+str(len(samples)), fontsize=20)
    ax.set_xlabel('No. features', fontsize=18)
    ax.set_ylabel('Accuracy', fontsize=16)
    plt.show()