# Virtual Piano Tutor

## Project Description

This is a notebook for tracking my progress on VPT...

- Best Classifier as of 11/30
    - SVM {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}

## TODO List

- TODAY
    - Decide on RDF model to keep for rest of project
    - Work on RDF data and annotations
    - Add results to file
    - Rewrite RDF for GridSearchCV
        - Extend RDF
    - Work on ideas for paper
        - Visualizations
    - Play with CAE
    - How to automate this...
    - Windowing/Summarizing
    
- DONE
    - ~~Organize RDF data~~
    - ~~Generate data from already extracted hands...~~
    - ~~Get notebook running on Compute Canada~~
    - ~~Get data on Compute Canada~~
    - ~~Setup CAE to deal with hand images~~
    - ~~setup data for training autoencoder on LH and RH~~
    - ~~Train Autoencoder for LH and RH~~
    

- Bad Segmentation
    - p3c - left hand (not terrible)
    - p1s - right hand (shouldn't use)
    - p5a - Both could use some work but still captures most of the left hand (RH not so good...)
    - p5c - not good (left hand passable...)
    
- Add noise to CAE
    - http://scikit-image.org/docs/dev/api/skimage.util.html#random-noise
    
- ~~Multiple Participants~~
    - ~~have one holdout set participant~~
        - ~~Test with p1&2 training p3 testing, then p1&3...~~
    - ~~have one holdout set exercise~~

- Test with RH too

- Windowing data
    - Summarize data for classification
    - Majority Voting (or with probabilities)

- Look for other features
    - Others??
    - ~~Autoencoder features~~
    - ~~HONV~~
    
- Work on hand segmentation
   - See p1e for bad examples
   - How to validate segmentation?
       - Statistical analysis on length and width ratios
       
- Visualize !!!
    - Input 
    - Results !!!
        - F Scores
        - Accuracy
        - Try weighted instead of macro




- Finish Project Description

- ~~Turn into functions~~
    
- ~~Verify Segmentation~~
    - have only done basic verification
    
- ~~FIRST THING: Test by ignoring training data (p1s) and then using train_test_split on recordings~~
    - ~~Data should be ready for splitting~~
    
- ~~Remove data from testing to find culprit~~
    
- ~~Track my progress better !!! (duh through notebooks!)~~

# Setup

## Libraries

In [1]:
import os
import pickle
import re
import warnings

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import cv2

from vpt.features.features import *
import vpt.utils.image_processing as ip
import vpt.settings as s
import vpt.hand_detection.depth_context_features as dcf

%load_ext autoreload
%autoreload 2

## Some helper functions

### Load/Save Data Set

In [2]:
def load_data(testing_p, M, radius, feature_type, data_type):
    """Open one pre-extracted, combined feature archive.

    The archive name is assembled from the arguments as
    ``<testing_p>_M<M>_rad<radius, 2 decimals>_<feature_type>_<data_type>_data_combined.npz``
    and looked up under ``data/posture/extracted/`` relative to the current
    working directory.

    Args:
        testing_p: Leading label of the archive name (e.g. "all_participants").
        M: Value inserted after ``_M`` in the archive name.
        radius: Value inserted after ``_rad``, formatted with two decimals.
        feature_type: Feature identifier inserted into the archive name.
        data_type: Split identifier inserted before ``_data_combined.npz``.

    Returns:
        Whatever ``np.load`` returns for that ``.npz`` file.
    """
    prefix = "{}_M{}_rad{:0.2f}_{}_".format(testing_p, M, radius, feature_type)
    archive = os.path.join("data/posture/extracted/", prefix) + data_type + "_data_combined.npz"
    return np.load(archive)

## Project Setup

### Generate or Load Data

In [3]:
# Feature-extraction settings; together they pick the archive that
# load_data() opens (they are formatted into its file name).
M = 5
radius = 0.15
feature_type = "hog3"

# Participant IDs that the hyper-parameter tuning loop iterates over.
participants = ["p{}".format(number) for number in (1, 3, 4, 6)]

In [4]:
#### Load the combined training archive ("all_participants")
data = load_data("all_participants", M, radius, feature_type, "train")

# Unpack per-hand features/labels plus the per-sample file names.
X_lh, y_lh = data["X_lh"], data["y_lh"]
X_rh, y_rh = data["X_rh"], data["y_rh"]
filenames = data["filenames"]

# Sanity-check the shapes of everything that was loaded.
print("X LH", X_lh.shape, "y LH", y_lh.shape, data["vis_lhs"].shape)
print("X RH", X_rh.shape, "y RH", y_rh.shape, data["vis_rhs"].shape)
print("Filenames", filenames.shape)

X LH (15822, 693) y LH (15822,) (15822, 180, 120)
X RH (15822, 693) y RH (15822,) (15822, 180, 120)
Filenames (15822,)


In [5]:
# Remove p#s data: every sample whose file name contains "p<digit>s"
# (presumably the static recordings, going by the `rem_static` name).
# The pattern is a raw string so that `\d` is not an invalid str escape.
static_pattern = re.compile(r'p[\d]s')

# Boolean mask, True where the file name matches. Built with a comprehension
# rather than np.vectorize so an empty `filenames` array still works.
rem_static = np.array([bool(static_pattern.search(name)) for name in filenames], dtype=bool)
keep = ~rem_static

# Apply the same mask to both hands so features, labels and names stay aligned.
X_lh, y_lh, filenames = X_lh[keep], y_lh[keep], filenames[keep]
X_rh, y_rh = X_rh[keep], y_rh[keep]

# Classification

### Libraries

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, LeaveOneGroupOut, LeavePGroupsOut, GroupKFold
from imblearn.pipeline import Pipeline

from sklearn.decomposition import PCA

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours

## Load Data for Classification

## Model Testing

### Hyper Tuning

In [None]:
## Parameters for SVMs
# Earlier set-ups, kept for reference (plain SMOTE step, `kind` tuned directly):
# steps = [('SMOTE', SMOTE()), ("SVC", SVC())]
# param_grid = [
# #   {'SVC__C': [1, 10, 100], 'SVC__kernel': ['linear'], 'SMOTE__kind': ['regular', 'borderline1', 'borderline2', 'svm']},
# #   {'SVC__C': [1, 10, 100], 'SVC__gamma': [.0001, .001, .01, .1], 'SVC__kernel': ['rbf'], 'SMOTE__kind': ['regular', 'borderline1', 'borderline2', 'svm']},
# #   {'SVC__C': [10, 100], 'SVC__gamma': [.000005, .00001, .00005,], 'SVC__kernel': ['rbf'], 'SMOTE__kind': ['regular', 'borderline1', 'borderline2']},
#  ]
#
# param_grid = [
#   {'SVC__C': [1, 10, 100],
#    'SVC__gamma': [.001, .01, .1],
#    'SVC__kernel': ['rbf'],
#    'SMOTE__kind': ['borderline1', 'borderline2', 'svm']}
# ]

# Current pipeline: a resampling step (named 'SMOTE', but actually SMOTEENN)
# followed by the SVC classifier.
steps = [
    ('SMOTE', SMOTEENN()),
    ("SVC", SVC()),
]
pipeline = Pipeline(steps)

# RBF-only grid. 'SMOTE__smote' swaps the SMOTE estimator used inside the
# SMOTEENN step between the three listed variants.
param_grid = [
    {
        'SVC__C': [1, 10, 100],
        'SVC__gamma': [.001, .01, .1],
        'SVC__kernel': ['rbf'],
        'SMOTE__smote': [SMOTE(kind=kind) for kind in ('borderline1', 'borderline2', 'svm')],
    },
]

# Metrics reported by the grid search.
scoring = ['recall_macro', 'accuracy']

# NOTE: despite its name, `logo` is a 3-fold GroupKFold, not LeaveOneGroupOut.
logo = GroupKFold(n_splits=3)

In [None]:
# Hyper Parameter Tuning
#
# For every participant: pool that participant's LH and RH samples, run the
# multi-metric grid search (refit on macro recall) with grouped
# cross-validation, print the per-candidate scores and keep cv_results_.

results = {}

# Runs of this many consecutive samples share one CV group, so the splitter
# keeps each run together in the same fold.
group_size = 30

for testing_p in participants:

    # Select the samples whose file name mentions this participant.
    participant_pattern = re.compile(testing_p)
    val_p = np.array([bool(participant_pattern.search(name)) for name in filenames], dtype=bool)

    X_lh_p = X_lh[val_p]
    y_lh_p = y_lh[val_p]
    X_rh_p = X_rh[val_p]
    y_rh_p = y_rh[val_p]
    filenames_p = filenames[val_p]

    # Stack LH on top of RH; the file names are repeated to stay aligned.
    X_comb = np.vstack((X_lh_p, X_rh_p))
    y_comb = np.hstack((y_lh_p, y_rh_p))
    filenames_comb = np.hstack((filenames_p, filenames_p))

    print("{} Data:".format(testing_p))
    print("LH:", X_lh_p.shape, y_lh_p.shape)
    print("RH:", X_rh_p.shape, y_rh_p.shape)
    print(filenames_p.shape)
    print("Comb:", X_comb.shape, y_comb.shape)
    print(filenames_comb.shape)

    # Group id = sample index // group_size. This matches the previous
    # slice-assignment loop whenever there is a trailing partial group, and
    # avoids its two failure modes: an exact multiple of group_size used to
    # overwrite *every* id via the `[-0:]` slice, and fewer than group_size
    # samples left the loop variable undefined.
    groups_samp = np.arange(len(filenames_comb)) // group_size

    # The grid search is multi-metric (`scoring` is a list), so one run per
    # participant yields both recall and accuracy; the former per-score loop
    # iterated over an undefined name and would only have repeated this search.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        print("## Tuning hyper-parameters for {}".format(", ".join(scoring)))
        print()

        #### TRAIN COMBINED LH & RH
        clf_comb = GridSearchCV(pipeline, param_grid, cv=logo.split(X_comb, y_comb, groups=groups_samp), scoring=scoring, n_jobs=2, verbose=1, refit="recall_macro")
        clf_comb.fit(X_comb, y_comb)

        print("Best Combined Parameters set found on data set:")
        print()
        print(clf_comb.best_params_)
        print()
        print("Grid scores on data set:")
        print()
        means_acc = clf_comb.cv_results_['mean_test_accuracy']
        stds_acc = clf_comb.cv_results_['std_test_accuracy']
        means_recall = clf_comb.cv_results_['mean_test_recall_macro']
        stds_recall = clf_comb.cv_results_['std_test_recall_macro']
        for mean_acc, std_acc, mean_recall, std_recall, params in zip(means_acc, stds_acc, means_recall, stds_recall, clf_comb.cv_results_['params']):
            print("Recall: %0.3f (+/-%0.3f) - Accuracy: %0.3f (+/-%0.3f) for\n %r" % (mean_recall, std_recall, mean_acc, std_acc, params))
            print()
        print()

    # Keep the full results table for this participant.
    results[testing_p] = clf_comb.cv_results_

In [None]:
import pandas as pd

In [None]:
# Tabulate participant p1's grid-search results (one column per cv_results_ key).
results_p1 = pd.DataFrame(data=results['p1'])

In [None]:
# Tabulate the stored results for all participants at once
# (the keys of `results`, i.e. the participant IDs, become the columns).
results_df = pd.DataFrame(data=results)

In [None]:
# Mean cross-validated macro recall for p1, one point per grid candidate.
# Labelled so the figure stands alone; the trailing `;` hides the repr.
fig, ax = plt.subplots()
ax.plot(results_p1["mean_test_recall_macro"])
ax.set(
    title="p1: mean CV recall (macro) per parameter combination",
    xlabel="Parameter combination (row of cv_results_)",
    ylabel="Mean test recall (macro)",
);

In [None]:
# Mean cross-validated accuracy for p1, one point per grid candidate.
# Labelled so the figure stands alone; the trailing `;` hides the repr.
fig, ax = plt.subplots()
ax.plot(results_p1["mean_test_accuracy"])
ax.set(
    title="p1: mean CV accuracy per parameter combination",
    xlabel="Parameter combination (row of cv_results_)",
    ylabel="Mean test accuracy",
);

In [None]:
# Persist the per-participant results. The `with` block guarantees the file
# is flushed and closed even if pickling fails part-way.
with open("results_0308.pkl", 'wb') as results_file:
    pickle.dump(results, results_file)

In [None]:
# Reload the saved results; the `with` block closes the file handle.
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# notebook (or another trusted source) produced.
with open("results_0308.pkl", 'rb') as results_file:
    r = pickle.load(results_file)