# 1) Data Prep

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os

# Climb towards the filesystem root until the working directory path ends in
# "drone_steering", giving up after 10 steps. Each visited directory is printed
# together with the number of steps taken so far.
counter, cwd = 0, os.getcwd()
print(counter, cwd)
while True:
    if cwd.endswith("drone_steering") or counter >= 10:
        break
    os.chdir(os.pardir)
    counter += 1
    cwd = os.getcwd()
    print(counter, cwd)

0 /Users/lsafari/drone_steering/models/playground
1 /Users/lsafari/drone_steering/models
2 /Users/lsafari/drone_steering


## a) Data loading

In [None]:
from app_local.module import DataEnsembler, GestureTransformer

# Build the project's data ensembler, let it look for frame-based gesture files
# below data/gesture/ (relative to the working directory) and load them.
# NOTE(review): the meaning of the positional argument 120 is defined by
# DataEnsembler, which is not visible in this notebook - confirm.
de = DataEnsembler(120)
de.investigate_available_datafiles(
    data_dir='data/gesture/',
    is_frame_based=True,
)
de.load_data()

## b) Data rescaling to actual video length

In [None]:
# Rescale the loaded frames (see section heading); 'avg' selects how the time
# of the first frame is determined - presumably an average, confirm in DataEnsembler.
de.rescale_data_frames(
    time_of_first_frame='avg',
    verbose=True,
)

## c) Interpolate and adjust frame-based labels

In [None]:
# Interpolate the data and convert the frame-based labels accordingly.
# NOTE(review): new_frmlen=50 is presumably the new frame length - unit not
# visible here, confirm in DataEnsembler.
de.interpolate_and_convert_framebased_labels(
    new_frmlen=50,
    verbose=True,
)

## d) Generate training data

In [None]:
# Assemble the training data inside the ensembler. The normalization cell further
# down reads de.X, de.y and de.feature_names - presumably these are populated by
# this call (verify in DataEnsembler).
de.assemble_data()

In [None]:
# Let the ensembler report on the data it holds (output format is defined by
# DataEnsembler.display_information, not visible here).
de.display_information()

## e) Data normalization

In [None]:
# Normalize copies of the assembled data so the arrays held by `de` stay untouched.
gt = GestureTransformer(
    byrow=True,
    feature_names=list(de.feature_names),
)
# Labels are cast to int32 so that later cells can compare them as integers.
y = de.y.copy().astype("int32")
X = gt.transform(de.X.copy(), verbose=True)

# 2) Model Training

## a) Filtering 0-Labels

In [None]:
# number of 0-labels to use for training
n = 750
# fixed seed so that the random subsample of 0-labels is identical on every run
seed = 10


def _indices_by_label(labels):
    """Return a dict mapping each distinct label to the positions where it occurs."""
    return {int(label): np.flatnonzero(labels == label) for label in np.unique(labels)}


def _print_label_summary(title, indices_by_label):
    """Print the title followed by one 'label count' line per label."""
    print(title)
    for label in sorted(indices_by_label):
        print(label, len(indices_by_label[label]))
    print("")


# only select certain indices to prevent too many 0-labeled instances
idx = _indices_by_label(y)
_print_label_summary("----- labels summary before --------------------------------", idx)

# Never request more 0-labels than exist: choice(..., replace=False) would raise.
all_zero_idx = idx.get(0, np.array([], dtype=int))
n_zero = min(n, len(all_zero_idx))
rng = np.random.RandomState(seed)
if n_zero > 0:
    zero_idx = rng.choice(all_zero_idx, n_zero, replace=False)
else:
    zero_idx = np.array([], dtype=int)

# Keep every non-zero label that is actually present instead of hard-coding 1..6,
# then restore the original sample order.
keep_idx = np.sort(
    np.concatenate([zero_idx] + [idx[label] for label in sorted(idx) if label != 0])
)

y = y[keep_idx]
X = X[keep_idx]
print("Shapes after:",y.shape, X.shape)
print("")

idx = _indices_by_label(y)
_print_label_summary("----- labels summary after --------------------------------", idx)

## b) OneHot-Encoding of target vector

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword. Try the new name first and fall back for older installations, so the
# encoder returns a dense array either way.
try:
    ohe = OneHotEncoder(sparse_output = False)
except TypeError:
    ohe = OneHotEncoder(sparse = False)
print(type(ohe))
# The encoder expects 2-D input, hence the reshape of y into a single column.
target = ohe.fit_transform(y.reshape(-1,1))
print("Before:", y.shape)
print("After:", target.shape)
# Show the first five encoded rows as the cell output.
target[0:5,:]

## c) Train-Test-Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    target,
    test_size=0.2,
    random_state=10,
)
for split_name, split_x, split_y in (
    ("Training Data:", x_train, y_train),
    ("Test Data:", x_test, y_test),
):
    print(split_name, split_x.shape, split_y.shape)

## d) Model Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

### Train a simple Random Forest

In [None]:
# Baseline random forest. random_state is fixed so that the accuracies quoted in
# the following sections can be reproduced; without it every run grows a
# different forest.
model = RandomForestClassifier(
    n_estimators = 100,   # number of trees
    criterion = "gini",   # one of {gini, entropy}
    max_depth = None,     # None: no explicit depth limit
    n_jobs = -1,          # use all available cores
    random_state = 10,
    verbose = 1
)

# Each sample is flattened to one feature vector per row before fitting.
# y_train is the one-hot matrix produced above, so the target is 2-D.
model.fit(x_train.reshape(x_train.shape[0],-1),y_train)

#### Training accuracy is already 100%

In [None]:
# Predict on the flattened training samples.
preds = model.predict(x_train.reshape(len(x_train), -1))
# A sample counts as correct only if every one-hot column matches;
# the mean of that boolean vector is the fraction of exact matches.
exact_match = (preds == y_train).all(axis=1)
exact_match.mean()

#### Test accuracy is 94%

In [None]:
# Predict on the flattened hold-out samples.
preds = model.predict(x_test.reshape(len(x_test), -1))
# A sample counts as correct only if every one-hot column matches;
# the mean of that boolean vector is the fraction of exact matches.
exact_match = (preds == y_test).all(axis=1)
exact_match.mean()

### Grid search with cross-validation

In [None]:
# Number of trees in random forest
n_estimators = [100, 500, 1000]
# Number of features to consider at every split
max_features = [5, 10, 'sqrt']
# Maximum number of levels in tree: four evenly spaced depths from 5 to 110,
# plus None for unlimited depth
max_depth = [int(x) for x in np.linspace(5, 110, num = 4)]
max_depth.append(None)

# Parameter grid for a bare RandomForestClassifier.
# (The earlier, immediately overwritten variant with 'randomforestclassifier__'
# key prefixes was dead code; such prefixes are only needed when the forest is
# a step inside a sklearn Pipeline.)
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              }

print(random_grid)

In [None]:
# List the parameter names the estimator accepts; the keys of random_grid must
# be among them for GridSearchCV to apply them.
model.get_params().keys()

#### Training takes quite a long time - do not needlessly repeat it

In [None]:
# Exhaustive search over random_grid, scored by accuracy. Progress is logged
# verbosely (verbose=10) and all cores are used (n_jobs=-1).
params, scoring = random_grid, "accuracy"
clf = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring=scoring,
    n_jobs=-1,
    verbose=10,
)
# Same flattening of the samples as for the baseline model.
clf.fit(x_train.reshape(len(x_train), -1), y_train)

In [None]:
# Show which result fields the fitted grid search recorded.
clf.cv_results_.keys()

In [None]:
# Mean cross-validated score (scoring="accuracy") for each parameter combination tried.
clf.cv_results_["mean_test_score"]

#### Evaluate the best grid-search model on the hold-out set

In [None]:
# Predict on the flattened hold-out samples with the best estimator found by the search.
preds = clf.best_estimator_.predict(x_test.reshape(len(x_test), -1))
# A sample counts as correct only if every one-hot column matches;
# the mean of that boolean vector is the fraction of exact matches.
exact_match = (preds == y_test).all(axis=1)
exact_match.mean()