In [1]:
import os 
import cv2
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, cross_val_predict, \
                                    GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,\
                             AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

import pprint


In [2]:
ls

[0m[01;34mframes[0m/                    README.txt                             Task1.ipynb
[01;32mlunar_lander_data_gen.py[0m*  state_vectors_2018-04-21-20-41-25.csv


In [3]:
# ---- Experiment sizes ----
# number of rows randomly sampled from the loaded dataset
sample_nr_rows = 6000
# number of folds passed as `cv` to the grid search
grid_search_cv_folds = 5
# number of folds passed as `cv` to the final cross-validation experiment
cv_experiment_folds = 10

# ---- Input data ----
# Point `data_file` at the state-vector CSV to analyse. Alternative
# locations used by other contributors are kept below for reference.

# Other
# data_file = "state_vectors_....etc.csv"

# Conor
# data_file = "../state_vectors_2018-04-20-15-12-38.csv"

# Andy
data_file = "state_vectors_2018-04-21-20-41-25.csv"

In [4]:
# load dataset
df = pd.read_csv(data_file)

# Take a random sample of the required size. The fixed random_state makes the
# draw repeatable, so re-running the notebook tunes and scores on the same
# rows instead of a different subset each time.
sample_df = df.sample(n=sample_nr_rows, random_state=42)

In [5]:
# Split the sample into target and descriptive features:
#   y -> the 'action' column (target feature)
#   X -> every other column (descriptive features)
y = sample_df['action']
X = sample_df[[name for name in sample_df.columns if name != 'action']]

In [6]:
# Classifier classes (not instances) to tune and compare; the grid-search and
# cross-validation loops instantiate them. The intent is to list 3 here.
# Each class added needs an entry in `param_grids` keyed by its class name.
clfs = [
    RandomForestClassifier,
]

In [7]:
# Grid-search parameter grids, keyed by classifier class name.
# Each inner mapping is: parameter name -> list of candidate values to try.
param_grids = dict(
    RandomForestClassifier=dict(
        criterion=['gini', 'entropy'],
        n_estimators=[5, 10, 15],
        max_features=[None, 5, 8],
    ),
)

In [8]:
# dictionary to store the best set of parameters and the best score for each clf,
# keyed by classifier class name
tuned_clfs = {}

# Run a grid search for each classifier class in `clfs`.
# NOTE(review): the search is fitted on all of X, y, which is the same data the
# cross-validation experiment scores on, so that later estimate is likely
# optimistic. Consider a held-out split or nested CV; confirm intent.
for clf in clfs:
    # The class name is both the lookup key into `param_grids` and the key the
    # results are stored under. `__name__` gives it directly, without relying
    # on the exact text of the class repr.
    clf_name = clf.__name__
    current_GS = GridSearchCV(clf(),
                              param_grids[clf_name],
                              cv=grid_search_cv_folds,
                              verbose=0,
                              return_train_score=True).fit(X, y)

    # store best params and best score in a sub dictionary for clf
    tuned_clfs[clf_name] = {
        'best_params': current_GS.best_params_,
        'best_score': current_GS.best_score_,
    }

In [9]:
# Pretty-print the grid-search results: for each classifier name, the best
# parameter combination found and its best score (as stored in tuned_clfs).
pprint.pprint(tuned_clfs)

{'RandomForestClassifier': {'best_params': {'criterion': 'gini',
                                            'max_features': 8,
                                            'n_estimators': 15},
                            'best_score': 0.82783333333333331}}


In [10]:
# Cross-validation experiment: score each classifier, configured with the best
# hyper-parameters found by the grid search, and keep the mean fold score.
cv_scores = {}
for clf in clfs:
    # Class name is the key used in `tuned_clfs`; `__name__` gives it directly
    # without relying on the exact text of the class repr.
    clf_name = clf.__name__
    params = tuned_clfs[clf_name]['best_params']
    fold_scores = cross_val_score(clf(**params), X, y, cv=cv_experiment_folds)
    cv_scores[clf_name] = np.mean(fold_scores)


In [11]:
# Pretty-print the mean cross-validation score per classifier, obtained with
# each classifier's best grid-search parameters.
pprint.pprint(cv_scores)

{'RandomForestClassifier': 0.83115776288868237}
