## Feed Unprocessed Data into Classifiers, Score, and Measure Accuracy

In [87]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

In [88]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV


### Load the Data from Pickled DataFrames

In [89]:
pwd

'/home/jovyan/madelon/ipynb'

In [114]:
cook_total_sample = pd.read_pickle('../assets/pickled_samples/cook_total_samples.p')
madelon_train_sample = pd.read_pickle('../assets/pickled_samples/madelon_sample_train.p')
madelon_train_sample_label = pd.read_pickle('../assets/pickled_samples/madelon_sample_train_labels.p')

**Madelon:** It's not necessary to load in the test set since that's the hold out data to test the classification model's accuracy. Train/test/split on the training data. 


### Run the Data through the Classifiers and obtain Train & Test scores

#### Madelon Dataset

In [115]:
madelon_train_sample.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
138,479,438,533,490,485,479,493,479,504,483,...,477,472,495,564,474,516,482,469,574,482
472,478,476,514,491,619,471,565,475,482,475,...,465,480,469,745,455,460,474,475,548,550
1422,481,492,475,479,527,480,508,474,480,475,...,477,482,501,603,433,524,476,490,467,532
162,472,424,493,478,464,479,504,474,493,471,...,450,475,482,520,517,527,476,515,501,504
277,473,435,534,485,504,482,483,475,488,480,...,484,479,532,645,513,549,472,497,536,448


In [116]:
madelon_train_sample.shape

(600, 500)

In [117]:
madelon_train_sample_label.shape

(600,)

In [118]:
mad_X_train, mad_X_test, mad_y_train, mad_y_test = train_test_split(madelon_train_sample,\
                                                                    madelon_train_sample_label)

In [119]:
display(mad_X_train.shape)
display(mad_X_test.shape)
display(mad_y_train.shape)
display(mad_y_test.shape)

(450, 500)

(150, 500)

(450,)

(150,)

#### Madelon Dataset (Raw Benchmarking without any Preprocessing)
Uses the out of the box default parameters provided by `sklearn` for the selected classification models.

In [120]:
names_of_classifiers = ['LogisticRegression', 'KNeighbors', 'DecisionTree', 'SVClassifier']

classifiers = [
    LogisticRegression(n_jobs=-1, random_state=42),
    KNeighborsClassifier(n_jobs=-1),
    DecisionTreeClassifier(random_state=42),
    SVC(random_state=42)]

Store the results in a dictionary to subsequenty be able to throw the results to compare into a pandas DataFrame

In [123]:
mad_raw_test_scores = {}
mad_raw_train_scores = {}
mad_raw_y_preds = {}

for name, clfr in zip(names_of_classifiers, classifiers):
    clfr.fit(mad_X_train, mad_y_train)
    
    train_score = clfr.score(mad_X_train, mad_y_train)
    test_score = clfr.score(mad_X_test, mad_y_test)
    y_pred = clfr.predict(mad_X_test)
    
    mad_raw_train_scores[name] = train_score
    mad_raw_test_scores[name] = test_score
    mad_raw_y_preds[name] = y_pred
    

In [124]:
mad_raw_test_scores

{'DecisionTree': 0.59999999999999998,
 'KNeighbors': 0.71333333333333337,
 'LogisticRegression': 0.62,
 'SVClassifier': 0.64666666666666661}

In [125]:
mad_raw_train_scores

{'DecisionTree': 1.0,
 'KNeighbors': 0.82222222222222219,
 'LogisticRegression': 1.0,
 'SVClassifier': 1.0}

In [126]:
mad_raw_y_preds

{'DecisionTree': array([ 1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1,
        -1, -1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1, -1,  1,  1, -1,  1,
        -1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1,
        -1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1,  1,  1, -1, -1,  1,
        -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,
        -1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,
         1,  1,  1, -1,  1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,
        -1,  1,  1,  1,  1,  1,  1, -1, -1,  1, -1,  1, -1, -1, -1, -1,  1,
         1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1]),
 'KNeighbors': array([-1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1,  1,
        -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
         1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1,  1,
        -1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1, -1, 

#### Cook Dataset

In [101]:
cook_total_sample.head()

Unnamed: 0,_id,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_991,feat_992,feat_993,feat_994,feat_995,feat_996,feat_997,feat_998,feat_999,target
0,116031,-0.063592,-0.935132,-0.788636,2.006542,0.057752,-0.612374,-0.31929,-0.130704,-0.426335,...,0.079754,-0.609663,1.101417,-0.485404,0.085902,-0.780068,0.155906,0.241406,0.538386,1
1,24415,-0.452243,0.258384,0.620509,0.38908,-0.197159,0.829617,-0.059411,0.910375,-0.323078,...,-0.634202,0.556551,2.037437,-0.4826,-1.418812,0.0792,-0.368648,0.219643,-0.10873,1
2,115872,1.073645,-1.01595,-0.355322,0.452687,-0.744907,-0.776871,0.385545,0.576864,-0.339835,...,-0.270593,0.25033,0.173127,-0.67309,-0.450532,1.538424,0.276987,-0.257989,-0.351097,1
3,62456,-0.269215,1.790995,-0.171136,0.258013,-0.215587,-0.516337,-0.228766,-0.446238,0.41839,...,0.7739,-0.321531,0.847676,-1.532333,-0.613422,-1.498944,-1.059311,0.628973,-0.830657,0
4,173909,0.398804,0.579328,-0.905363,-0.12414,-0.545298,0.409123,-0.179135,0.275275,-0.253539,...,-0.643034,-0.752793,0.176453,0.234722,1.122761,-1.139794,1.231819,-0.783419,1.448478,1


In [105]:
cook_target = cook_total_sample['target']
cook_features = cook_total_sample.drop(['_id', 'target'], axis=1)

In [106]:
display(cook_target.shape)
display(cook_features.shape)

(6600,)

(6600, 1000)

In [107]:
cook_X_train, cook_X_test, cook_y_train, cook_y_test = train_test_split(cook_features, cook_target)

In [None]:
def calculate_log_loss (y_true, y_pred):
    