## Feed RAW Data into Classifiers, Score, and Measure Accuracy

In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV


### Load the Data from Pickled DataFrames

In [10]:
cook_sample = pd.read_pickle('../assets/cook_sample.p')
madelon_train10 = pd.read_pickle('../assets/madelon_train_10.p')
madelon_train_label10 = pd.read_pickle('../assets/madelon_train_label10.p')

**Madelon:** It's not necessary to load in the test set since that's the hold out data to test the classification model's accuracy. Train/test/split on the training data. 


### Run the Data through the Classifiers and obtain Train & Test scores

#### Madelon Dataset

In [8]:
madelon_train10.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
388,477,466,531,486,459,484,506,477,491,494,...,454,476,490,410,533,498,477,481,524,483
414,483,503,549,485,524,487,434,479,497,470,...,481,481,493,685,502,447,484,473,527,509
710,479,489,473,477,483,473,404,474,493,478,...,485,483,498,326,475,507,490,474,504,467
1165,483,491,471,485,494,474,490,477,470,479,...,493,476,464,281,527,487,491,468,467,485
1743,477,442,559,481,529,484,496,478,495,469,...,467,474,513,539,516,490,475,492,513,510


In [24]:
madelon_train10.shape

(200, 500)

In [23]:
madelon_train_label10.shape

(200,)

In [37]:
mad_X_train, mad_X_test, mad_y_train, mad_y_test = train_test_split(madelon_train10,\
                                                                    madelon_train_label10)

In [38]:
display(mad_X_train.shape)
display(mad_X_test.shape)
display(mad_y_train.shape)
display(mad_y_test.shape)

(150, 500)

(50, 500)

(150,)

(50,)

The below methodology was borrowed from sklearn's classifier comparison but applied to my dataset:

http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [40]:
names_of_classifiers = ['LogisticRegression', 'KNeighbors', 'DecisionTree', 'SVClassifier']

classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(n_jobs=-1),
    DecisionTreeClassifier(),
    SVC()]

Store the results in a dictionary to subsequenty be able to throw the results to compare into a pandas DataFrame

In [52]:
raw_test_scores = {}

for name, clfr in zip(names_of_classifiers, classifiers):
    clfr.fit(mad_X_train, mad_y_train)
    test_score = clfr.score(mad_X_test, mad_y_test)
    raw_test_scores[name] = test_score

In [54]:
raw_test_scores

{'DecisionTree': 0.44,
 'KNeighbors': 0.68000000000000005,
 'LogisticRegression': 0.64000000000000001,
 'SVClassifier': 0.38}