In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.grid_search import GridSearchCV
plt.style.use('ggplot')

%matplotlib inline

In [159]:
"""
Rading data
"""
data = pd.read_csv("data/initial_data.csv", header=0)
label_col = "DriveTrain"
classes = data[label_col].unique()
categorical_features = [col for col in data.columns if data[col].dtype.name == 'object']
numerical_features = [col for col in data.columns if data[col].dtype.name != 'object']

# Preprocessing

In [160]:
"""
Filling N/A
"""
data_processed = data.fillna(data.median(axis=0), axis=0)
data_describe = data.describe(include=[object])
for col in categorical_features:
    data_processed[col] = data_processed[col].fillna(data_describe[c]['top'])

In [161]:
"""
Normalization of numerical data
"""
data_numerical = data_processed[numerical_features]
data_numerical = (data_numerical - data_numerical.mean()) / data_numerical.std()

for col in data_numerical:
    data_processed[col] = data_numerical[col]


In [162]:
"""
Vectorization
"""
binary_features    = [col for col in categorical_features if data_describe[c]['unique'] == 2]
nonbinary_features = [col for col in categorical_features if data_describe[c]['unique'] > 2]

for col in binary_features[1:]:
    top = data_describe[col]['top']
    top_items = data_processed[col] == top
    data_processed.loc[top_items, col] = 0
    data_processed.loc[np.logical_not(top_items), col] = 1
    
    
data_not_binary = pd.get_dummies(data_processed[nonbinary_features])

for col in data_not_binary:
    data_processed[col] = data_not_binary[col]

data_processed = data_processed.drop(nonbinary_features, axis=1)


In [116]:
"""
Dimensionality reduction
"""
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(data_processed)
data_pca = pca.transform(data_processed)

In [129]:
"""
kNN, k-Fold
"""
from sklearn.neighbors import KNeighborsClassifier

X = data_pca
y = data[label_col]

n_neighbors_array = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15]
best_n_neighbors_vals = []

kf = KFold(n_splits=2)
for train_index, test_index in kf.split(X):    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    knn_clf = KNeighborsClassifier()
    grid = GridSearchCV(knn_clf, param_grid={'n_neighbors': n_neighbors_array})
    grid.fit(X_train, y_train)
    best_n_neighbors = grid.best_estimator_.n_neighbors
    best_n_neighbors_vals.append(best_n_neighbors)
    scores = cross_val_score(grid, X_test, y_test, cv=5)
print("Values of best parameters:")
print(best_n_neighbors_vals)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))




Values of best parameters:
[3, 9]
Accuracy: 0.68 (+/- 0.08)


In [145]:
"""
kNN, stratified k-Fold
"""

from sklearn.neighbors import KNeighborsClassifier

X = data_pca
y = data[label_col]

n_neighbors_array = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
best_n_neighbors_vals = []

skf = StratifiedKFold(n_splits=2)
for train_index, test_index in skf.split(X, y):    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    knn_clf = KNeighborsClassifier()
    grid = GridSearchCV(knn_clf, param_grid={'n_neighbors': n_neighbors_array})
    grid.fit(X_train, y_train)
    best_n_neighbors = grid.best_estimator_.n_neighbors
    best_n_neighbors_vals.append(best_n_neighbors)
    scores = cross_val_score(knn_clf, X_test, y_test, cv=5)
print("Values of best parameters:")
print(best_n_neighbors_vals)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Values of best parameters:
[5, 11]
Accuracy: 0.74 (+/- 0.07)


In [150]:
"""
Decision tree, stratified k-Fold
"""

from sklearn import tree 

X = data_pca
y = data[label_col]


skf = StratifiedKFold(n_splits=2)
for train_index, test_index in skf.split(X, y):    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    tree_clf = tree.DecisionTreeClassifier(max_depth=5, random_state=17)    
    tree_clf.fit(X_train, y_train)    
    scores = cross_val_score(tree_clf, X_test, y_test, cv=5)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.72 (+/- 0.17)


In [141]:
"""
Random forest, stratified k-Fold
"""

from sklearn import ensemble

X = data_pca
y = data[label_col]



rf_clf = ensemble.RandomForestClassifier(n_estimators=100, random_state=11)


skf = StratifiedKFold(n_splits=2)
for train_index, test_index in skf.split(X, y):    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    rf_clf = ensemble.RandomForestClassifier(n_estimators=100, random_state=11)    
    rf_clf.fit(X_train, y_train)    
    scores = cross_val_score(rf_clf, X_test, y_test, cv=5)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.73 (+/- 0.17)


In [164]:
"""
Random forest, stratified k-Fold, feature selection
"""

from sklearn import ensemble

X = data_pca
y = data[label_col]

rf_clf = ensemble.RandomForestClassifier(n_estimators=100, random_state=11)


skf = StratifiedKFold(n_splits=2)
for train_index, test_index in skf.split(X, y):    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    rf_clf = ensemble.RandomForestClassifier(n_estimators=100, random_state=11)    
    rf_clf.fit(X_train, y_train)    
    scores = cross_val_score(rf_clf, X_test, y_test, cv=5)
    
feature_names = data.columns
    
importances = rf_clf.feature_importances_
print(importances)
indices = np.argsort(importances)[::-1]

print("Feature importances:")
for f, idx in enumerate(indices):
    print("{:2d}. feature '{:5s}' ({:.4f})".format(f + 1, feature_names[idx], importances[idx]))

[ 0.51948112  0.48051888]
Feature importances:
 1. feature 'Manufacturer' (0.5195)
 2. feature 'Model' (0.4805)


In [165]:
"""
GBT, stratified k-Fold
"""
from sklearn import ensemble

X = data_pca
y = data[label_col]


skf = StratifiedKFold(n_splits=2)
for train_index, test_index in skf.split(X, y):    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    gbt_clf = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=11)
    gbt_clf.fit(X_train, y_train)
    scores = cross_val_score(gbt_clf, X_test, y_test, cv=5)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.74 (+/- 0.22)
