In [None]:
import os
import math
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, classification_report
from sklearn.decomposition import PCA, SparsePCA
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer, Normalizer, StandardScaler, scale
from sklearn.externals import joblib

In [None]:
np.random.seed(seed=2323)

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/romeokienzler/developerWorks/master/train.csv")

In [None]:
df.shape

In [None]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

In [None]:
zeros = df.apply(lambda x: not all(x == 0), axis = 0)
df = df[zeros.index[zeros]]
X = df.iloc[:, 1:df.shape[1]].as_matrix()
X = scale(X)

In [None]:
pca = PCA(n_components=2)
Xp = pca.fit_transform(X)

In [None]:
plt.scatter(Xp[:,0], Xp[:,1], c = df['label'], alpha=0.5)
plt.show()

In [None]:
df.head(5)

In [None]:
X = df.iloc[:, 1:df.shape[1]].as_matrix()
X = scale(X)
y = df.iloc[:, 0].as_matrix()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
rf = RandomForestClassifier(n_estimators=500, n_jobs=2)
parameters = {'n_estimators': np.arange(300,1000, 20)}
rclf = GridSearchCV(rf, parameters, n_jobs=2, verbose=True)
rclf.fit(X_train, y_train)

In [35]:
rclf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=470, n_jobs=2, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [37]:
y_test_pred = rclf.predict(X_test)
y_test_true = y_test

#print(roc_auc_score(y_test_true, y_test_pred, average='macro',sample_weight=None))
print(classification_report(y_test_true, y_test_pred))

             precision    recall  f1-score   support

          0       0.98      0.99      0.99       816
          1       0.98      0.99      0.99       909
          2       0.97      0.96      0.97       846
          3       0.96      0.95      0.95       937
          4       0.96      0.97      0.97       839
          5       0.97      0.95      0.96       702
          6       0.97      0.99      0.98       785
          7       0.97      0.96      0.96       893
          8       0.95      0.95      0.95       835
          9       0.94      0.95      0.94       838

avg / total       0.97      0.97      0.97      8400



In [38]:
10.0 ** -np.arange(1, 7)

array([  1.00000000e-01,   1.00000000e-02,   1.00000000e-03,
         1.00000000e-04,   1.00000000e-05,   1.00000000e-06])

In [39]:
nn = MLPClassifier(solver='lbfgs', alpha=1e-6,
        hidden_layer_sizes=(1000, 500, 200, ), random_state=1)

In [40]:
parameters = {'alpha':(10.0 ** -np.arange(1, 7))}
nclf = GridSearchCV(nn, parameters, n_jobs=4, verbose=True)
nclf.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


KeyboardInterrupt: 

In [None]:
clf.best_estimator_

In [None]:
y_test_pred = nclf.predict(X_test)
y_test_true = y_test

#print(roc_auc_score(y_test_true, y_test_pred, average='macro',sample_weight=None))
print(accuracy_score(y_test_true, y_test_pred, normalize=True, sample_weight=None))

In [None]:
print(classification_report(y_test_true, y_test_pred))

In [None]:
parameters = {'weights':[1,1], [1,2], [2,1]}
vc = VotingClassifier(estimators=[('rf', rclf),('nn', nclf)], voting='soft', weights=[3,1]), n_jobs=2)
fclf = GridSearchCV(vc, parameters, n_jobs=2, verbose=True)
fclf.fit(X_train, y_train)

In [None]:
clf.best_estimator_

In [None]:
y_test_pred = fclf.predict(X_test)
y_test_true = y_test

#print(roc_auc_score(y_test_true, y_test_pred, average='macro',sample_weight=None))
print(accuracy_score(y_test_true, y_test_pred, normalize=True, sample_weight=None))