# Import the libraries and features

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import precision_recall_fscore_support


# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
# NOTE(review): the recorded output of this cell shows "The autoreload extension
# is already loaded" on re-execution; `%reload_ext autoreload` is the form that
# message suggests -- confirm before switching.
%load_ext autoreload
%autoreload 2

# Read the training features, training labels and unlabelled test features
# from the .npy files in the working directory.
features, labels, testdata = (
    np.load(npy_path) for npy_path in ('features.npy', 'labels.npy', 'test.npy')
)

# Sanity check: show the shape of each array that was loaded.
for loaded in (features, labels, testdata):
    print(loaded.shape)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
(200000, 80)
(200000,)
(10000, 80)


In [2]:
# Apply the same L2 normalisation to the training and the test features.
features, testdata = (
    preprocessing.normalize(matrix, norm='l2') for matrix in (features, testdata)
)

# Shapes are printed again to confirm that normalisation left them unchanged.
for checked in (features, labels, testdata):
    print(checked.shape)


(200000, 80)
(200000,)
(10000, 80)


# Todo

Implement a linear classifier on our features.

In [7]:
# Hold out 10% of the labelled samples for evaluation; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=0,
)

# Multinomial logistic regression; C is forwarded unchanged as the model's C argument.
C = 1
clf = LogisticRegression(C=C, solver='lbfgs', multi_class='multinomial')
clf.fit(X_train, y_train)

# Accuracy on the held-out samples.
holdout_accuracy = clf.score(X_test, y_test)
print("Logistic : ", holdout_accuracy)

# Macro-averaged precision / recall on the same held-out samples
# (the f-score and support entries of the result are discarded).
holdout_predictions = clf.predict(X_test)
precision, recall, _, _ = precision_recall_fscore_support(
    y_test, holdout_predictions, average='macro'
)
print("Precision", precision)
print("Recall", recall)

Logistic :  0.606
Precision 0.608371641888
Recall 0.606370301758


In [10]:
from sklearn.neural_network import MLPClassifier

# Two-hidden-layer perceptron (200 then 100 units) trained with L-BFGS on the
# hold-out training split; random_state pins the initialisation.
mlp_settings = dict(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(200, 100),
    random_state=1,
)
clf = MLPClassifier(**mlp_settings)
clf.fit(X_train, y_train)

# Accuracy on the held-out samples.
mlp_accuracy = clf.score(X_test, y_test)
print("Neural : ", mlp_accuracy)

# Macro-averaged precision / recall; the remaining two result entries are unused.
mlp_predictions = clf.predict(X_test)
precision, recall, _, _ = precision_recall_fscore_support(
    y_test, mlp_predictions, average='macro'
)
print("Precision", precision)
print("Recall", recall)

Neural :  0.6545
Precision 0.670417509569
Recall 0.655308588798


In [9]:
from sklearn.svm import SVC

# Support vector classifier with the library's default settings, trained on
# the hold-out training split.
clf = SVC()
clf.fit(X_train, y_train)

# Accuracy on the held-out samples.
svc_accuracy = clf.score(X_test, y_test)
print("SVC : ", svc_accuracy)

# Macro-averaged precision / recall; the remaining two result entries are unused.
svc_predictions = clf.predict(X_test)
precision, recall, _, _ = precision_recall_fscore_support(
    y_test, svc_predictions, average='macro'
)
print("Precision", precision)
print("Recall", recall)

SVC :  0.6196
Precision 0.636904360312
Recall 0.620537174864


# Cross Validation


In [5]:
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier



# 4-fold cross-validation comparing an L1-penalised logistic regression with a
# hinge-loss linear classifier trained by SGD. Each fold prints its index, the
# train/test row indices, and the accuracy of both models on the fold's test part.
#
# NOTE: the loop rebinds X_train / X_test / y_train / y_test / clf, so after this
# cell they hold the last fold's data and model, not the hold-out split's.
CV_SEED = 0  # pins the fold assignment and the SGD shuffling so scores are reproducible
C = 1.0

kf = KFold(n_splits=4, shuffle=True, random_state=CV_SEED)
for k, (train_index, test_index) in enumerate(kf.split(features)):
    print("fold = ", k)
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # An L1 penalty is only accepted by some solvers. Name liblinear explicitly
    # rather than relying on the library default, which is lbfgs in current
    # scikit-learn releases and rejects penalty='l1'.
    clf = LogisticRegression(C=C, penalty='l1', solver='liblinear')
    clf.fit(X_train, y_train)

    print("Logistic : ", clf.score(X_test, y_test))

    # l1_ratio is kept as in the original call; per the scikit-learn docs it is
    # only consulted for penalty='elasticnet', so it has no effect here.
    clf = SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=0.001,
        l1_ratio=0.1,
        fit_intercept=True,
        random_state=CV_SEED,
    )
    clf.fit(X_train, y_train)

    print("SGD : ", clf.score(X_test, y_test))





fold =  0
TRAIN: [     0      2      4 ..., 199997 199998 199999] TEST: [     1      3      5 ..., 199991 199993 199995]
Logistic :  0.59096
SGD :  0.58896
fold =  1
TRAIN: [     0      1      3 ..., 199997 199998 199999] TEST: [     2     10     11 ..., 199990 199992 199994]
Logistic :  0.59034
SGD :  0.5944
fold =  2
TRAIN: [     0      1      2 ..., 199997 199998 199999] TEST: [     4      8     18 ..., 199987 199988 199996]
Logistic :  0.59584
SGD :  0.59294
fold =  3
TRAIN: [     1      2      3 ..., 199994 199995 199996] TEST: [     0      7      9 ..., 199997 199998 199999]
Logistic :  0.59326
SGD :  0.5899


# Output 

In [13]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_predict

# Final model: a single-hidden-layer perceptron (200 units) fitted on ALL
# labelled samples, then used to predict the unlabelled test data.
final_settings = dict(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(200,),
    random_state=1,
)
clf = MLPClassifier(**final_settings)
clf.fit(features, labels)

# This score is computed on the same samples the model was fitted on.
training_score = clf.score(features, labels)
print("Score : ", training_score)

# Predictions for the test data, consumed by the submission cell.
y_pred = clf.predict(testdata)




KeyboardInterrupt: 


Score without division : 0.671045

Score with division : 0.560515

In [123]:
# Write the predictions to "predict.txt" using the project's CSV helper.
from helpers import create_csv_submission

# One id per prediction, numbered from 1.
numbers = np.arange(1, len(y_pred) + 1)
create_csv_submission(numbers, y_pred, "predict.txt")