In [3]:
import numpy as np
from sklearn import neighbors
from sklearn.cross_validation import KFold, train_test_split, cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

In [4]:
X_array = np.load('outputMatrix_userTrimmed.npz')
X = X_array['matrix']

In [5]:
# remove features with low variance (ie more than 80% samples have same value)
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_sel = sel.fit_transform(X)
X_sel.shape

(3047, 2838)

In [6]:
y_array = np.load('user_class_array_scheme2.npz')
y_np = y_array['matrix']
y = y_np.tolist()[0]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.1,
                                                    random_state=9)

In [8]:
kf = KFold(X_train.shape[0], 10)
cv_clf = neighbors.KNeighborsClassifier(14, metric='jaccard')

cv_scores = cross_val_score(cv_clf, X_train, y_train, cv=kf, n_jobs=-1)
print cv_scores, 'CV accuracy: %0.2f (+/- %0.2f)' % (cv_scores.mean(), cv_scores.std()*2)

[ 0.90545455  0.86181818  0.8649635   0.8540146   0.86861314  0.83576642
  0.84671533  0.8649635   0.84306569  0.85766423] CV accuracy: 0.86 (+/- 0.04)


In [9]:
# grid search: best K = 19, jaccard metric
clf = neighbors.KNeighborsClassifier(14, metric='jaccard')

In [10]:
clf.fit(X_train, y_train)

acc = clf.score(X_test, y_test)
print('Accuracy: {:.4f}'.format(acc)), len(y_test)
print ('Total: ' + str(len(y_test)) + ', Correctly classified: ' + str(len(y_test)*acc))

Accuracy: 0.8721 305
Total: 305, Correctly classified: 266.0


In [11]:
y_pred = clf.predict(X_test)
roc_scores = roc_auc_score(y_test, y_pred, average='micro')

In [12]:
roc_scores

0.86112094134300632