In [5]:
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold, train_test_split, cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.multiclass import OneVsRestClassifier
import numpy as np

In [6]:
X_array = np.load('outputMatrix_userTrimmed.npz')
X = X_array['matrix']

In [4]:
X.shape

(3047, 65871)

In [7]:
# remove features with low variance (ie more than 80% samples have same value)
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_sel = sel.fit_transform(X)
X_sel.shape

(3047, 2838)

In [8]:
# alternative: PCA for dimensionality reduction
#sel = PCA()
#pca = sel.fit(X)
#print pca.explained_variance_ratio_
#X_sel = sel.fit_transform(X)

In [17]:
X_sel.shape

(3047, 3047)

In [9]:
y_array = np.load('user_class_array_scheme2.npz')
y_np = y_array['matrix']

In [10]:
#X = X_np.tolist()
y = y_np.tolist()[0]

In [7]:
from collections import Counter
data = Counter(y)
data.most_common()
# zero-r
1676/float(len(y))

[(0.0, 1676), (1.0, 1371)]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.1,
                                                    random_state=9)

In [12]:
# kernel rbf
cv_clf = OneVsRestClassifier(svm.SVC(C=10.0, gamma=0.001, random_state=9))
kf = KFold(X_train.shape[0], 10)
cv_scores = cross_val_score(cv_clf, X_train, y_train, cv=kf, n_jobs=-1)
print cv_scores, 'CV accuracy: %0.2f (+/- %0.2f)' % (cv_scores.mean(), cv_scores.std()*2)

[ 0.92363636  0.88        0.89051095  0.86861314  0.88686131  0.8649635
  0.89781022  0.89051095  0.84671533  0.86131387] CV accuracy: 0.88 (+/- 0.04)


In [None]:
# kernel sigmoid
cv_clf_sig = OneVsRestClassifier(svm.SVC(C=10000000.0, gamma=9.9999999999999995e-07, kernel='sigmoid', random_state=9))
kf_sig = KFold(X_train.shape[0], 10)
cv_scores_sig = cross_val_score(cv_clf_sig, X_train, y_train, cv=kf_sig, n_jobs=-1)
print cv_scores_sig, 'CV accuracy: %0.2f (+/- %0.2f)' % (cv_scores_sig.mean(), cv_scores_sig.std()*2)

In [16]:
# kernel linear
cv_clf_lin = OneVsRestClassifier(svm.SVC(C=10.0, gamma=0.001, kernel='linear', random_state=9))
kf_lin = KFold(X_train.shape[0], 10)
cv_scores_lin = cross_val_score(cv_clf_sig, X_train, y_train, cv=kf_lin, n_jobs=-1)
print cv_scores_lin, 'CV accuracy: %0.2f (+/- %0.2f)' % (cv_scores_lin.mean(), cv_scores_lin.std()*2)

[ 0.91272727  0.89454545  0.89051095  0.87591241  0.89781022  0.87591241
  0.91605839  0.89051095  0.86131387  0.89416058] CV accuracy: 0.89 (+/- 0.03)


In [18]:
y_pred = cv_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred, labels=range(5)))

[[ 95  13   0   0  10]
 [ 24  27   0   0   9]
 [  1   0   0   0   1]
 [  1   2   0   1   9]
 [  6   5   0   0 101]]


In [13]:
# final parameters, rbf
clf = OneVsRestClassifier(svm.SVC(C=10.0, gamma=0.001, random_state=9))
clf.fit(X_train, y_train)

# accuracy of final model on the test set
acc = clf.score(X_test, y_test)
print('Accuracy: {:.4f}'.format(acc))
print ('Total: ' + str(len(y_test)) + ', Correctly classified: ' + str(len(y_test)*acc))

Accuracy: 0.8918
Total: 305, Correctly classified: 272.0


In [None]:
# final parameters, sigmoid
clf_sig = OneVsRestClassifier(svm.SVC(C=10.0, gamma=0.001, kernel = 'sigmoid', random_state=9))
clf_sig.fit(X_train, y_train)

# accuracy of final model on the test set
acc_sig = clf_sig.score(X_test, y_test)
print('Accuracy: {:.4f}'.format(acc))
print ('Total: ' + str(len(y_test)) + ', Correctly classified: ' + str(len(y_test)*acc_sig))

In [17]:
# final parameters, linear
clf_lin = OneVsRestClassifier(svm.SVC(C=10.0, gamma=0.001, kernel = 'linear', random_state=9))
clf_lin.fit(X_train, y_train)
clf_lin.set_params(C=10)

# accuracy of final model on the test set
acc_lin = clf.score(X_test, y_test)
print('Accuracy: {:.4f}'.format(acc_lin)), len(y_test)
print ('Total: ' + str(len(y_test)) + ', Correctly classified: ' + str(len(y_test)*acc_lin))

Accuracy: 0.8689 305
Total: 305, Correctly classified: 265.0


In [14]:
y_pred = clf.predict(X_test)
roc_scores = roc_auc_score(y_test, y_pred, average='micro')
roc_scores

0.88812704591701319

In [19]:
y_pred_sig = clf_sig.predict(X_test)
roc_scores_sig = roc_auc_score(y_test, y_pred_sig, average='micro')
roc_scores_sig

0.86395204812881521

In [None]:
y_pred_lin = clf_lin.predict(X_test)
roc_scores_lin = roc_auc_score(y_test, y_pred_lin, average='micro')
roc_scores_lin