In [26]:
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold, train_test_split, cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
import json

In [27]:
X_array = np.load('outputMatrix_userTrimmed.npz')
X = X_array['matrix']

In [3]:
X.shape

(3047, 65871)

In [28]:
# remove features with low variance (ie more than 80% samples have same value)
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_sel = sel.fit_transform(X)
X_sel.shape

(3047, 2838)

In [81]:
# indices for features, so we can trace back to original n_gram map
indices = sel.get_support(indices=True)

indices_dict = {}
count = 0
for i in range(indices.shape[0]):
    indices_dict[count] = indices[i]
    count += 1

In [25]:
# alternative: PCA for dimensionality reduction
#sel = PCA()
#pca = sel.fit(X)
#print pca.explained_variance_ratio_
#X_sel = sel.fit_transform(X)

In [34]:
y_array = np.load('user_class_array.npz')
y_np = y_array['matrix']
y = y_np.tolist()[0]
y = label_binarize(y, classes=[0, 1, 2, 3, 4])

In [8]:
# zero-r
from collections import Counter
data = Counter(y)
print data.most_common()
print 1195/float(len(y))

[(4.0, 1195), (0.0, 1053), (1.0, 623), (3.0, 152), (2.0, 24)]

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.1,
                                                    random_state=9)

In [55]:
# kernel rbf
cv_clf = OneVsRestClassifier(svm.SVC(C=10.0, gamma=0.001, random_state=9))
#cv_clf.set_params(C=0.01, kernel='linear')
#cv_clf.set_params(C=10.0, gamma=0.001, kernel='poly')
#cv_clf.set_params(C=10000.0, gamma=9.9999999999999995e-07, kernel='sigmoid')
kf = KFold(X_train.shape[0], 10)
cv_scores = cross_val_score(cv_clf, X_train, y_train, cv=kf, n_jobs=-1)
print cv_scores, 'CV accuracy: %0.2f (+/- %0.2f)' % (cv_scores.mean(), cv_scores.std()*2)

[ 0.64        0.58545455  0.60948905  0.56569343  0.61313869  0.54744526
  0.64963504  0.56569343  0.60218978  0.60948905] CV accuracy: 0.60 (+/- 0.06)


In [43]:
# kernel sigmoid
cv_clf_sig = OneVsRestClassifier(svm.SVC(C=10000.0, gamma=9.9999999999999995e-07, kernel='sigmoid', random_state=9))
kf_sig = KFold(X_train.shape[0], 10)
cv_scores_sig = cross_val_score(cv_clf_sig, X_train, y_train, cv=kf_sig, n_jobs=-1)
print cv_scores_sig, 'CV accuracy: %0.2f (+/- %0.2f)' % (cv_scores_sig.mean(), cv_scores_sig.std()*2)

[ 0.55272727  0.53818182  0.54014599  0.50729927  0.51459854  0.50364964
  0.58759124  0.51459854  0.54014599  0.5       ] CV accuracy: 0.53 (+/- 0.05)


In [48]:
# kernel linear
cv_clf_lin = OneVsRestClassifier(svm.SVC(C=0.01, gamma=1.0000000000000001e-09, kernel='linear', random_state=9))
kf_lin = KFold(X_train.shape[0], 10)
cv_scores_lin = cross_val_score(cv_clf_sig, X_train, y_train, cv=kf_lin, n_jobs=-1)
print cv_scores_lin, 'CV accuracy: %0.2f (+/- %0.2f)' % (cv_scores_lin.mean(), cv_scores_lin.std()*2)

[ 0.55272727  0.53818182  0.54014599  0.50729927  0.51459854  0.50364964
  0.58759124  0.51459854  0.54014599  0.5       ] CV accuracy: 0.53 (+/- 0.05)


In [7]:
#cv_clf = svm.LinearSVC()
#cv_clf.set_params(C=10.0) #, gamma=0.001)
#kf = KFold(X_train.shape[0], 10)
#cv_scores = cross_val_score(cv_clf, X_train, y_train, cv=kf, n_jobs=-1)
#print cv_scores, 'CV accuracy: %0.2f (+/- %0.2f)' % (cv_scores.mean(), cv_scores.std()*2)

[ 0.68        0.64363636  0.64233577  0.64963504  0.63868613  0.62043796
  0.67518248  0.61313869  0.6350365   0.61678832] CV accuracy: 0.64 (+/- 0.04)


In [16]:
cv_clf.fit(X_train, y_train)

LinearSVC(C=10.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [36]:
feature_weights = cv_clf.coef_

ValueError: coef_ is only available when using a linear kernel

In [32]:
feature_weights_class1 = feature_weights[0]
feature_weights_class2 = feature_weights[1]
feature_weights_class3 = feature_weights[2]
feature_weights_class4 = feature_weights[3]
feature_weights_class5 = feature_weights[4]

In [47]:
top_keys_class1 = sorted(range(len(feature_weights_class1)), key = lambda i: feature_weights_class1[i])[-3:]
top_keys_class2 = sorted(range(len(feature_weights_class2)), key = lambda i: feature_weights_class2[i])[-3:]
top_keys_class3 = sorted(range(len(feature_weights_class3)), key = lambda i: feature_weights_class3[i])[-3:]
top_keys_class4 = sorted(range(len(feature_weights_class4)), key = lambda i: feature_weights_class4[i])[-3:]
top_keys_class5 = sorted(range(len(feature_weights_class5)), key = lambda i: feature_weights_class5[i])[-3:]

In [54]:
with open('ngram_map_trim.txt', 'r') as rf:
    ngram_map_trim = json.loads(rf.read())

In [56]:
ngram_map_trim_flip = dict((v,k) for k,v in ngram_map_trim.iteritems())

In [97]:
ngram_map_trim_flip[indices_dict[top_keys_class5[2]]]

u'gap'

In [18]:
y_pred = cv_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred, labels=range(5)))

[[ 95  13   0   0  10]
 [ 24  27   0   0   9]
 [  1   0   0   0   1]
 [  1   2   0   1   9]
 [  6   5   0   0 101]]


In [56]:
# set final parameters
clf = OneVsRestClassifier(svm.SVC(C=10.0, gamma=0.001, random_state=9))
clf.fit(X_train, y_train)
#clf.set_params(C=10, gamma=0.001)
#clf.set_params(C=0.01, kernel='linear')
#clf.set_params(C=10, gamma=0.001, kernel='poly')
#clf.set_params(C=10000.0, gamma=9.9999999999999995e-07, kernel='sigmoid')

# accuracy of final model on the test set
acc = clf.score(X_test, y_test)
print('Accuracy: {:.4f}'.format(acc))
print ('Total: ' + str(len(y_test)) + ', Correctly classified: ' + str(len(y_test)*acc))

Accuracy: 0.6295
Total: 305, Correctly classified: 192.0


In [45]:
# set final parameters
clf_sig = OneVsRestClassifier(svm.SVC(C=10000.0, gamma=9.9999999999999995e-07, kernel='sigmoid', random_state=9))
clf_sig.fit(X_train, y_train)

# accuracy of final model on the test set
acc_sig = clf_sig.score(X_test, y_test)
print('Accuracy: {:.4f}'.format(acc_sig)), len(y_test)
print ('Total: ' + str(len(y_test)) + ', Correctly classified: ' + str(len(y_test)*acc_sig))

Accuracy: 0.5574 305
Total: 305, Correctly classified: 170.0


In [49]:
# set final parameters
clf_lin = OneVsRestClassifier(svm.SVC(C=0.01, gamma=1.0000000000000001e-09, kernel='linear', random_state=9))
clf_lin.fit(X_train, y_train)

# accuracy of final model on the test set
acc_lin = clf_lin.score(X_test, y_test)
print('Accuracy: {:.4f}'.format(acc_lin)), len(y_test)
print ('Total: ' + str(len(y_test)) + ', Correctly classified: ' + str(len(y_test)*acc_lin))

Accuracy: 0.5574 305
Total: 305, Correctly classified: 170.0


In [128]:
#def vectorize(labels):
#    vectorized_labels = np.zeros((len(labels), 5))
#    for i in range(len(labels)):  
#        vectorized_labels[i][int(labels[i])] = 1
#    return vectorized_labels

#y_test_vect = vectorize(y_test)
#y_pred_vect = vectorize(clf.predict(X_test))
#roc_scores = roc_auc_score(y_test_vect, y_pred_vect)

In [57]:
y_test_vect = label_binarize(y_test, classes=[0, 1, 2, 3, 4])
y_pred = clf.predict(X_test)
y_pred_vect = label_binarize(y_pred, classes=[0, 1, 2, 3, 4])
roc_scores = roc_auc_score(y_test_vect, y_pred_vect, average='micro')
roc_scores

0.8065573770491804

In [58]:
y_test_vect = label_binarize(y_test, classes=[0, 1, 2, 3, 4])
y_pred = clf_sig.predict(X_test)
y_pred_vect = label_binarize(y_pred, classes=[0, 1, 2, 3, 4])
roc_scores_sig = roc_auc_score(y_test_vect, y_pred_vect, average='micro')
roc_scores_sig

0.78647540983606556

In [59]:
y_test_vect = label_binarize(y_test, classes=[0, 1, 2, 3, 4])
y_pred = clf_lin.predict(X_test)
y_pred_vect = label_binarize(y_pred, classes=[0, 1, 2, 3, 4])
roc_scores_lin = roc_auc_score(y_test_vect, y_pred_vect, average='micro')
roc_scores_lin

0.78647540983606556