In [2]:
from collections import Counter

import numpy as np
from sklearn import svm
from sklearn.cross_validation import KFold, train_test_split, cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix

In [3]:
# Unigram feature matrix, stored under the 'matrix' key of the .npz archive.
# np.load on an .npz returns a lazy NpzFile that keeps the archive open, so the
# array is read inside a `with` block to release the file handle right away.
# X_unigrams_array stays bound afterwards, but refers to the closed archive.
with np.load('outputMatrix_unigramsTrimmed.npz') as X_unigrams_array:
    X_unigrams = X_unigrams_array['matrix']

In [5]:
# Sanity check: dimensions of the loaded unigram matrix (samples x features).
X_unigrams.shape

(3047, 18243)

In [7]:
# Feature selection: discard low-variance columns.
# The cutoff p * (1 - p) with p = 0.8 is the variance of a Bernoulli(p)
# variable, so for 0/1 features it drops those that take the same value in
# more than 80% of the samples.
# NOTE(review): that reading only holds for boolean features -- confirm that
# X_unigrams is binary; otherwise this is simply a variance cutoff of 0.16.
unigrams_sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
unigrams_sel.fit(X_unigrams)
X_unigrams_sel = unigrams_sel.transform(X_unigrams)
X_unigrams_sel.shape

(3047, 1500)

In [10]:
# Alternative to variance thresholding: PCA for dimensionality reduction.
# NOTE(review): this cell originally read `X`, which no visible cell defines;
# X_unigrams is assumed to be the intended input -- confirm.
sel = PCA()
# Fit once and project in the same call (the original fitted the model twice:
# fit(X) followed by fit_transform(X)).
X_sel = sel.fit_transform(X_unigrams)
# fit() returns the estimator itself, so `pca` has always been an alias of `sel`.
pca = sel
print(pca.explained_variance_ratio_)

[  8.34908026e-02   1.50086412e-02   9.66355408e-03 ...,   2.39873001e-06
   7.73328809e-14   1.21697822e-32]


In [9]:
# Class labels, stored under the 'matrix' key of the .npz archive.
# Read inside a `with` block so the archive's file handle is closed promptly;
# y_array stays bound afterwards, but refers to the closed archive.
with np.load('user_class_array.npz') as y_array:
    y_np = y_array['matrix']
# Only the first row of y_np is used as the label vector; it is converted to a
# plain Python list.
y = y_np.tolist()[0]

In [8]:
# Class distribution: (label, count) pairs ordered from most to least frequent.
# Counter is imported in the notebook's import cell.
data = Counter(y)
data.most_common()

[(4.0, 1195), (0.0, 1053), (1.0, 623), (3.0, 152), (2.0, 24)]

In [9]:
# zero-r baseline: accuracy obtained by always predicting the majority class.
# The majority count is derived from the labels instead of hard-coding 1195,
# so the baseline stays correct if the label file changes.
max(Counter(y).values()) / float(len(y))

0.39218903839842467

In [13]:
# Hold out 10% of the samples as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_uni_train, X_uni_test, y_uni_train, y_uni_test = train_test_split(
    X_unigrams, y, test_size=0.1, random_state=9)

In [None]:
# SVM parameters
# C: from 10^-5 to 10^5

In [None]:
# 10-fold cross-validation of an SVC on the unigram training split.
# Alternative settings kept from the original commented-out lines:
#   C=0.01, kernel='linear'
#   C=10.0, gamma=0.001, kernel='poly'
#   C=10000.0, gamma=9.9999999999999995e-07, kernel='sigmoid'
cv_clf = svm.SVC(C=10.0, gamma=0.001)
kf = KFold(X_uni_train.shape[0], 10)
cv_scores = cross_val_score(cv_clf, X_uni_train, y_uni_train, cv=kf, n_jobs=-1)
# Per-fold scores followed by mean accuracy and a +/- 2 standard deviation band.
print('%s CV accuracy: %0.2f (+/- %0.2f)' % (cv_scores, cv_scores.mean(), cv_scores.std()*2))

In [39]:
# 10-fold cross-validation of a linear SVM on the unigram training split.
# NOTE(review): this cell originally read X_train / y_train, which no visible
# cell defines; the X_uni_* split is assumed to be the intended data -- confirm.
cv_clf = svm.LinearSVC(C=10.0)
kf = KFold(X_uni_train.shape[0], 10)
cv_scores = cross_val_score(cv_clf, X_uni_train, y_uni_train, cv=kf, n_jobs=-1)
# Single formatted string so the output is the same on Python 2 and Python 3:
# per-fold scores, then mean accuracy and a +/- 2 standard deviation band.
print('%s CV accuracy: %0.2f (+/- %0.2f)' % (cv_scores, cv_scores.mean(), cv_scores.std()*2))

[ 0.68        0.64363636  0.64233577  0.64963504  0.63868613  0.62043796
  0.67518248  0.61313869  0.6350365   0.61678832] CV accuracy: 0.64 (+/- 0.04)


In [15]:
# Fit the cross-validated classifier on the whole training split.
# NOTE(review): originally fitted on X_train / y_train, which no visible cell
# defines; the X_uni_* split is assumed to be the intended data -- confirm.
# NOTE(review): the saved output shows an RBF SVC, but on a top-to-bottom run
# cv_clf is the LinearSVC bound in the preceding cell -- confirm which model
# is meant to be fitted here.
cv_clf.fit(X_uni_train, y_uni_train)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
# Confusion matrix on the held-out split (rows: true class, columns: predicted
# class), with classes 0..4 in order.
# NOTE(review): originally evaluated on X_test / y_test, which no visible cell
# defines; the X_uni_* split is assumed to be the intended data -- confirm.
y_pred = cv_clf.predict(X_uni_test)
print(confusion_matrix(y_uni_test, y_pred, labels=list(range(5))))

[[ 95  13   0   0  10]
 [ 24  27   0   0   9]
 [  1   0   0   0   1]
 [  1   2   0   1   9]
 [  6   5   0   0 101]]


In [15]:
# set final parameters
clf = svm.SVC()
clf.fit(X_train, y_train)
#clf.set_params(C=10, gamma=0.001)
#clf.set_params(C=0.01, kernel='linear')
#clf.set_params(C=10, gamma=0.001, kernel='poly')
clf.set_params(C=10000.0, gamma=9.9999999999999995e-07, kernel='sigmoid')

# accuracy of final model on the test set
acc = clf.score(X_test, y_test)
print('Accuracy: {:.4f}'.format(acc))
print ('Total: ' + str(len(y_test)) + ', Correctly classified: ' + str(len(y_test)*acc))

Accuracy: 0.7311
Total: 305, Correctly classified: 223.0


In [13]:
# set final parameters
clf = svm.LinearSVC()
clf.fit(X_train, y_train)
clf.set_params(C=10)

# accuracy of final model on the test set
acc = clf.score(X_test, y_test)
print('Accuracy: {:.4f}'.format(acc)), len(y_test)
print ('Total: ' + str(len(y_test)) + ', Correctly classified: ' + str(len(y_test)*acc))

Accuracy: 0.7049 305
Total: 305, Correctly classified: 215.0
