In [1]:
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold, train_test_split, cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
import numpy as np

In [2]:
X_array = np.load('outputMatrix_userTrimmed.npz')
X = X_array['matrix']

In [4]:
X.shape

(3047, 65871)

In [3]:
# remove features with low variance (ie more than 80% samples have same value)
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_sel = sel.fit_transform(X)
X_sel.shape

(3047, 2838)

In [9]:
# alternative: PCA for dimensionality reduction
sel = PCA()
pca = sel.fit(X)
print pca.explained_variance_ratio_
X_sel = sel.fit_transform(X)

[  8.34908026e-02   1.50086412e-02   9.66355408e-03 ...,   2.39873001e-06
   7.73328809e-14   1.21697822e-32]


In [17]:
X_sel.shape

(3047, 3047)

In [4]:
y_array = np.load('user_class_array_scheme2.npz')
y_np = y_array['matrix']

In [5]:
#X = X_np.tolist()
y = y_np.tolist()[0]

In [7]:
from collections import Counter
data = Counter(y)
data.most_common()

[(0.0, 1676), (1.0, 1371)]

In [9]:
# zero-r
1676/float(len(y))

0.5500492287495897

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.1,
                                                    random_state=9)

In [None]:
# svm parameters
# C = 10^-5 -> 10^5

In [15]:
cv_clf = svm.SVC()
cv_clf.set_params(C=10.0, gamma=0.001)
kf = KFold(X_train.shape[0], 10)
cv_scores = cross_val_score(cv_clf, X_train, y_train, cv=kf, n_jobs=-1)
print cv_scores, 'CV accuracy: %0.2f (+/- %0.2f)' % (cv_scores.mean(), cv_scores.std()*2)
#[cv_clf.fit(X_train[train_indices], y_train[train_indices]).score(X_train[test_indices],y_train[test_indices])
#for train_indices, test_indices in kf_total]

[ 0.81818182  0.79272727  0.82116788  0.83211679  0.82116788  0.74452555
  0.82116788  0.83941606  0.81021898  0.81021898] CV accuracy: 0.81 (+/- 0.05)


In [16]:
cv_clf = svm.LinearSVC()
cv_clf.set_params(C=10.0) #, gamma=0.001)
kf = KFold(X_train.shape[0], 10)
cv_scores = cross_val_score(cv_clf, X_train, y_train, cv=kf, n_jobs=-1)
print cv_scores, 'CV accuracy: %0.2f (+/- %0.2f)' % (cv_scores.mean(), cv_scores.std()*2)

[ 0.91272727  0.89454545  0.89051095  0.87591241  0.89781022  0.87591241
  0.91605839  0.89051095  0.86131387  0.89416058] CV accuracy: 0.89 (+/- 0.03)


In [15]:
cv_clf.fit(X_train, y_train)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
y_pred = cv_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred, labels=range(5)))

[[ 95  13   0   0  10]
 [ 24  27   0   0   9]
 [  1   0   0   0   1]
 [  1   2   0   1   9]
 [  6   5   0   0 101]]


In [11]:
# set final parameters
clf = svm.SVC()
clf.fit(X_train, y_train)
clf.set_params(C=10, gamma=0.001)

# accuracy of final model on the test set
acc = clf.score(X_test, y_test)
print('Accuracy: {:.4f}'.format(acc))
print ('Total: ' + str(len(y_test)) + ', Correctly classified: ' + str(len(y_test)*acc))

Accuracy: 0.8984
Total: 305, Correctly classified: 274.0


In [12]:
# set final parameters
clf = svm.LinearSVC()
clf.fit(X_train, y_train)
clf.set_params(C=10)

# accuracy of final model on the test set
acc = clf.score(X_test, y_test)
print('Accuracy: {:.4f}'.format(acc)), len(y_test)
print ('Total: ' + str(len(y_test)) + ', Correctly classified: ' + str(len(y_test)*acc))

Accuracy: 0.8984 305
Total: 305, Correctly classified: 274.0
