In [2]:
import itertools

import numpy as np
import sklearn
import sklearn.metrics
from matplotlib import pyplot as plt
from sklearn import ensemble
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_sample_weight

%matplotlib inline

In [15]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The matrix is drawn on the current pyplot figure; the caller is
    responsible for creating the figure and for calling ``plt.show()``.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled as true classes and columns as
        predicted classes.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its sum before printing and
        plotting. Rows that sum to zero are left as all zeros.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A row with no samples would give 0/0 = NaN, which poisons both the
        # printed matrix and the colour threshold below; keep it at zero.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # The 'd' format only accepts integers, so fall back to two decimals for
    # any floating-point matrix (normalized or not).
    is_integer_matrix = np.issubdtype(cm.dtype, np.integer)
    fmt = 'd' if (is_integer_matrix and not normalize) else '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Set the axis labels before tight_layout so the layout accounts for them.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [3]:
# Features and labels of the training and validation splits, loaded pairwise.
data_train, label_train = (np.load('../data/train_data.npy'),
                           np.load('../data/train_label.npy'))
data_val, label_val = (np.load('../data/val_data.npy'),
                       np.load('../data/val_label.npy'))

# Test split: only the features are loaded here.
data_test = np.load('../data/test_data.npy')

In [4]:
# Fit a 100-component PCA on the training features only.
# random_state is fixed because the default 'auto' solver may choose the
# randomized SVD, which would otherwise give a slightly different projection
# (and downstream scores) on every run.
pca = PCA(n_components=100, whiten=False, random_state=0).fit(data_train)

In [5]:
# Fraction of the total variance retained by the kept components
print(sum(pca.explained_variance_ratio_))

0.323047415126


In [6]:
# Project the training features onto the fitted PCA components.
X = pca.transform(data_train)
# `np.int` was only an alias of the builtin `int` and was removed in
# NumPy 1.24; the builtin gives the same dtype on every NumPy version.
y = label_train.astype(int)
# Per-sample weights inversely proportional to class frequency.
weights = compute_sample_weight(class_weight='balanced', y=label_train)

In [8]:
# Support-vector classifier constructed with the library's default settings.
cls = SVC()

In [9]:
# Fit on the PCA-projected training data, weighting each sample by `weights`.
cls.fit(X, y, sample_weight=weights)
# Unweighted alternative (disabled): cls.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
# Training accuracy: score the classifier on the same data it was fitted to.
pred = cls.predict(X)
sklearn.metrics.accuracy_score(y_true=y, y_pred=pred)

In [12]:
# Project the validation features with the PCA fitted on the training data.
X_v = pca.transform(data_val)
# `np.int` was only an alias of the builtin `int` and was removed in
# NumPy 1.24; the builtin gives the same dtype on every NumPy version.
y_v = label_val.astype(int)

In [13]:
# Classify the validation split and tabulate true-vs-predicted label counts.
pred_v = cls.predict(X_v)
conf = sklearn.metrics.confusion_matrix(y_true=y_v, y_pred=pred_v)

In [16]:
# Overall accuracy on the validation split.
sklearn.metrics.accuracy_score(y_true=y_v, y_pred=pred_v)

0.28780697975010772

In [212]:
# Per-class precision / recall / F1 on the validation split.
print(sklearn.metrics.classification_report(y_true=y_v, y_pred=pred_v))

             precision    recall  f1-score   support

          0       1.00      0.13      0.23        39
          1       1.00      0.29      0.45        96
          2       1.00      0.22      0.36       207
          3       0.01      0.75      0.02        16
          4       0.67      1.00      0.80        10
          5       0.33      0.87      0.47        15
          6       0.75      1.00      0.86         3
          7       0.00      0.00      0.00         8
          8       0.33      0.33      0.33         6
          9       0.75      0.92      0.83        83
         10       0.73      0.78      0.75        67
         11       0.71      0.80      0.75        71
         12       0.00      0.00      0.00         9
         13       0.00      0.00      0.00        49
         14       0.00      0.00      0.00      1006
         15       1.00      1.00      1.00        16
         16       0.00      0.00      0.00       423
         17       1.00      1.00      1.00   

In [24]:
# Project the test features with the fitted PCA, then classify them.
X_test = pca.transform(data_test)
pred_test = cls.predict(X_test)

In [18]:
# Persist the SVM predictions for the validation and test splits.
np.save(file='../pred_svm_val.npy', arr=pred_v)
np.save(file='../pred_svm_test.npy', arr=pred_test)

In [None]:
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
# Derive the tick labels from the labels that actually occur in the true and
# predicted arrays: that is the row/column set confusion_matrix uses when no
# explicit `labels` are given, so the ticks always line up with `conf` even
# if some class is absent from the validation split.
conf_classes = np.unique(np.concatenate([y_v, pred_v]))
plt.figure(figsize=(30,30))
plot_confusion_matrix(conf, classes=conf_classes,
                      title='Confusion matrix, without normalization')
plt.show()