# SVM Parameter Optimisation

## Load required imports

In [14]:
# Make common scripts visible.
# The path is relative, so it is resolved against the kernel's current working
# directory; start the notebook from its own folder or the local imports below fail.
import sys
sys.path.append('../common/')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Project-local helpers (presumably located in ../common/ — confirm).
from loader import load_preprocessed_data
from lookup_tables import topic_to_int

## Load training and test data

In [7]:
# Locations of the preprocessed splits, relative to the notebook's working directory.
TRAIN_CSV = 'data/ohsumed_no_stopwords_train.csv'
TEST_CSV = 'data/ohsumed_no_stopwords_test.csv'

# Each call yields a pair that is unpacked into inputs (x) and targets (y).
train_x, train_y = load_preprocessed_data(TRAIN_CSV)
test_x, test_y = load_preprocessed_data(TEST_CSV)

## Find the best parameters on the training set

In [9]:
# Fixed seed so that Restart & Run All reproduces the same fits: LinearSVC's
# dual solver (required for loss='hinge') uses a random number generator.
RANDOM_STATE = 42

# Two-step pipeline: token counts -> linear SVM.
ohsumed_classifier = Pipeline([
    # Unigram counts (binary=False keeps term frequencies); the vectorizer does
    # not lowercase the text itself.
    ('vect', CountVectorizer(lowercase = False,
                             binary = False,
                             ngram_range = (1,1)
                             )),
    # class_weight='balanced' re-weights classes inversely to their frequency.
    ("linear_svc", LinearSVC(loss='hinge',
                             class_weight='balanced',
                             max_iter=10000,
                             random_state=RANDOM_STATE))])

# Candidate values for the SVM regularisation strength C.
# Keys follow the Pipeline convention '<step name>__<parameter>'.
parameters = {
    'linear_svc__C': [0.001, 0.005, 0.01, 0.05, 0.1, 1, 10]
}

# 3-fold cross-validated search over C using the estimator's default score.
# NOTE: the former `iid=False` argument was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError. Since 0.22 the default
# behaviour is the same as iid=False, so the argument is simply dropped.
grid_search = GridSearchCV(ohsumed_classifier, parameters, cv=3, verbose=10)
grid_search.fit(train_x, train_y)

# Print out the best parameters found
print('Best parameters found')
print(grid_search.best_params_)

Fitting 3 folds for each of 7 candidates, totalling 21 fits
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .... linear_svc__C=0.001, score=0.7056737588652482, total=   8.7s
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.4s remaining:    0.0s


[CV] .... linear_svc__C=0.001, score=0.7048229756463606, total=   7.9s
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   23.7s remaining:    0.0s


[CV] ..... linear_svc__C=0.001, score=0.707761621953717, total=   7.9s
[CV] linear_svc__C=0.005 .............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   35.1s remaining:    0.0s


[CV] .... linear_svc__C=0.005, score=0.7359519912711402, total=  11.3s
[CV] linear_svc__C=0.005 .............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   49.8s remaining:    0.0s


[CV] .... linear_svc__C=0.005, score=0.7381813220547104, total=   9.7s
[CV] linear_svc__C=0.005 .............................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.0min remaining:    0.0s


[CV] ..... linear_svc__C=0.005, score=0.738139122124377, total=   9.2s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0s


[CV] ...... linear_svc__C=0.01, score=0.736088379705401, total=  13.6s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.5min remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.7366805375537213, total=  10.1s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.8min remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.7373882176257766, total=  11.0s
[CV] linear_svc__C=0.05 ..............................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.0min remaining:    0.0s


[CV] ...... linear_svc__C=0.05, score=0.710038188761593, total=  13.5s
[CV] linear_svc__C=0.05 ..............................................
[CV] ..... linear_svc__C=0.05, score=0.7151920321986492, total=  14.2s
[CV] linear_svc__C=0.05 ..............................................
[CV] ..... linear_svc__C=0.05, score=0.7164994197556147, total=  12.7s
[CV] linear_svc__C=0.1 ...............................................
[CV] ...... linear_svc__C=0.1, score=0.6976268412438625, total=  14.5s
[CV] linear_svc__C=0.1 ...............................................
[CV] ...... linear_svc__C=0.1, score=0.7058462378061259, total=  15.4s
[CV] linear_svc__C=0.1 ...............................................
[CV] ...... linear_svc__C=0.1, score=0.7066011331831524, total=  14.4s
[CV] linear_svc__C=1 .................................................
[CV] ........ linear_svc__C=1, score=0.6826923076923077, total=  15.1s
[CV] linear_svc__C=1 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:  5.7min finished


Best parameters found
{'linear_svc__C': 0.005}


## Make predictions on the test set

In [16]:
# Predict with the fitted search object; with the default refit=True,
# GridSearchCV delegates to the estimator retrained using the best parameters.
predict_y = grid_search.predict(test_x)

# classification_report lists classes in sorted label order, so the display
# names are ordered by their integer code instead of relying on the dict's
# insertion order, and are passed as a real list rather than a dict view.
# NOTE(review): assumes topic_to_int maps topic name -> the integer label used
# in test_y — confirm against lookup_tables / loader.
topic_names = sorted(topic_to_int, key=topic_to_int.get)
print(classification_report(test_y, predict_y, digits=6, target_names=topic_names))

                                                               precision    recall  f1-score   support

                             Bacterial Infections and Mycoses   0.871470  0.813636  0.841561      1100
                                               Virus Diseases   0.340741  0.500000  0.405286        92
                                           Parasitic Diseases   0.839050  0.943620  0.888268       337
                                                    Neoplasms   0.634188  0.672101  0.652595       552
                                     Musculoskeletal Diseases   0.181818  0.285714  0.222222        56
                                    Digestive System Diseases   0.841300  0.840497  0.840898      1047
                                      Stomatognathic Diseases   0.820862  0.914141  0.864994       396
                                   Respiratory Tract Diseases   0.620499  0.756757  0.681887       296
                                Otorhinolaryngologic Diseases   0.734982