# SVM Parameter Optimisation

## Load required imports

In [1]:
# Make common scripts visible: add the shared '../common/' directory to the
# module search path. The path is relative, so it is resolved against the
# kernel's current working directory rather than this notebook's location.
# NOTE(review): presumably '../common/' is where the `loader` and
# `lookup_tables` modules imported later in this cell live -- confirm.
import sys
sys.path.append('../common/')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from loader import load_preprocessed_data
from lookup_tables import topic_to_int

## Load training and test data

In [2]:
# Locations of the pre-processed train/test CSV files (relative to the
# kernel's working directory).
TRAIN_CSV = 'data/uvigomed_train.csv'
TEST_CSV = 'data/uvigomed_test.csv'

# Each call returns a pair that is unpacked into inputs (x) and targets (y).
# NOTE(review): presumably x holds the documents and y their topic labels --
# confirm against `loader.load_preprocessed_data`.
train_x, train_y = load_preprocessed_data(TRAIN_CSV)
test_x, test_y = load_preprocessed_data(TEST_CSV)

## Find the best parameters on the training set

In [3]:
# Fixed seed so the search is repeatable across kernel restarts: LinearSVC
# uses a pseudo-random generator while fitting, so without a seed two runs
# of this cell are not guaranteed to give identical scores.
RANDOM_SEED = 42

# Two-step model: token counts from CountVectorizer feed a linear SVM.
#   lowercase=False   -> text is not lower-cased by the vectorizer
#   binary=False      -> features are raw term counts, not 0/1 indicators
#   ngram_range=(1,1) -> unigrams only
# NOTE(review): lower-casing is presumably already handled by the
# pre-processing step that produced the input files -- confirm.
classifier = Pipeline([
    ('vect', CountVectorizer(lowercase = False,
                             binary = False,
                             ngram_range = (1,1)
                             )),
    ("linear_svc", LinearSVC(loss='hinge',
                             class_weight='balanced',
                             max_iter=10000,
                             random_state=RANDOM_SEED))])

# Candidate values for the SVM regularisation strength C. The
# '<step name>__<param>' key routes the value to the 'linear_svc' step.
parameters = {
    'linear_svc__C': [0.001, 0.01, 0.1, 1, 10]
}

# 3-fold cross-validated search over the grid above.
# `iid` is deliberately not passed: the argument was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# From 0.22 onwards fold scores are always averaged without weighting by
# fold size, which is what `iid=False` used to request.
grid_search = GridSearchCV(classifier, parameters, cv=3, verbose=10)
grid_search.fit(train_x, train_y)

# Print out the best parameters found
print('Best parameters found')
print(grid_search.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .... linear_svc__C=0.001, score=0.7021932830705963, total=   7.9s
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.3s remaining:    0.0s


[CV] .... linear_svc__C=0.001, score=0.7090734517522803, total=   7.5s
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   22.3s remaining:    0.0s


[CV] .... linear_svc__C=0.001, score=0.7098655323819978, total=   7.6s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   33.2s remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.7344756682659356, total=  10.0s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   46.5s remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.7349975996159386, total=  10.2s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   60.0s remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.7377195389681669, total=   9.6s
[CV] linear_svc__C=0.1 ...............................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.2min remaining:    0.0s


[CV] ...... linear_svc__C=0.1, score=0.6967786154900617, total=  15.8s
[CV] linear_svc__C=0.1 ...............................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.5min remaining:    0.0s


[CV] ...... linear_svc__C=0.1, score=0.7077018037171662, total=  13.4s
[CV] linear_svc__C=0.1 ...............................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.8min remaining:    0.0s


[CV] ...... linear_svc__C=0.1, score=0.7043770581778266, total=  12.9s
[CV] linear_svc__C=1 .................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.1min remaining:    0.0s


[CV] ........ linear_svc__C=1, score=0.6853324194653873, total=  12.5s
[CV] linear_svc__C=1 .................................................
[CV] ........ linear_svc__C=1, score=0.6927508401344215, total=  14.0s
[CV] linear_svc__C=1 .................................................
[CV] ........ linear_svc__C=1, score=0.6904500548847421, total=  14.7s
[CV] linear_svc__C=10 ................................................
[CV] ....... linear_svc__C=10, score=0.6844413982179575, total=  13.2s
[CV] linear_svc__C=10 ................................................
[CV] ....... linear_svc__C=10, score=0.6920650161168644, total=  14.3s
[CV] linear_svc__C=10 ................................................
[CV] ....... linear_svc__C=10, score=0.6875686059275521, total=  15.5s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.8min finished


Best parameters found
{'linear_svc__C': 0.01}


## Make predictions on the test set

In [4]:
# Predict test labels with the fitted grid search, then print a per-class
# precision / recall / F1 report to six decimal places.
# NOTE(review): `target_names` is matched to classes by position, so this
# assumes the key order of `topic_to_int` lines up with the sorted label
# values present in the data -- confirm against `lookup_tables`.
predict_y = grid_search.predict(test_x)
report = classification_report(
    test_y,
    predict_y,
    target_names=topic_to_int.keys(),
    digits=6,
)
print(report)

                                                               precision    recall  f1-score   support

                             Bacterial Infections and Mycoses   0.873031  0.808569  0.839565      1097
                                               Virus Diseases   0.364407  0.467391  0.409524        92
                                           Parasitic Diseases   0.849866  0.943452  0.894217       336
                                                    Neoplasms   0.615512  0.678182  0.645329       550
                                     Musculoskeletal Diseases   0.200000  0.250000  0.222222        56
                                    Digestive System Diseases   0.838586  0.841802  0.840191      1043
                                      Stomatognathic Diseases   0.796380  0.897959  0.844125       392
                                   Respiratory Tract Diseases   0.626087  0.729730  0.673947       296
                                Otorhinolaryngologic Diseases   0.765799