# SVM Parameter Optimisation

## Load required imports

In [2]:
# Make common scripts visible
import sys
sys.path.append('../common/')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from loader import load_preprocessed_data
from lookup_tables import topic_code_to_topic_dict

## Load training and test data

In [3]:
train_x, train_y = load_preprocessed_data('data/rcv1_lemmatized.csv')
test_x, test_y = load_preprocessed_data('data/rcv1_lemmatized.csv')

## Find the best parameters on the training set

In [4]:
ohsumed_classifier = Pipeline([
    ('vect', CountVectorizer(lowercase = False,
                             binary = False,
                             ngram_range = (1,1)
                             )),
    ("linear_svc", LinearSVC(loss='hinge', class_weight='balanced', max_iter=10000))])

parameters = {
    'linear_svc__C': [0.001, 0.005, 0.01, 0.05, 0.1, 1, 10]
}

grid_search = GridSearchCV(ohsumed_classifier, parameters, cv=3, iid=False, verbose=10)
grid_search.fit(train_x, train_y)

# Print out the best parameters found
print('Best parameters found')
print(grid_search.best_params_)

Fitting 3 folds for each of 7 candidates, totalling 21 fits
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .... linear_svc__C=0.001, score=0.9724567153308836, total=  15.6s
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.2s remaining:    0.0s


[CV] .... linear_svc__C=0.001, score=0.9734454671876035, total=  15.7s
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   48.5s remaining:    0.0s


[CV] .... linear_svc__C=0.001, score=0.9731796960365551, total=  16.7s
[CV] linear_svc__C=0.005 .............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min remaining:    0.0s


[CV] .... linear_svc__C=0.005, score=0.9745092197172841, total=  17.4s
[CV] linear_svc__C=0.005 .............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.7min remaining:    0.0s


[CV] ..... linear_svc__C=0.005, score=0.975167207469704, total=  17.6s
[CV] linear_svc__C=0.005 .............................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min remaining:    0.0s


[CV] ..... linear_svc__C=0.005, score=0.975563723055528, total=  20.2s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.6min remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.9750720031780713, total=  20.6s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.1min remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.9750347659095424, total=  19.0s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.5min remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.9756299460282772, total=  20.7s
[CV] linear_svc__C=0.05 ..............................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  4.0min remaining:    0.0s


[CV] ..... linear_svc__C=0.05, score=0.9732512331578773, total=  29.3s
[CV] linear_svc__C=0.05 ..............................................
[CV] ..... linear_svc__C=0.05, score=0.9728825905569167, total=  25.0s
[CV] linear_svc__C=0.05 ..............................................
[CV] ..... linear_svc__C=0.05, score=0.9729810271183074, total=  28.0s
[CV] linear_svc__C=0.1 ...............................................
[CV] ...... linear_svc__C=0.1, score=0.9723905055119674, total=  24.8s
[CV] linear_svc__C=0.1 ...............................................
[CV] ...... linear_svc__C=0.1, score=0.9707304152042912, total=  27.6s
[CV] linear_svc__C=0.1 ...............................................
[CV] ...... linear_svc__C=0.1, score=0.9721201284725671, total=  23.9s
[CV] linear_svc__C=1 .................................................
[CV] ........ linear_svc__C=1, score=0.9711656238620188, total=  39.8s
[CV] linear_svc__C=1 .................................................
[CV] .



[CV] ........ linear_svc__C=1, score=0.9705307771265852, total= 1.1min
[CV] linear_svc__C=10 ................................................
[CV] ....... linear_svc__C=10, score=0.9696096931174893, total= 1.5min
[CV] linear_svc__C=10 ................................................
[CV] ........ linear_svc__C=10, score=0.967783590490696, total= 1.5min
[CV] linear_svc__C=10 ................................................
[CV] ....... linear_svc__C=10, score=0.9689414257806033, total= 1.7min


[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed: 15.6min finished


Best parameters found
{'linear_svc__C': 0.01}


# Make predictions on the test set

In [6]:
predict_y = grid_search.predict(test_x)
print(classification_report(test_y, predict_y, digits=6, target_names=topic_code_to_topic_dict.values()))

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.992154  0.986969  0.989555     29852
  ECONOMIC PERFORMANCE   0.990389  0.995532  0.992954      8281
             ELECTIONS   0.985420  0.989832  0.987621     10720
                HEALTH   0.976525  0.993874  0.985123      4897
              RELIGION   0.981231  0.990745  0.985965      2269
                SPORTS   0.996295  0.995028  0.995661     34591

             micro avg   0.991634  0.991634  0.991634     90610
             macro avg   0.987002  0.991997  0.989480     90610
          weighted avg   0.991659  0.991634  0.991638     90610

