# SVM Parameter Optimisation

## Import required modules

In [2]:
# Make common scripts visible
# The entry is a relative path, so it is resolved against the current working
# directory at the moment the imports below run.
# NOTE(review): `loader` and `lookup_tables` (imported further down) are
# presumably the modules that live in ../common/ -- confirm.
import sys
sys.path.append('../common/')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from loader import load_preprocessed_data
from lookup_tables import topic_code_to_topic_dict

## Load training and test data

In [3]:
# Read the training and evaluation splits via the shared loader.
# NOTE(review): both paths point at the same CSV, so unless
# load_preprocessed_data splits the file internally, the scores reported
# further down are measured on the data the model was fitted on. Point
# `test_path` at the held-out file if one exists -- confirm.
train_path = 'data/rcv1_baseline.csv'
test_path = 'data/rcv1_baseline.csv'

train_x, train_y = load_preprocessed_data(train_path)
test_x, test_y = load_preprocessed_data(test_path)

## Find the best parameters on the training set

In [4]:
# Bag-of-words token counts feeding a linear SVM. Only the regularisation
# strength C of the SVM is tuned below.
classifier = Pipeline([
    ('vect', CountVectorizer(lowercase=False,
                             binary=False,
                             ngram_range=(1, 1))),
    # Hinge loss is optimised in the dual, which shuffles the data with a
    # pseudo-random generator; fixing random_state makes the fit repeatable.
    ('linear_svc', LinearSVC(loss='hinge',
                             class_weight='balanced',
                             max_iter=10000,
                             random_state=42)),
])

# Candidate values for C, one decade apart.
parameters = {
    'linear_svc__C': [0.001, 0.01, 0.1, 1, 10]
}

# `iid` is intentionally not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, so `iid=False` raises a TypeError on current releases.
# The unweighted mean over folds that it selected is the default from 0.22 on.
# No `scoring` is given, so candidates are ranked by the pipeline's default
# score (accuracy for a classifier).
grid_search = GridSearchCV(classifier, parameters, cv=3, verbose=10)
grid_search.fit(train_x, train_y)

# Print out the best parameters found
print('Best parameters found')
print(grid_search.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .... linear_svc__C=0.001, score=0.9724567153308836, total=  16.5s
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.8s remaining:    0.0s


[CV] .... linear_svc__C=0.001, score=0.9733792464075227, total=  17.7s
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   59.6s remaining:    0.0s


[CV] .... linear_svc__C=0.001, score=0.9731796960365551, total=  32.2s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.8min remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.9751051080875294, total=  18.6s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.3min remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.9750347659095424, total=  16.3s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.7min remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.9756630575146519, total=  21.4s
[CV] linear_svc__C=0.1 ...............................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.2min remaining:    0.0s


[CV] ...... linear_svc__C=0.1, score=0.9723574006025093, total=  24.8s
[CV] linear_svc__C=0.1 ...............................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.7min remaining:    0.0s


[CV] ...... linear_svc__C=0.1, score=0.9707304152042912, total=  23.5s
[CV] linear_svc__C=0.1 ...............................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.2min remaining:    0.0s


[CV] ...... linear_svc__C=0.1, score=0.9721532399589418, total=  22.7s
[CV] linear_svc__C=1 .................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  4.7min remaining:    0.0s


[CV] ........ linear_svc__C=1, score=0.9712318336809349, total=  35.1s
[CV] linear_svc__C=1 .................................................
[CV] ........ linear_svc__C=1, score=0.9697371035030793, total=  32.8s
[CV] linear_svc__C=1 .................................................




[CV] ........ linear_svc__C=1, score=0.9705638886129598, total=  54.6s
[CV] linear_svc__C=10 ................................................
[CV] ....... linear_svc__C=10, score=0.9695765882080313, total= 1.4min
[CV] linear_svc__C=10 ................................................
[CV] ....... linear_svc__C=10, score=0.9678498112707767, total= 1.3min
[CV] linear_svc__C=10 ................................................
[CV] ....... linear_svc__C=10, score=0.9690738717261018, total= 1.4min


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 11.6min finished


Best parameters found
{'linear_svc__C': 0.01}


## Make predictions on the test set

In [5]:
# Predict with the grid search object, which delegates to the best pipeline
# found (refit on all of train_x by default).
predict_y = grid_search.predict(test_x)

# NOTE(review): classification_report pairs target_names with the sorted
# labels by position, so this relies on the lookup table's value order lining
# up with that label order -- confirm.
topic_names = list(topic_code_to_topic_dict.values())
report = classification_report(test_y, predict_y, digits=6, target_names=topic_names)
print(report)

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.992087  0.987003  0.989538     29852
  ECONOMIC PERFORMANCE   0.990389  0.995532  0.992954      8281
             ELECTIONS   0.985511  0.989832  0.987667     10720
                HEALTH   0.976525  0.993874  0.985123      4897
              RELIGION   0.981231  0.990745  0.985965      2269
                SPORTS   0.996324  0.994999  0.995661     34591

             micro avg   0.991634  0.991634  0.991634     90610
             macro avg   0.987011  0.991997  0.989485     90610
          weighted avg   0.991658  0.991634  0.991638     90610

