# SVM Parameter Optimisation

## Load required imports

In [1]:
# Make common scripts visible
import sys
sys.path.append('../common/')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from loader import load_preprocessed_data
from lookup_tables import topic_code_to_topic_dict

## Load training and test data

In [2]:
# Read the preprocessed corpus: `x` holds the inputs and `y` the matching labels
# (presumably document text and topic codes -- confirm against loader.py).
x, y = load_preprocessed_data('data/rcv1_baseline.csv')

# Hold out the last 20% of the examples, in file order (no shuffling), as the
# test set; everything before the split point is used for training.
total_examples = len(y)
split_point = int(total_examples * 0.8)
train_x, test_x = x[:split_point], x[split_point:]
train_y, test_y = y[:split_point], y[split_point:]

## Find the best parameters on the training set

In [3]:
# Two-step pipeline: a bag-of-words vectoriser (unigram counts, original case
# kept, raw counts rather than binary presence) feeding a linear SVM trained
# with hinge loss and class weights balanced by class frequency.
# NOTE(review): LinearSVC is given no random_state, so repeated runs may not
# reproduce the scores below digit-for-digit -- consider fixing a seed.
classifier = Pipeline([
    ('vect', CountVectorizer(lowercase=False,
                             binary=False,
                             ngram_range=(1, 1))),
    ('linear_svc', LinearSVC(loss='hinge',
                             class_weight='balanced',
                             max_iter=10000)),
])

# Only the SVM regularisation strength C is tuned; the "<step>__<param>" key
# routes each candidate value to the 'linear_svc' step of the pipeline.
parameters = {
    'linear_svc__C': [0.001, 0.01, 0.1, 1, 10]
}

# 3-fold cross-validated search over the grid above.
# The former `iid=False` argument is intentionally not passed: `iid` was
# deprecated in scikit-learn 0.22 and removed in 0.24, where supplying it
# raises a TypeError. From 0.22 onwards the default fold averaging is the same
# as the old `iid=False`, so results are unchanged on those releases.
grid_search = GridSearchCV(classifier, parameters, cv=3, verbose=10)
grid_search.fit(train_x, train_y)

# Print out the best parameters found
print('Best parameters found')
print(grid_search.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .... linear_svc__C=0.001, score=0.9716934282403575, total=  12.6s
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.3s remaining:    0.0s


[CV] ..... linear_svc__C=0.001, score=0.971319786450358, total=  12.4s
[CV] linear_svc__C=0.001 .............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   38.7s remaining:    0.0s


[CV] .... linear_svc__C=0.001, score=0.9725590828194197, total=  11.9s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   57.2s remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.9745489157424267, total=  12.9s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.3min remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.9726855109051028, total=  15.7s
[CV] linear_svc__C=0.01 ..............................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.7min remaining:    0.0s


[CV] ..... linear_svc__C=0.01, score=0.9742560324489881, total=  18.2s
[CV] linear_svc__C=0.1 ...............................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.1min remaining:    0.0s


[CV] ...... linear_svc__C=0.1, score=0.9719417315014071, total=  17.5s
[CV] linear_svc__C=0.1 ...............................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  2.5min remaining:    0.0s


[CV] ...... linear_svc__C=0.1, score=0.9692918925630095, total=  16.0s
[CV] linear_svc__C=0.1 ...............................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  2.8min remaining:    0.0s


[CV] ....... linear_svc__C=0.1, score=0.971069078266628, total=  21.5s
[CV] linear_svc__C=1 .................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  3.3min remaining:    0.0s


[CV] ........ linear_svc__C=1, score=0.9709071345803675, total=  27.6s
[CV] linear_svc__C=1 .................................................
[CV] ........ linear_svc__C=1, score=0.9682986384141042, total=  22.6s
[CV] linear_svc__C=1 .................................................
[CV] ........ linear_svc__C=1, score=0.9699515748520343, total= 3.6min
[CV] linear_svc__C=10 ................................................




[CV] ....... linear_svc__C=10, score=0.9697070021519616, total= 1.1min
[CV] linear_svc__C=10 ................................................
[CV] ....... linear_svc__C=10, score=0.9670156851384348, total=  52.9s
[CV] linear_svc__C=10 ................................................
[CV] ....... linear_svc__C=10, score=0.9691651835602831, total= 1.2min


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 11.6min finished


Best parameters found
{'linear_svc__C': 0.01}


## Make predictions on the test set

In [5]:
# Label the held-out examples with the fitted grid search object.
predict_y = grid_search.predict(test_x)

# Per-topic precision / recall / F1 to six decimal places.
# NOTE(review): the display names come from the lookup table's value order,
# which is assumed to line up with the sorted label order -- confirm.
report = classification_report(
    test_y,
    predict_y,
    digits=6,
    target_names=topic_code_to_topic_dict.values(),
)
print(report)

                        precision    recall  f1-score   support

CRIME, LAW ENFORCEMENT   0.973603  0.973603  0.973603      6137
  ECONOMIC PERFORMANCE   0.971223  0.972389  0.971806      1666
             ELECTIONS   0.956198  0.962477  0.959327      2132
                HEALTH   0.948012  0.950920  0.949464       978
              RELIGION   0.927885  0.908235  0.917955       425
                SPORTS   0.992028  0.990566  0.991297      6784

             micro avg   0.975775  0.975775  0.975775     18122
             macro avg   0.961491  0.959698  0.960575     18122
          weighted avg   0.975781  0.975775  0.975774     18122

