## SVM - baseline classifier

Train SVM on preprocessed data to classify hand-drawn pictures

In [0]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 25 15:31:41 2018

@author: dmitriitiron
"""

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import LinearSVC

# upload the pre-processed data
xTrain = np.load('data/xTrain.npy')
xTest = np.load('data/xTest.npy')
yTrain = np.load('data/yTrain.npy')

In [2]:

linearSVM_clf = LinearSVC()
# set up the parameters to be tested
Cs = [0.5,0.75,1,2,5,10,20]
tols = [1e-5,1e-4,1e-3,1e-2,1e-1,1]
max_iters = [100,500,1000]
params_grid = {'C':Cs,'tol':tols,'max_iter':max_iters} 

# convert to proper vectorformat
yTrain = yTrain.ravel()

# set up the training env
clf_svm = RandomizedSearchCV(linearSVM_clf,params_grid,cv=5,scoring = 'accuracy', verbose=5, n_iter=50, n_jobs = 10)
   

clf_svm.fit(xTrain, yTrain)

# compute the  preditions
y_test_pred_svm = clf_svm.predict(xTest)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] tol=1e-05, max_iter=100, C=5 ....................................
[CV] tol=1e-05, max_iter=100, C=5 ....................................
[CV] tol=1e-05, max_iter=100, C=5 ....................................
[CV] tol=1e-05, max_iter=100, C=5 ....................................
[CV] tol=1e-05, max_iter=100, C=5 ....................................
[CV] tol=1e-05, max_iter=500, C=10 ...................................
[CV] tol=1e-05, max_iter=500, C=10 ...................................
[CV] tol=1e-05, max_iter=500, C=10 ...................................
[CV] tol=1e-05, max_iter=500, C=10 ...................................
[CV] tol=1e-05, max_iter=500, C=10 ...................................
[CV]  tol=1e-05, max_iter=100, C=5, score=0.2945851962245405, total= 1.1min
[CV] tol=1e-05, max_iter=500, C=0.75 .................................
[CV]  tol=1e-05, max_iter=100, C=5, score=0.3238664673642252, total= 1.1min
[CV] 

[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed: 16.2min


[CV]  tol=1, max_iter=500, C=0.75, score=0.3224043715846995, total= 2.1min
[CV] tol=0.001, max_iter=500, C=5 ....................................
[CV]  tol=1, max_iter=500, C=0.75, score=0.32112393376818865, total= 2.1min
[CV] tol=0.001, max_iter=500, C=5 ....................................
[CV]  tol=1, max_iter=500, C=0.75, score=0.32326283987915405, total= 2.1min
[CV] tol=0.001, max_iter=500, C=5 ....................................
[CV]  tol=0.01, max_iter=1000, C=1, score=0.3213393303348326, total= 4.7min
[CV] tol=0.001, max_iter=100, C=2 ....................................
[CV]  tol=0.01, max_iter=1000, C=1, score=0.336322869955157, total= 4.8min
[CV] tol=0.001, max_iter=100, C=2 ....................................
[CV]  tol=0.01, max_iter=1000, C=1, score=0.31296572280178836, total= 4.9min
[CV] tol=0.001, max_iter=100, C=2 ....................................
[CV]  tol=0.01, max_iter=1000, C=1, score=0.3156046161565479, total= 4.8min
[CV] tol=0.001, max_iter=100, C=2 .........

[Parallel(n_jobs=10)]: Done 142 tasks      | elapsed: 36.9min


[CV] tol=0.001, max_iter=100, C=0.5 ..................................
[CV]  tol=0.001, max_iter=500, C=20, score=0.27136431784107945, total= 3.1min
[CV] tol=0.001, max_iter=100, C=0.5 ..................................
[CV]  tol=0.001, max_iter=500, C=20, score=0.2779729051680883, total= 3.1min
[CV] tol=0.001, max_iter=100, C=0.5 ..................................
[CV]  tol=0.001, max_iter=500, C=20, score=0.2764350453172205, total= 3.1min
[CV] tol=0.001, max_iter=100, C=0.5 ..................................
[CV]  tol=0.001, max_iter=100, C=0.5, score=0.3537618335824614, total= 1.1min
[CV] tol=1e-05, max_iter=1000, C=5 ...................................
[CV]  tol=0.001, max_iter=100, C=0.5, score=0.34078489816194735, total= 1.1min
[CV] tol=1e-05, max_iter=1000, C=5 ...................................
[CV]  tol=0.001, max_iter=100, C=0.5, score=0.34632683658170915, total= 1.1min
[CV] tol=1e-05, max_iter=1000, C=5 ...................................
[CV]  tol=0.001, max_iter=100, C=0.

[Parallel(n_jobs=10)]: Done 250 out of 250 | elapsed: 66.7min finished


In [5]:
# reporting 

import pandas as pd
print(str(clf_svm.best_estimator_))
print(str(clf_svm.best_score_))
print(str(clf_svm.cv_results_))
results = pd.DataFrame.from_dict(clf_svm.cv_results_)
results.to_csv('SVM_full_results.csv')


LinearSVC(C=0.5, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)
0.3432
{'mean_fit_time': array([ 68.85441036, 181.47541647, 174.29303937, 283.01408358,
       177.94847579,  73.52338061,  68.24266481, 175.63669319,
       175.83190336, 175.97152023, 288.62689734, 123.45310163,
       181.75325618,  71.09284263, 181.75175152,  72.55642204,
       175.95001111,  68.40260205, 292.00880179,  71.49881706,
       281.38040566, 175.87063847,  74.10264912,  71.47148752,
       198.17957454, 178.69788074, 187.43344665,  71.87526298,
        69.37543139, 282.20185657,  66.7885715 , 279.97218385,
       173.63937244, 172.52122459, 280.49846268, 230.20341039,
        73.59035454,  70.50704727,  73.61412177, 304.23734045,
       173.20315824,  69.57934217, 172.97287183, 296.11231785,
        72.70428243, 279.35541348, 288.2058197 ,  67.1172338 ,
      



In [6]:
categories=["sink","pear","moustache","nose","skateboard","penguin","peanut","skull","panda","paintbrush","nail","apple","rifle","mug","sailboat","pineapple","spoon","rabbit","shovel","rollerskates","screwdriver","scorpion","rhinoceros","pool","octagon","pillow","parrot","squiggle","mouth","empty","pencil"]
categories_dict = dict(enumerate(categories))
print(categories_dict)

{0: 'sink', 1: 'pear', 2: 'moustache', 3: 'nose', 4: 'skateboard', 5: 'penguin', 6: 'peanut', 7: 'skull', 8: 'panda', 9: 'paintbrush', 10: 'nail', 11: 'apple', 12: 'rifle', 13: 'mug', 14: 'sailboat', 15: 'pineapple', 16: 'spoon', 17: 'rabbit', 18: 'shovel', 19: 'rollerskates', 20: 'screwdriver', 21: 'scorpion', 22: 'rhinoceros', 23: 'pool', 24: 'octagon', 25: 'pillow', 26: 'parrot', 27: 'squiggle', 28: 'mouth', 29: 'empty', 30: 'pencil'}


In [0]:
# export the predictions
import csv
with open("SVM_predictions.csv","w") as f:
   writer = csv.writer(f)
   id = 0
   for line in y_test_pred_svm:
      result = [id,categories_dict[line]]
      writer.writerow(result)
      id += 1

We got 46% accuracy on the real test set provided by Kaggle, which is very good for a baseline classifier. The main reason for this is that the pictures are well preprocessed, which significantly improves the performace of our model.