# SVM

In [11]:
%load_ext autoreload
%autoreload 2
from preprocessing import *
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from joblib import dump, load

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
SCORINGS = ["f1", "roc_auc", "accuracy", "recall", "precision"]
METRIC = "roc_auc"

def tabla(grid, params):
    """Return a ranked summary table of a fitted multi-metric grid search.

    Parameters
    ----------
    grid : object exposing ``cv_results_``
        A fitted search whose results contain ``rank_test_<METRIC>`` and one
        ``mean_test_<name>`` entry per name in ``SCORINGS``.
    params : list of str
        Names of the searched 'svc' step parameters (without the ``svc__``
        prefix) to show as columns, e.g. ``["C", "gamma"]``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate, best rank on ``METRIC`` first, with a fresh
        0..n-1 index and only the requested parameter columns followed by the
        mean test score of every metric in ``SCORINGS``.
    """
    ranked = (
        pd.DataFrame(grid.cv_results_)
        .sort_values("rank_test_" + METRIC)
        .reset_index()
    )
    param_cols = [f"param_svc__{name}" for name in params]
    score_cols = [f"mean_test_{name}" for name in SCORINGS]
    return ranked[param_cols + score_cols]

## SVM Lineal/Polinómico (Grado 1, 2 y 3)

In [2]:
# Load the feature and target tables (both indexed by "id") and apply the
# preprocessing helpers (presumably provided by the star import from `preprocessing`).
# NOTE(review): initialize_dataset() is called both before and after the reads;
# whether the second call is needed cannot be told from this notebook — confirm.
initialize_dataset()
df_features = pd.read_csv("datasets/df_features.csv", low_memory = False, index_col = "id")
df_target = pd.read_csv("datasets/df_target.csv", low_memory=False, index_col = "id")

initialize_dataset()
# NOTE(review): the return values of these two helpers are discarded, so they
# presumably modify the frames in place — confirm against `preprocessing`.
common(df_features, df_target)
viento_trigonometrico(df_features)
# "barrio" has 49 distinct values. To avoid one-hot encoding it into 48 new columns, use the hashing trick
df_features = hashing_trick(df_features, 24, "barrio")
# Base pipeline that the SVC step is added to in the next cell.
pipe = standarizer()
pipe = simple_imputer(pipe)

In [3]:
# Build the polynomial-kernel pipeline as a new object instead of appending to
# `pipe` in place: re-running this cell would otherwise add a second 'svc'
# step, and Pipeline rejects duplicate step names.
poly_pipe = Pipeline(steps = pipe.steps + [('svc', SVC(kernel = 'poly', random_state = 123, max_iter=100000))])

# Grid search with 3-fold stratified cross-validation over C, coef0 and degree
# (3 * 2 * 3 = 18 candidates). gamma is deliberately left at its default here
# because training is extremely slow; the gamma search is done only for the
# RBF kernel in its own section.
grid = GridSearchCV(poly_pipe, param_grid = {"svc__C": [0.01, 1, 1000], "svc__coef0": [1, 1000], "svc__degree": [1, 2, 3]},
                    verbose = 1, n_jobs = -1, cv = StratifiedKFold(3), scoring = SCORINGS, refit = METRIC)

grid.fit(df_features, df_target.values.ravel())

# Last expression of the cell, so the best mean cross-validated score for
# METRIC (the refit metric) is what gets displayed.
grid.best_score_

Fitting 3 folds for each of 18 candidates, totalling 54 fits




0.8605307914369759

In [4]:
# Display the pipeline refit with the best parameter combination found by the search.
grid.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()), ('imputer', SimpleImputer()),
                ('svc',
                 SVC(C=1, coef0=1, kernel='poly', max_iter=100000,
                     random_state=123))])

In [5]:
# Ranked summary of the polynomial search: the C, degree and coef0 of each
# candidate next to its mean test score for every metric in SCORINGS.
tabla(grid, ["C", "degree", "coef0"])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__C,param_svc__coef0,param_svc__degree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,795.337845,14.53512,171.472853,0.921413,0.01,1,1,"{'svc__C': 0.01, 'svc__coef0': 1, 'svc__degree...",0.838859,0.835928,0.836626,0.837138,0.00125,8
1,696.578876,143.916998,159.778714,2.989079,0.01,1,2,"{'svc__C': 0.01, 'svc__coef0': 1, 'svc__degree...",0.848386,0.845689,0.846242,0.846772,0.001163,5
2,476.680345,2.881491,153.186601,4.011418,0.01,1,3,"{'svc__C': 0.01, 'svc__coef0': 1, 'svc__degree...",0.853897,0.850643,0.852281,0.852274,0.001328,3
3,711.935043,123.215224,172.65603,7.846485,0.01,1000,1,"{'svc__C': 0.01, 'svc__coef0': 1000, 'svc__deg...",0.838859,0.835928,0.836626,0.837138,0.00125,8
4,1386.014717,2.813646,157.530237,3.291116,0.01,1000,2,"{'svc__C': 0.01, 'svc__coef0': 1000, 'svc__deg...",0.853458,0.849324,0.850346,0.851043,0.001758,4
5,2316.94877,102.230761,161.319584,11.132196,0.01,1000,3,"{'svc__C': 0.01, 'svc__coef0': 1000, 'svc__deg...",0.623926,0.417553,0.538608,0.526696,0.084671,17
6,531.380732,15.701188,160.803339,5.732261,1.0,1,1,"{'svc__C': 1, 'svc__coef0': 1, 'svc__degree': 1}",0.847008,0.843256,0.842167,0.844144,0.002074,6
7,599.803711,14.784568,140.492652,3.529678,1.0,1,2,"{'svc__C': 1, 'svc__coef0': 1, 'svc__degree': 2}",0.859086,0.856096,0.858437,0.857873,0.001284,2
8,1168.282834,54.802312,134.863666,2.364517,1.0,1,3,"{'svc__C': 1, 'svc__coef0': 1, 'svc__degree': 3}",0.861431,0.859496,0.860665,0.860531,0.000796,1
9,517.97352,3.409891,152.475457,2.461987,1.0,1000,1,"{'svc__C': 1, 'svc__coef0': 1000, 'svc__degree...",0.847038,0.843286,0.842108,0.844144,0.002102,7


Podemos ver que hay varios modelos que no terminaron de converger tras 100000 iteraciones. Estos son en su mayoría los que tienen C=1000 (se puede notar en la tabla viendo que su score promedio en los folds es pésimo). También dieron bastante mal los polinómicos de grado 2 y 3 que tenían el término independiente (coef0) en 1000 y C=1.

El mejor en este caso resultó ser el del kernel de grado 3, C=1 y coef0=1. Dio un accuracy de alrededor de 86,1% en promedio en sus folds. En segundo lugar está el de grado 2 con los mismos parámetros, con accuracy de alrededor de 85,8%. Es una diferencia relativamente considerable, así que vamos a quedarnos con el de grado 3 aunque el de grado 2 sea más simple. Confiamos en que no está overfitteado al haber hecho cross validation, y está menos sesgado que el de grado 2.

Los siguientes puestos ya bajan el accuracy considerablemente, siendo el tercero de 85,2%.

Nos guardamos el modelo en un archivo para evitar reentrenarlo

In [8]:
# Persist the refit best pipeline so it can be reloaded later instead of retrained.
# NOTE(review): assumes the SVM/ directory already exists — confirm.
dump(grid.best_estimator_, 'SVM/polinomico.joblib') 

['SVM/polinomico.joblib']

## SVM Radial (RBF)

In [49]:
# Reload the feature and target tables (both indexed by "id") and rebuild the
# base pipeline from scratch, so the RBF search starts from a pipeline without an SVC step.
# NOTE(review): initialize_dataset() is called both before and after the reads;
# whether the second call is needed cannot be told from this notebook — confirm.
initialize_dataset()
df_features = pd.read_csv("datasets/df_features.csv", low_memory = False, index_col = "id")
df_target = pd.read_csv("datasets/df_target.csv", low_memory=False, index_col = "id")

initialize_dataset()
# NOTE(review): the return values of these two helpers are discarded, so they
# presumably modify the frames in place — confirm against `preprocessing`.
common(df_features, df_target)
viento_trigonometrico(df_features)
# "barrio" has 49 distinct values. To avoid one-hot encoding it into 48 new columns, use the hashing trick
df_features = hashing_trick(df_features, 24, "barrio")
# Base pipeline that the SVC step is added to in the next cell.
pipe = standarizer()
pipe = simple_imputer(pipe)

In [50]:
# Build the RBF pipeline as a new object instead of appending to `pipe` in
# place: re-running this cell would otherwise add a second 'svc' step, and
# Pipeline rejects duplicate step names.
rbf_pipe = Pipeline(steps = pipe.steps + [('svc', SVC(kernel = 'rbf', random_state = 123, max_iter=100000))])

# Grid search with 3-fold stratified cross-validation over C and gamma
# (3 * 3 = 9 candidates), scored on every metric in SCORINGS and refit on METRIC.
grid2 = GridSearchCV(rbf_pipe, param_grid = {"svc__C": [0.01, 1, 1000], "svc__gamma": [0.00001, 0.001, 1]},
                     verbose = 1, n_jobs = -1, cv = StratifiedKFold(3), scoring = SCORINGS, refit = METRIC)

grid2.fit(df_features, df_target.values.ravel())

# Best mean cross-validated score for METRIC (displayed as the cell output).
grid2.best_score_

Fitting 3 folds for each of 9 candidates, totalling 27 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.8715856078362467

In [51]:
# Ranked summary of the RBF search: the C and gamma of each candidate next to
# its mean test score for every metric in SCORINGS.
tabla(grid2, ["C", "gamma"])

Unnamed: 0,param_svc__C,param_svc__gamma,mean_test_f1,mean_test_roc_auc,mean_test_accuracy,mean_test_recall,mean_test_precision
0,1.0,0.001,0.573597,0.871586,0.847671,0.45762,0.768315
1,1000.0,0.001,0.631816,0.865608,0.857404,0.547048,0.748593
2,1000.0,1e-05,0.572647,0.86523,0.844994,0.463861,0.748096
3,0.01,0.001,0.013073,0.85816,0.777297,0.00659,0.842424
4,1.0,1e-05,0.01873,0.857486,0.777864,0.009471,0.852849
5,0.01,1e-05,0.0,0.856651,0.776114,0.0,0.0
6,1.0,1.0,0.008413,0.786613,0.776554,0.004234,0.666162
7,0.01,1.0,0.0,0.785709,0.776114,0.0,0.0
8,1000.0,1.0,0.026044,0.784073,0.776359,0.013355,0.521601


En este caso la mayoría de las combinaciones de parámetros no logró converger antes de las iteraciones dadas. Casi todos los modelos dieron 77% de accuracy, lo cual es muy malo, ya que probablemente hayan aprendido a predecir siempre "no" al estar desbalanceada la variable target. El hecho de que haya algunos con recall y precision en 0 da a entender lo mismo (0 true positives, entonces 0 de recall y precision).

Los primeros 3 sí dieron valores más aceptables, siendo el mejor el que usa C=1 y Gamma=0,001 (0.872 de área bajo la curva ROC y accuracy del 84,8%).

Aun así, dio peor que el kernel polinómico, y el recall es bastante malo para el primer modelo, lo cual quiere decir que tiene muchos falsos negativos. El segundo modelo es levemente mejor en ese aspecto, aunque tiene peor área bajo la curva ROC. Como el dataset está desbalanceado, tiende más a la negativa, por lo que es esperable que tenga mejor precisión que recall.