# SVM

In [14]:
import pandas as pd
import numpy as np
import pickle
import time

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, plot_confusion_matrix, confusion_matrix

from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE

from category_encoders import TargetEncoder

import warnings
warnings.filterwarnings('ignore')

In [15]:
X_train = pd.read_csv('Data/X_train.csv', index_col='building_id')
X_test = pd.read_csv('Data/X_test.csv', index_col='building_id')
y_train = pd.read_csv('Data/y_train.csv', index_col='building_id')
y_test = pd.read_csv('Data/y_test.csv', index_col='building_id')

In [16]:
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

In [17]:
#Create a dictionary and helper function to track model scores.
score_dict = {}
def print_scores():
    for key in score_dict.keys():
        print(f'{key}, f1_micro_score: {round(score_dict[key][0],4)}, Run time: {round(score_dict[key][1],0)}')

### Divide Columns

In [18]:
binary_cols = []
for col in X_train.columns:
    if col.startswith('has'):
        binary_cols.append(col)

cat_cols = list(X_train.select_dtypes(include='object').columns)

integer_cols = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage', 'count_families']

geo_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

all_cols = geo_cols + cat_cols + integer_cols + binary_cols

### Sample X_train, y_train for faster modeling

In [19]:
X_train_sample, X_train_leftover, y_train_sample, y_train_leftover = \
    train_test_split(X_train, y_train, test_size=.9, random_state=42)

### svm0:  Basline SVM with no feature engineering

In [21]:
ohe_pipe = imbPipeline([('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

transformer = ColumnTransformer([
    ('binary', 'passthrough', binary_cols),
    ('categorical', ohe_pipe, cat_cols),
    ('integer', 'passthrough', integer_cols),
    ('geo', 'passthrough', geo_cols)
])

In [34]:
svm0_pipe = imbPipeline([
    ('trans', transformer), 
    ('svm', SVC(random_state=42))
])

In [23]:
start = time.time()
f1_micro_svm0 = cross_val_score(svm0_pipe, X_train_sample, y_train_sample, scoring='f1_micro')
end = time.time()
print(f'Run time: {end-start}')
run_time_svm0 = (end-start)

Run time: 151.94065284729004


In [24]:
f1_micro_svm0 = f1_micro_svm0.mean()
f1_micro_svm0

0.5687899718598107

In [25]:
score_dict['svm0'] = [f1_micro_svm0, run_time_svm0]
print_scores()

svm0, f1_micro_score: 0.5688, Run time: 152.0


### svm1:  log transform and scale integer columns, remove unimportant features

In [26]:
binary_cols_dropped = binary_cols.copy()
for col in binary_cols_dropped:
    if col.startswith('has_secondary'):
        binary_cols_dropped.remove(col)
binary_cols_dropped.append('has_secondary_use')

cat_cols_dropped = cat_cols.copy()
cat_cols_dropped.remove('legal_ownership_status')
cat_cols_dropped.remove('plan_configuration')

In [27]:
def log_transform(x):
    return np.log(x+1)

In [28]:
function_transformer = FunctionTransformer(log_transform)

ohe_pipe = imbPipeline([('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])
integer_pipe = imbPipeline([
    ('function', function_transformer),
    ('ss', StandardScaler())
])

transformer = ColumnTransformer([
    ('binary', 'passthrough', binary_cols_dropped),
    ('categorical', ohe_pipe, cat_cols_dropped),
    ('geo', 'passthrough', geo_cols),
    ('integer', integer_pipe, integer_cols),
])

In [30]:
svm1_pipe = imbPipeline([
    ('transformer', transformer),
    ('svm', SVC(random_state=42))
])

In [31]:
start = time.time()
f1_micro_svm1 = cross_val_score(svm1_pipe, X_train_sample, y_train_sample, scoring='f1_micro')
end = time.time()
print(f'Run time: {end-start}')
run_time_svm1 = end-start

Run time: 113.17395806312561


In [32]:
f1_micro_svm1 = f1_micro_svm1.mean()
f1_micro_svm1

0.5687899718598107

In [33]:
score_dict['svm1'] = [f1_micro_svm1, run_time_svm1]
print_scores()

svm0, f1_micro_score: 0.5688, Run time: 152.0
svm1, f1_micro_score: 0.5688, Run time: 113.0


### svm2:  Add Target Encoding

In [35]:
function_transformer = FunctionTransformer(log_transform)

ohe_pipe = imbPipeline([('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])
integer_pipe = imbPipeline([
    ('function', function_transformer),
    ('ss', StandardScaler())
])
target_pipe = imbPipeline([('target', TargetEncoder(cols=geo_cols))])

transformer = ColumnTransformer([
    ('binary', 'passthrough', binary_cols_dropped),
    ('categorical', ohe_pipe, cat_cols_dropped),
    ('geo', target_pipe, geo_cols),
    ('integer', integer_pipe, integer_cols),
])

In [36]:
svm2_pipe = imbPipeline([
    ('transformer', transformer),
    ('svm', SVC(random_state=42))
])

In [37]:
start = time.time()
f1_micro_svm2 = cross_val_score(svm2_pipe, X_train_sample, y_train_sample, scoring='f1_micro')
end = time.time()
print(f'Run time: {end-start}')
run_time_svm2 = end-start

Run time: 79.62519788742065


In [38]:
f1_micro_svm2 = f1_micro_svm2.mean()
f1_micro_svm2

0.6871322588897416

In [39]:
score_dict['svm2'] = [f1_micro_svm2, run_time_svm2]
print_scores()

svm0, f1_micro_score: 0.5688, Run time: 152.0
svm1, f1_micro_score: 0.5688, Run time: 113.0
svm2, f1_micro_score: 0.6871, Run time: 80.0


### GridsearchCV

In [None]:
start = time.time()
svm_param_grid = {'svm__C': [.001, 1, 100, 1000],
                  'svm__gamma': [0.001, 1, 100]}

svm_gs = GridSearchCV(estimator=svm2_pipe, param_grid=svm_param_grid, 
                          scoring='f1_micro', cv=3, n_jobs=-1, verbose=2)
svm_gs.fit(X_train_sample, y_train_sample)
end = time.time()
print(f'Run time: {end-start}')