# Performing SVMs in Python

In [139]:
import os
import numpy as np
import pandas as pd

In [140]:
# Build OS-independent paths to the train/test CSVs in the relative `data` directory.
training_path = os.path.join('data', 'wildfires_train.csv')
testing_path = os.path.join('data', 'wildfires_test.csv')

# Load each split into its own DataFrame.
training_data, testing_data = map(pd.read_csv, (training_path, testing_path))

In [141]:
# Display the training frame to eyeball the feature columns and the 'wlf' target.
training_data

Unnamed: 0,x,y,temp,humidity,windspd,winddir,rain,days,vulnerable,other,ranger,pre1950,heli,resources,traffic,burned,wlf
0,7.834467,8.306801,99.506964,65.940704,7.614523,W,0.000037,127,1157.377161,0,0,1,0,117.067076,med,791.620319,0
1,2.694922,3.551933,69.887657,31.895045,6.534184,E,0.000040,115,1134.429689,0,1,0,1,127.598019,hi,451.951898,0
2,6.498186,4.106111,91.152930,57.606073,11.580965,SE,0.000041,119,1209.603068,0,0,0,1,132.273679,hi,584.451361,1
3,8.750841,8.887995,54.360593,46.166720,15.383351,E,0.000040,112,1118.691631,0,0,0,0,116.482609,hi,589.681584,1
4,9.200210,9.810147,77.442791,25.490945,7.096639,NW,0.000045,146,1319.237687,0,0,1,0,136.521750,lo,1010.567058,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,0.550395,-6.851378,78.875952,37.214804,19.326215,S,0.000037,113,1070.498607,0,1,0,1,126.517723,lo,509.784673,1
346,5.365330,-3.866973,79.373600,42.774894,12.080757,NE,0.000046,128,1273.699945,0,1,0,1,142.413268,med,846.705612,1
347,5.958981,-4.975306,77.864578,44.738565,11.269098,NW,0.000040,122,1133.174647,0,1,0,1,128.566696,med,610.056881,0
348,4.835158,-4.434441,77.095744,34.444851,5.439857,S,0.000046,131,1306.927261,0,1,0,0,138.910058,med,896.484081,0


In [166]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC # importing the support vector classifier

In [143]:
# Separate the predictors from the target column 'wlf'.
x_train = training_data.drop(columns='wlf')

# Cast the 0/1 target to boolean labels. The builtin `bool` is used because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24, where
# accessing it raises AttributeError.
y_train = training_data['wlf'].astype(bool)
y_train

0      False
1      False
2       True
3       True
4      False
       ...  
345     True
346     True
347    False
348    False
349    False
Name: wlf, Length: 350, dtype: bool

Before training and tuning an SVM, I will use a random subset of the data to see how the SVM syntax works in the `LinearSVC` class.

In [144]:
# (rows, columns) of the predictor frame after dropping the target.
x_train.shape

(350, 16)

It is possible to fit a linear SVM using either `LinearSVC` or `SVC` with `kernel='linear'`. I will use the `SVC` class since it also allows for the use of the kernel trick.

Support Vector Machines are sensitive to the scale of the features. For this reason, we are going to standardize the numerical features so that they share a common scale. We will create a pipeline to do this for us.

In [145]:
# Inspect column dtypes and non-null counts to decide how each column is preprocessed.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 16 columns):
x             350 non-null float64
y             350 non-null float64
temp          350 non-null float64
humidity      350 non-null float64
windspd       350 non-null float64
winddir       350 non-null object
rain          350 non-null float64
days          350 non-null int64
vulnerable    350 non-null float64
other         350 non-null int64
ranger        350 non-null int64
pre1950       350 non-null int64
heli          350 non-null int64
resources     350 non-null float64
traffic       350 non-null object
burned        350 non-null float64
dtypes: float64(9), int64(5), object(2)
memory usage: 43.9+ KB


In [146]:
# Names of the string-typed (object dtype) columns; these get one-hot encoded later.
categorical_attribs = x_train.select_dtypes(include=['object']).columns.tolist()
categorical_attribs

['winddir', 'traffic']

In [147]:
# Names of the continuous (float64) columns; these are the ones that get standardised.
numerical_attribs = x_train.select_dtypes(include=['float64']).columns.tolist()
numerical_attribs

['x',
 'y',
 'temp',
 'humidity',
 'windspd',
 'rain',
 'vulnerable',
 'resources',
 'burned']

In [148]:
# Names of the integer (int64) columns; these are passed through the pipeline unchanged.
int_attribs = x_train.select_dtypes(include=['int64']).columns.tolist()
int_attribs

['days', 'other', 'ranger', 'pre1950', 'heli']

In [149]:
from sklearn.compose import ColumnTransformer

In [150]:
# Standardise the float features (StandardScaler: zero mean, unit variance).
numerical_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Column-wise preprocessing; output columns follow the order of the transformers:
#   1. float columns   -> standardised
#   2. integer columns -> passed through untouched
#   3. object columns  -> one-hot encoded
# NOTE(review): the integer group is not scaled, so 'days' (values around
# 112-146) stays on its raw scale next to standardised features -- confirm
# this is intended, since SVMs are sensitive to feature scale.
full_pipeline = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_attribs),
        ('int', 'passthrough', int_attribs),
        ('cat', OneHotEncoder(), categorical_attribs),
    ]
)

In [151]:
# Fit the preprocessing steps on the training predictors and transform them in one pass.
x_train_prepared=full_pipeline.fit_transform(x_train)

In [156]:
pd.DataFrame(x_train_prepared)
# the prepared matrix is now fully numeric (the categorical columns have been one-hot encoded), so it should be ready to be fed into an SVM

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1.058997,1.179363,1.664751,1.604212,-0.605487,-0.979828,-0.193299,-0.601602,0.345383,127.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,-0.004220,0.430835,-0.186346,-0.340026,-0.823074,-0.264558,-0.383529,0.338616,-0.807130,115.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.782561,0.518076,1.142654,1.128248,0.193379,0.000088,0.239641,0.756067,-0.357553,119.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,1.248567,1.270857,-1.156730,0.474984,0.959202,-0.319820,-0.513994,-0.653785,-0.339806,112.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.341528,1.416025,0.285822,-0.705743,-0.709792,0.985156,1.148488,1.135341,1.088280,146.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,-0.447857,-1.206890,0.375389,-0.036231,1.753319,-0.900510,-0.913503,0.242166,-0.610901,113.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
346,0.548207,-0.737075,0.406490,0.281287,0.294040,1.138475,0.770990,1.661345,0.532290,128.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
347,0.671016,-0.911553,0.312182,0.393426,0.130567,-0.206082,-0.393933,0.425102,-0.270672,122.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
348,0.438531,-0.826408,0.264133,-0.194414,-1.043478,1.032457,1.046437,1.348573,0.701191,131.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [169]:
# Support vector classifier with a linear kernel, regularisation parameter C=1
# and random_state fixed at 402.
linear_svm = SVC(kernel='linear', C=1, random_state=402)

In [170]:
# Train the linear-kernel SVC on the preprocessed training matrix and the boolean labels.
linear_svm.fit(x_train_prepared, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=402,
    shrinking=True, tol=0.001, verbose=False)