# Performing SVMs in Python

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
training_path=os.path.join('data', 'wildfires_train.csv')
testing_path=os.path.join('data', 'wildfires_test.csv')

training_data=pd.read_csv(training_path)
testing_data=pd.read_csv(testing_path)

In [3]:
training_data

Unnamed: 0,x,y,temp,humidity,windspd,winddir,rain,days,vulnerable,other,ranger,pre1950,heli,resources,traffic,burned,wlf
0,7.834467,8.306801,99.506964,65.940704,7.614523,W,0.000037,127,1157.377161,0,0,1,0,117.067076,med,791.620319,0
1,2.694922,3.551933,69.887657,31.895045,6.534184,E,0.000040,115,1134.429689,0,1,0,1,127.598019,hi,451.951898,0
2,6.498186,4.106111,91.152930,57.606073,11.580965,SE,0.000041,119,1209.603068,0,0,0,1,132.273679,hi,584.451361,1
3,8.750841,8.887995,54.360593,46.166720,15.383351,E,0.000040,112,1118.691631,0,0,0,0,116.482609,hi,589.681584,1
4,9.200210,9.810147,77.442791,25.490945,7.096639,NW,0.000045,146,1319.237687,0,0,1,0,136.521750,lo,1010.567058,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,0.550395,-6.851378,78.875952,37.214804,19.326215,S,0.000037,113,1070.498607,0,1,0,1,126.517723,lo,509.784673,1
346,5.365330,-3.866973,79.373600,42.774894,12.080757,NE,0.000046,128,1273.699945,0,1,0,1,142.413268,med,846.705612,1
347,5.958981,-4.975306,77.864578,44.738565,11.269098,NW,0.000040,122,1133.174647,0,1,0,1,128.566696,med,610.056881,0
348,4.835158,-4.434441,77.095744,34.444851,5.439857,S,0.000046,131,1306.927261,0,1,0,0,138.910058,med,896.484081,0


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC # importing the support vector classifier

In [5]:
x_train=training_data.drop('wlf', axis=1)
y_train=training_data['wlf'].astype(np.bool)
y_train

0      False
1      False
2       True
3       True
4      False
       ...  
345     True
346     True
347    False
348    False
349    False
Name: wlf, Length: 350, dtype: bool

Before training and tuning an SVM I will use a random subset of the data to see how SVM syntax works in the SVCLinear class.

In [6]:
x_train.shape

(350, 16)

It is possible to implement an SVM using the LinearSVC or the SVC by setting the kernel='linear'. I will use the SVC method since it allows for the use of the kernel trick.

Support Vector Machines are sensitive to the scale of the features. For this reason, we are going to engineer the features to have them all on a normal scale. We will create a pipeline to do this for us.

In [7]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 16 columns):
x             350 non-null float64
y             350 non-null float64
temp          350 non-null float64
humidity      350 non-null float64
windspd       350 non-null float64
winddir       350 non-null object
rain          350 non-null float64
days          350 non-null int64
vulnerable    350 non-null float64
other         350 non-null int64
ranger        350 non-null int64
pre1950       350 non-null int64
heli          350 non-null int64
resources     350 non-null float64
traffic       350 non-null object
burned        350 non-null float64
dtypes: float64(9), int64(5), object(2)
memory usage: 43.9+ KB


In [8]:
categorical_attribs=list(x_train.select_dtypes(include=['object']))
categorical_attribs

['winddir', 'traffic']

In [9]:
numerical_attribs=list(x_train.select_dtypes(include=['float64']))
numerical_attribs.append('days')
numerical_attribs

['x',
 'y',
 'temp',
 'humidity',
 'windspd',
 'rain',
 'vulnerable',
 'resources',
 'burned',
 'days']

In [10]:
int_attribs=list(x_train.select_dtypes(include=['int64']))
int_attribs.remove('days')
int_attribs

['other', 'ranger', 'pre1950', 'heli']

In [11]:
from sklearn.compose import ColumnTransformer

In [12]:
full_pipeline=ColumnTransformer([
    ('num', StandardScaler(), numerical_attribs),
    ('int', 'passthrough', int_attribs),
    ('cat', OneHotEncoder(), categorical_attribs)
])

In [13]:
x_train_prepared=full_pipeline.fit_transform(x_train)

In [14]:
pd.DataFrame(x_train_prepared)
# the data should be ready to be fed into an SVM if its accepts categorical variables without one hot encoding

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1.058997,1.179363,1.664751,1.604212,-0.605487,-0.979828,-0.193299,-0.601602,0.345383,0.298482,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,-0.004220,0.430835,-0.186346,-0.340026,-0.823074,-0.264558,-0.383529,0.338616,-0.807130,-0.400694,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.782561,0.518076,1.142654,1.128248,0.193379,0.000088,0.239641,0.756067,-0.357553,-0.167636,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,1.248567,1.270857,-1.156730,0.474984,0.959202,-0.319820,-0.513994,-0.653785,-0.339806,-0.575488,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.341528,1.416025,0.285822,-0.705743,-0.709792,0.985156,1.148488,1.135341,1.088280,1.405510,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,-0.447857,-1.206890,0.375389,-0.036231,1.753319,-0.900510,-0.913503,0.242166,-0.610901,-0.517224,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
346,0.548207,-0.737075,0.406490,0.281287,0.294040,1.138475,0.770990,1.661345,0.532290,0.356746,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
347,0.671016,-0.911553,0.312182,0.393426,0.130567,-0.206082,-0.393933,0.425102,-0.270672,0.007158,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
348,0.438531,-0.826408,0.264133,-0.194414,-1.043478,1.032457,1.046437,1.348573,0.701191,0.531540,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


### Fitting a Support Vector Classifier with a linear kernel

In [15]:
linear_svm=SVC(C=1,
              kernel='linear',
              random_state=402)

In [16]:
linear_svm.fit(x_train_prepared, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=402,
    shrinking=True, tol=0.001, verbose=False)

Now that I have the mechanism down for training a SVC, I will define a dictionary of hyperparameters for training through RandomizedSearchCV.

In [17]:
from sklearn.model_selection import RandomizedSearchCV

In [18]:
C_train=[]
for i in np.arange(0.01, 1, 0.1):
    C_train.append(i)
C_train

[0.01,
 0.11,
 0.21000000000000002,
 0.31000000000000005,
 0.41000000000000003,
 0.51,
 0.6100000000000001,
 0.7100000000000001,
 0.81,
 0.91]

In [19]:
parameters={'C': C_train}

In [20]:
svc_linear=SVC(kernel='linear')

In [21]:
random_search_linear=RandomizedSearchCV(svc_linear,
                                        parameters,
                                        scoring='accuracy',
                                        n_jobs=-1,
                                        cv=5,
                                        random_state=402)

In [22]:
random_search_linear.fit(x_train_prepared, y_train)



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                 coef0=0.0, decision_function_shape='ovr',
                                 degree=3, gamma='auto_deprecated',
                                 kernel='linear', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'C': [0.01, 0.11, 0.21000000000000002,
                                              0.31000000000000005,
                                              0.41000000000000003, 0.51,
                                              0.6100000000000001,
                                              0.7100000000000001, 0.81, 0.91]},
                   pre_dispatch='2*n_jobs', random_state=402, refit=True,
                   ret

Now that the model has been trained, we can view the best model through the best_estimator_ instance variable. Because the hyperparameter of the RandomizedSearchCV class, refit, is True, the best estimator is refit on the entire training data once it is found. This facilitates the process of finding and refitting the model to the entire data.

In [23]:
random_search_linear.best_estimator_

SVC(C=0.7100000000000001, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [24]:
linear_kernel_tuned=random_search_linear.best_estimator_

Now that we have the model ready and tuned, we can evaluate its efficacy on the test data. We only need to transform the test data through the full pipeline defined above and then we are ready.

In [25]:
x_test=testing_data.drop('wlf', axis=1)
y_test=testing_data['wlf'].astype(np.bool)
x_test.head()

Unnamed: 0,x,y,temp,humidity,windspd,winddir,rain,days,vulnerable,other,ranger,pre1950,heli,resources,traffic,burned
0,-5.970035,2.261171,81.717155,65.937444,22.615111,W,4.6e-05,104,1142.188491,0,0,1,0,118.687271,hi,530.958988
1,9.155116,4.493022,45.856767,27.899518,8.138942,E,4e-05,86,1000.744078,0,0,1,0,99.558121,hi,384.629393
2,8.167393,5.607745,66.276722,58.979409,5.556471,N,2.9e-05,89,883.002882,0,0,0,1,103.380896,hi,159.110239
3,5.529441,-4.785655,95.386058,49.083016,7.903764,E,3.9e-05,131,1246.923737,0,0,0,1,139.599418,lo,750.251702
4,-0.642753,-11.483435,95.445092,68.976029,12.66932,SE,4.4e-05,122,1275.687006,0,0,0,0,134.417068,hi,802.996063


When transforming the test data, it is important to only use **pipeline.transform** instead of **pipeline.fit_transform** since we want the model to be evaluated based on the values that were extracted from the training data using the **.fit** method.

In [26]:
x_test_prepared=full_pipeline.transform(x_test)
pd.DataFrame(x_test_prepared).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,-1.796737,0.227639,0.552954,1.604026,2.415723,1.17748,-0.31921,-0.456949,-0.539055,-1.041606,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1.332199,0.578985,-1.688188,-0.568197,-0.499866,-0.297252,-1.491752,-2.164829,-1.035559,-2.09037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.127869,0.754469,-0.412016,1.206675,-1.019991,-2.8112,-2.4678,-1.823526,-1.800758,-1.915576,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.582157,-0.881697,1.407209,0.641524,-0.547232,-0.387965,0.549021,1.41012,0.205017,0.53154,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.694684,-1.936085,1.410899,1.77755,0.41258,0.674359,0.787462,0.947432,0.383981,0.007158,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [27]:
# making predictions for the test data
y_test_hat=linear_kernel_tuned.predict(x_test_prepared)
y_test_hat=pd.DataFrame(y_test_hat).rename(columns={0:'wlf_hat'})
y_test_hat

Unnamed: 0,wlf_hat
0,True
1,True
2,False
3,True
4,True
...,...
145,False
146,False
147,False
148,False


In [28]:
from sklearn.metrics import accuracy_score

In [29]:
lin_kernel_score=accuracy_score(y_test, y_test_hat)
lin_kernel_score

0.7666666666666667

The linear kernel had an accuracy of 76.7%. We will compare the polynomial and radial basis function to this baseline.

### Fitting a Support Vector Classifier with a polynomial kernel
The benefit of the polynomial kernel is that we do not suffer from the combinatorial explosion from adding all the polynomial features to the data set.

In [30]:
gamma_tune=np.arange(0.05, 1.0, 0.05).tolist()
for i in gamma_tune:
    round(i, 2)
gamma_tune

[0.05,
 0.1,
 0.15000000000000002,
 0.2,
 0.25,
 0.3,
 0.35000000000000003,
 0.4,
 0.45,
 0.5,
 0.55,
 0.6000000000000001,
 0.6500000000000001,
 0.7000000000000001,
 0.7500000000000001,
 0.8,
 0.8500000000000001,
 0.9000000000000001,
 0.9500000000000001]

In [31]:
poly_params={'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': gamma_tune, 'degree': [2,3,4]}
poly_params

{'C': [0.001, 0.01, 0.1, 1, 10],
 'gamma': [0.05,
  0.1,
  0.15000000000000002,
  0.2,
  0.25,
  0.3,
  0.35000000000000003,
  0.4,
  0.45,
  0.5,
  0.55,
  0.6000000000000001,
  0.6500000000000001,
  0.7000000000000001,
  0.7500000000000001,
  0.8,
  0.8500000000000001,
  0.9000000000000001,
  0.9500000000000001],
 'degree': [2, 3, 4]}

In [32]:
svc_poly=SVC(kernel='poly')

In [33]:
svc_poly_models=RandomizedSearchCV(svc_poly,
                                   poly_params,
                                   n_iter=100,
                                   scoring='accuracy',
                                   n_jobs=-1,
                                   cv=5,
                                   random_state=402)

In [34]:
svc_poly_models.fit(x_train_prepared, y_train)



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                 coef0=0.0, decision_function_shape='ovr',
                                 degree=3, gamma='auto_deprecated',
                                 kernel='poly', max_iter=-1, probability=False,
                                 random_state=None, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='warn', n_iter=100, n_jobs=-1,
                   param_distributions={'C': [0.001, 0....
                                        'degree': [2, 3, 4],
                                        'gamma': [0.05, 0.1,
                                                  0.15000000000000002, 0.2,
                                                  0.25, 0.3,
                                                  0.35000000000000003, 0.4,
                                                  0.45, 0.5, 0.55,
 

In [35]:
svc_poly_models.best_params_

{'gamma': 0.25, 'degree': 2, 'C': 1}

In [36]:
poly_kernel_tuned=svc_poly_models.best_estimator_
poly_kernel_tuned

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma=0.25, kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [37]:
y_poly_hat=poly_kernel_tuned.predict(x_test_prepared)
y_poly_hat=pd.DataFrame(y_poly_hat).rename(columns={0:'wlf_hat'})
y_poly_hat

Unnamed: 0,wlf_hat
0,True
1,True
2,False
3,False
4,True
...,...
145,True
146,True
147,True
148,False


In [38]:
poly_kernel_score=accuracy_score(y_test, y_poly_hat)
poly_kernel_score

0.78

Whereas the SVC with a linear kernel was able to achieve an accuracy of 76.7%, the SVC with a polynomial kernel was able to achieve an accuracy of 78%.

### Fitting a Support Vector Classifier with a radial basis function kernel

In [40]:
rbf_params={'C': [0.001, 0.01, 0.1, 1, 10, 20, 100], 'gamma': gamma_tune}
rbf_params

{'C': [0.001, 0.01, 0.1, 1, 10, 20, 100],
 'gamma': [0.05,
  0.1,
  0.15000000000000002,
  0.2,
  0.25,
  0.3,
  0.35000000000000003,
  0.4,
  0.45,
  0.5,
  0.55,
  0.6000000000000001,
  0.6500000000000001,
  0.7000000000000001,
  0.7500000000000001,
  0.8,
  0.8500000000000001,
  0.9000000000000001,
  0.9500000000000001]}

In [41]:
svc_rbf=SVC(kernel='rbf',
           random_state=402)

In [42]:
svc_rbf_models=RandomizedSearchCV(svc_rbf,
                                 rbf_params,
                                 n_iter=100,
                                 scoring='accuracy',
                                 n_jobs=-1,
                                 cv=5,
                                 random_state=402)

In [43]:
# fitting the models to the prepared training data
svc_rbf_models.fit(x_train_prepared, y_train)



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                 coef0=0.0, decision_function_shape='ovr',
                                 degree=3, gamma='auto_deprecated',
                                 kernel='rbf', max_iter=-1, probability=False,
                                 random_state=402, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='warn', n_iter=100, n_jobs=-1,
                   param_distributions={'C': [0.001, 0.01, 0.1, 1, 10, 20, 100],
                                        'gamma': [0.05, 0.1,
                                                  0.15000000000000002, 0.2,
                                                  0.25, 0.3,
                                                  0.35000000000000003, 0.4,
                                                  0.45, 0.5, 0.55,
                                          

Now that the models have been searched over, we can looks at the best parameters that were determined to have the best accuracy.

In [44]:
svc_rbf_models.best_params_

{'gamma': 0.05, 'C': 10}

Let's take a look at how these compare to the best parameters that we obtain doing full grid search since our hyperparameter space is not that large.

I will use the Sci-Kit Learn GridSearchCV class.

In [46]:
from sklearn.model_selection import GridSearchCV

In [49]:
svc_full_search=GridSearchCV(svc_rbf,
                            rbf_params,
                            scoring='accuracy',
                            n_jobs=-1,
                            cv=5)

In [50]:
svc_full_search.fit(x_train_prepared, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=402, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 20, 100],
                         'gamma': [0.05, 0.1, 0.15000000000000002, 0.2, 0.25,
                                   0.3, 0.35000000000000003, 0.4, 0.45, 0.5,
                                   0.55, 0.6000000000000001, 0.6500000000000001,
                                   0.7000000000000001, 0.7500000000000001, 0.8,
                                   0.8500000000000001, 0.9000000000000001,
                                   0.9500000000000001]},
             pre_dispatch='2*n_jobs',

In [52]:
print('Random search best parameters:', svc_rbf_models.best_params_, 
      '\nFull search best parameters', svc_full_search.best_params_)

Random search best parameters: {'gamma': 0.05, 'C': 10} 
Full search best parameters {'C': 10, 'gamma': 0.05}


Here we can see that both Randomized grid search and full grid search resulted in the same hyperparameters. I will now evaluate my best estimator's performace on the test data.

In [53]:
rbf_kernel_tuned=svc_rbf_models.best_estimator_
rbf_kernel_tuned

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.05, kernel='rbf',
    max_iter=-1, probability=False, random_state=402, shrinking=True, tol=0.001,
    verbose=False)

In [59]:
y_rbf_hat=rbf_kernel_tuned.predict(x_test_prepared)
y_rbf_hat

array([ True,  True, False,  True,  True, False,  True,  True,  True,
        True, False, False,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True, False,  True, False, False, False,
       False,  True, False,  True,  True, False, False, False,  True,
       False, False,  True, False,  True, False, False, False, False,
       False,  True,  True,  True, False, False,  True,  True, False,
       False,  True, False,  True,  True,  True,  True,  True,  True,
       False, False, False,  True,  True, False, False,  True, False,
        True,  True,  True, False, False,  True,  True,  True,  True,
       False,  True,  True, False, False, False, False,  True,  True,
        True, False,  True,  True, False, False, False,  True,  True,
       False,  True,  True,  True, False,  True, False,  True, False,
       False,  True,  True, False,  True, False,  True, False,  True,
        True, False, False,  True,  True, False, False, False, False,
       False, False,

In [61]:
y_rbf_hat=pd.DataFrame(y_rbf_hat).rename(columns={0: 'wlf_hat'})
y_rbf_hat

Unnamed: 0,wlf_hat
0,True
1,True
2,False
3,True
4,True
...,...
145,True
146,True
147,True
148,False


In [62]:
rbf_kernel_score=accuracy_score(y_test, y_rbf_hat)
rbf_kernel_score

0.8

I fit three different Support Vector Classifiers onto the wildfire data. I used a linear, polynomial, and radial basis function kernel. Each resulted in a decent accuracy score. The rbf kernel, however, resulted in the highest of the three.

In [68]:
names=['Linear kernel score', 'Polynomial kernel score', 'RBF kernel score']
scores=[lin_kernel_score, poly_kernel_score, rbf_kernel_score]

for name,score in zip(names, scores):
    print(name + ': ' + str(round(score, 2)))

Linear kernel score: 0.77
Polynomial kernel score: 0.78
RBF kernel score: 0.8
