Task 4.2 - Support Vector Machine 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [3]:
ds = pd.read_csv('data/covtype.csv')

In [19]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 55 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Elevation                           581012 non-null  float64
 1   Aspect                              581012 non-null  float64
 2   Slope                               581012 non-null  float64
 3   Horizontal_Distance_To_Hydrology    581012 non-null  float64
 4   Vertical_Distance_To_Hydrology      581012 non-null  float64
 5   Horizontal_Distance_To_Roadways     581012 non-null  float64
 6   Hillshade_9am                       581012 non-null  float64
 7   Hillshade_Noon                      581012 non-null  float64
 8   Hillshade_3pm                       581012 non-null  float64
 9   Horizontal_Distance_To_Fire_Points  581012 non-null  float64
 10  Wilderness_Area1                    581012 non-null  int64  
 11  Wilderness_Area2          

### Scaling Numerical Data

In [4]:
scale = StandardScaler()

ds.columns.get_loc('Cover_Type') # finding index of dependent variable 

numerical_features = ds.iloc[:,0:10]
scaled_num_features =  pd.DataFrame(scale.fit_transform(numerical_features), columns = numerical_features.columns)

ds = pd.concat([scaled_num_features, ds.iloc[:,10:]], axis = 1)

This dataset has 581,012 rows, which in its entirety has been computationally and time intensive to train models from. We can randomly sample the data and train using a more manageable dataset: 

In [9]:
#ds2 = ds.sample(frac = .3, random_state = 650) #Sampling 20% of the original dataset 

### Train/Test Split 

In [5]:
#Subsetting feature
X = ds.drop(ds.columns[-1], axis = 1)
y = ds[ds.columns[-1]]

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 650, test_size = 0.3)

0         5
1         5
2         2
3         2
4         5
         ..
581007    3
581008    3
581009    3
581010    3
581011    3
Name: Cover_Type, Length: 581012, dtype: int64


### Support Vector Machine - Model 1 

In [12]:
#model_svc = SVC(kernel = 'linear')
#model_svc.fit(X_train, y_train)

The training attempt above takes an exceedingly long time. Per the [sci-kit learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html):

*"For large datasets consider using LinearSVC or SGDClassifier instead, possibly after a Nystroem transformer or other Kernel Approximation"*

In [6]:
from sklearn.svm import LinearSVC

model_lsvc = LinearSVC(random_state = 650)
model_lsvc.fit(X_train,y_train) 



In [7]:
#predicting forest type on test data using first linear SVC model:  
lsvc_predict = model_lsvc.predict(X_test)

#print accuracy score: 
print(accuracy_score(lsvc_predict, y_test))

0.7115958325683863


### Hyperparameter Tuning with RandomizedSearchCV 
(Using RandomizezdSearchCV over GridSearchCV due to size of dataset)

In [9]:
from scipy.stats import loguniform, randint


param_dist = {
    'C': loguniform(.0001, 10000),  #choosing a wide range for C 
    'max_iter': randint(1000, 5000) # randomly sampling between 1000 and 5000
}

In [15]:
from sklearn.model_selection import RandomizedSearchCV

n_iter_search = 5
random_search = RandomizedSearchCV(model_lsvc, 
                                   param_distributions = param_dist, 
                                   n_iter = n_iter_search, 
                                   n_jobs =4, 
                                   cv = 3, 
                                   random_state = 650)

In [16]:
random_search.fit(X_train, y_train)



In [17]:
random_search.best_params_

{'C': 14.367136335397122, 'max_iter': 1902}

In [21]:
best_lsvc = LinearSVC(C = random_search.best_params_['C'], max_iter = random_search.best_params_['max_iter'], random_state = 650)

In [22]:
best_lsvc.fit(X_train,y_train)




In [23]:
#predicting forest type on test data using first linear SVC model:  
lsvc_predict2 = best_lsvc.predict(X_test)

In [24]:
print(accuracy_score(lsvc_predict2, y_test)) #this randomized grid search produced results that were slightly worse than default hp settings. 
# the search space might not have included the best values. 

0.7113950339636497


### KBest feature selection

In [25]:
from sklearn.feature_selection import SelectKBest

k_best_selector = SelectKBest(k = 15)

In [26]:
k_best_selector.fit(X = X, y = y)

In [27]:
print(X.columns[k_best_selector.get_support()])

Index(['Elevation', 'Slope', 'Horizontal_Distance_To_Roadways',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type6', 'Soil_Type10', 'Soil_Type38', 'Soil_Type39',
       'Soil_Type40'],
      dtype='object')


In [28]:
X_train_kbest = k_best_selector.fit_transform(X_train, y_train)
X_test_kbest = k_best_selector.transform(X_test)

In [29]:
model_lsvc.fit(X_train_kbest, y_train)



In [30]:
#predicting forest type on test data using first linear SVC model:  
lsvc_predict3 = model_lsvc.predict(X_test_kbest)

#print accuracy score: 
print(accuracy_score(lsvc_predict3, y_test))

#accuracy = .68, worse performance than including all variables

0.6787910776574261
