Task 4.2 - Support Vector Machine 

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score,cohen_kappa_score, roc_auc_score, roc_curve, classification_report


In [93]:
ds = pd.read_csv('data/covtype.csv')

In [19]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 55 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Elevation                           581012 non-null  float64
 1   Aspect                              581012 non-null  float64
 2   Slope                               581012 non-null  float64
 3   Horizontal_Distance_To_Hydrology    581012 non-null  float64
 4   Vertical_Distance_To_Hydrology      581012 non-null  float64
 5   Horizontal_Distance_To_Roadways     581012 non-null  float64
 6   Hillshade_9am                       581012 non-null  float64
 7   Hillshade_Noon                      581012 non-null  float64
 8   Hillshade_3pm                       581012 non-null  float64
 9   Horizontal_Distance_To_Fire_Points  581012 non-null  float64
 10  Wilderness_Area1                    581012 non-null  int64  
 11  Wilderness_Area2          

### Scaling Numerical Data

In [94]:
scale = StandardScaler()

ds.columns.get_loc('Cover_Type') # finding index of dependent variable 

numerical_features = ds.iloc[:,0:10]
scaled_num_features =  pd.DataFrame(scale.fit_transform(numerical_features), columns = numerical_features.columns)

ds = pd.concat([scaled_num_features, ds.iloc[:,10:]], axis = 1)

### Train/Test Split 

In [95]:
#Subsetting feature
X = ds.drop(ds.columns[-1], axis = 1)
y = ds[ds.columns[-1]]

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 650, test_size = 0.3)

0         5
1         5
2         2
3         2
4         5
         ..
581007    3
581008    3
581009    3
581010    3
581011    3
Name: Cover_Type, Length: 581012, dtype: int64


### Support Vector Machine - Baseline model 

The training this data using the standard SVC() function  takes an exceedingly long time. Per the [sci-kit learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html):

*"For large datasets consider using LinearSVC or SGDClassifier instead, possibly after a Nystroem transformer or other Kernel Approximation"*

In [6]:
from sklearn.svm import LinearSVC

model_lsvc = LinearSVC(random_state = 650)
model_lsvc.fit(X_train,y_train) 



In [101]:
#predicting forest type on test data using first linear SVC model:  
lsvc_predict = model_lsvc.predict(X_test)

#print accuracy score: 
print(accuracy_score(lsvc_predict, y_test))



ValueError: X has 54 features, but LinearSVC is expecting 30 features as input.

In [33]:
print('Confusion Matrix:',*confusion_matrix(y_test,lsvc_predict), sep="\n")

Confusion Matrix:
[43325 18867    59     0     5     3  1377]
[15084 67775  1927     4    28   143   151]
[   0 1150 9251   88    0  185    0]
[  0   0 564 184   0  61   0]
[  65 2461  248    0   40    1    1]
[   0 1829 3045   17    0  277    0]
[2843   40   24    0    0    0 3182]


### Hyperparameter Tuning with RandomizedSearchCV 
(Using RandomizezdSearchCV over GridSearchCV due to size of dataset)

In [9]:
from scipy.stats import loguniform, randint


param_dist = {
    'C': loguniform(.0001, 10000),  #choosing a wide range for C 
    'max_iter': randint(1000, 5000) # randomly sampling between 1000 and 5000
}

In [15]:
from sklearn.model_selection import RandomizedSearchCV

n_iter_search = 5
random_search = RandomizedSearchCV(model_lsvc, 
                                   param_distributions = param_dist, 
                                   n_iter = n_iter_search, 
                                   n_jobs =4, 
                                   cv = 3, 
                                   random_state = 650)

In [16]:
random_search.fit(X_train, y_train)



In [17]:
random_search.best_params_

{'C': 14.367136335397122, 'max_iter': 1902}

In [21]:
best_lsvc = LinearSVC(C = random_search.best_params_['C'], max_iter = random_search.best_params_['max_iter'], random_state = 650)

In [22]:
best_lsvc.fit(X_train,y_train)




In [23]:
#predicting forest type on test data using first linear SVC model:  
lsvc_predict2 = best_lsvc.predict(X_test)

In [24]:
print(accuracy_score(lsvc_predict2, y_test)) #this randomized grid search produced results that were slightly worse than default hp settings. 
# the search space might not have included the best values. 

0.7113950339636497


### KBest feature selection

In [96]:
from sklearn.feature_selection import SelectKBest

k_best_selector = SelectKBest(k = 30)

In [97]:
k_best_selector.fit(X = X, y = y)

In [88]:
print(X.columns[k_best_selector.get_support()])

Index(['Elevation', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4',
       'Soil_Type6', 'Soil_Type10', 'Soil_Type12', 'Soil_Type13',
       'Soil_Type14', 'Soil_Type17', 'Soil_Type22', 'Soil_Type23',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type32', 'Soil_Type35',
       'Soil_Type38', 'Soil_Type39', 'Soil_Type40'],
      dtype='object')


In [98]:
X_train_kbest = k_best_selector.fit_transform(X_train, y_train)
X_test_kbest = k_best_selector.transform(X_test)

In [99]:
model_lsvc.fit(X_train_kbest, y_train)



In [100]:
#predicting forest type on test data using first linear SVC model:  
lsvc_predict3 = model_lsvc.predict(X_test_kbest)

#print accuracy score: 
print(accuracy_score(lsvc_predict3, y_test))

#accuracy = .68, worse performance than including all variables

0.7069028823205434


# Downsampling 

In [35]:
ds.Cover_Type.value_counts()

2    283301
1    211840
3     35754
7     20510
6     17367
5      9493
4      2747
Name: Cover_Type, dtype: int64

This dataset is pretty heavily imbalanced, which is likely affecting our model's ability to differentiate between different tree types. Let's downsample each class to equalize how many of each class is present. 

In [57]:
each_tree_type = []
for i in ds['Cover_Type'].unique():
    each_tree_type.append(ds[ds['Cover_Type'] == i]) #now each element in this list is a dataframe containing each class of tree


In [68]:
len(each_tree_type[6])

2747

In [69]:
from sklearn.utils import resample

downsampled_trees = []

#downsampling each tree type data frame to match the size of the smallest class, Cover Type 4 with 2747 rows
for i in each_tree_type: 
    downsampled_trees.append(resample(i, replace = False, n_samples = len(each_tree_type[6]), random_state = 650))

In [74]:
#now we need to transform this list of data frames back into one dataframe: 
ds_dsamp = pd.concat(downsampled_trees)

ds_dsamp = ds_dsamp.sample(frac = 1, random_state = 650)

In [78]:
#confirming that we now have a balanced dataset: 
ds_dsamp.Cover_Type.value_counts()

1    2747
7    2747
2    2747
4    2747
3    2747
5    2747
6    2747
Name: Cover_Type, dtype: int64

In [104]:
X = ds_dsamp.drop(ds_dsamp.columns[-1], axis = 1)
y = ds_dsamp[ds_dsamp.columns[-1]]

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 650, test_size = 0.3)

In [80]:
model_lsvc.fit(X_train,y_train) 



In [81]:
#predicting forest type on test data using first linear SVC model:  
lsvc_predict = model_lsvc.predict(X_test)

#print accuracy score: 
print(accuracy_score(lsvc_predict, y_test))

0.6763737216155313


Apply best parameters to downsampled train fit: 

In [83]:
best_lsvc.fit(X_train,y_train)



In [84]:
lsvc_predict4 = best_lsvc.predict(X_test)

In [85]:
print(accuracy_score(lsvc_predict4, y_test))

0.6768937424163634


### What if we did another Tuning Grid on the downsampled dataset? 

In [103]:
ds_dsamp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19229 entries, 429611 to 577234
Data columns (total 55 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Elevation                           19229 non-null  float64
 1   Aspect                              19229 non-null  float64
 2   Slope                               19229 non-null  float64
 3   Horizontal_Distance_To_Hydrology    19229 non-null  float64
 4   Vertical_Distance_To_Hydrology      19229 non-null  float64
 5   Horizontal_Distance_To_Roadways     19229 non-null  float64
 6   Hillshade_9am                       19229 non-null  float64
 7   Hillshade_Noon                      19229 non-null  float64
 8   Hillshade_3pm                       19229 non-null  float64
 9   Horizontal_Distance_To_Fire_Points  19229 non-null  float64
 10  Wilderness_Area1                    19229 non-null  int64  
 11  Wilderness_Area2                   

In [113]:
param_grid = {
    'C': [11, 12, 13],          # Centered around the best value 14.367136335397122
    'max_iter': [1700, 1800, 1900]  # Centered around the best value 1902
}

In [114]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(model_lsvc, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)



In [115]:
grid_search.best_params_

{'C': 12, 'max_iter': 1700}

In [116]:
best_lsvc3 = LinearSVC(C = grid_search.best_params_['C'], max_iter = grid_search.best_params_['max_iter'], random_state = 650)

In [117]:
best_lsvc3.fit(X_train,y_train)



In [118]:
#predicting forest type on test data using first linear SVC model:  
lsvc_predict5 = best_lsvc3.predict(X_test)

#print accuracy score: 
print(accuracy_score(lsvc_predict5, y_test))

0.6763737216155313
