# Support Vector Classifier

## Contents
- [Imports](#Imports)
- [Prepare Dataset](#Prepare-Dataset)
- [OPTION: Cluster Latitude and Longitude data](#OPTION:-Cluster-Latitude-and-Longitude-data)
- [Bootstrap](#Bootstrap)
- [One Hot Encode Features](#One-Hot-Encode-Features)
- [Create Scaled X, y and Train, Test](#Create-Scaled-X,-y-and-Train,-Test)
- [Compile and Fit Support Vector Classifier](#Compile-and-Fit-Support-Vector-Classifier)
- [Model Evaluation](#Model-Evaluation)

## Overview
This notebook explores a Support Vector Classifier (SVC) for predicting fire size class. The model was considered as a source of feature importance, but it took more resources to run than the Random Forest Classifier.

# Imports

In [None]:
import boto3
import io
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.metrics import plot_confusion_matrix, recall_score, accuracy_score, f1_score, make_scorer, precision_score

# Prepare Dataset

In [None]:
# Read the cleaned, combined dataset from S3, then discard the 'Unnamed: 0'
# column (presumably a saved index -- confirm) and every row with a missing value.
noaa_on_fire = (
    pd.read_csv('s3://git-to-amazon-s3-outputbucket-rorni8oehk4l/soulclimberchick/meteorology-fire-impact/data-files/mfi_df_yr_trail.csv')
    .drop(columns='Unnamed: 0')
    .dropna()
)

In [None]:
# Inspect the columns of the loaded frame. The bootstrapped frame
# (noaa_on_fire_btstrp) is only created later, in the Bootstrap section, so
# referencing it here raises NameError on a fresh kernel.
noaa_on_fire.columns

# OPTION: Cluster Latitude and Longitude data

In [None]:
# OPTIONAL (disabled): cluster fire locations into 200 KMeans groups and store
# the cluster label as a new 'cluster' column on noaa_on_fire.
# NOTE(review): KMeans is not imported in the Imports cell; enabling this cell
# also needs `from sklearn.cluster import KMeans`.

# # Put lat/long into a matrix
# location_data = round(noaa_on_fire[['longitude', 'latitude']],4) # round to decrease grid accuracy of lat/long

# # Cluster lat/long
# km = KMeans(n_clusters=200)
# km.fit(location_data)

# # Append cluster labels back onto noaa_on_fire as the 'cluster' column
# noaa_on_fire.loc[:, 'cluster'] = km.predict(location_data)

# Bootstrap

In [None]:
# Oversample fire size classes C through G: for each class, draw 100,000 rows
# with replacement (fixed seed) and stack the draws beneath the original frame.
oversampled_classes = ['C', 'D', 'E', 'F', 'G']
resampled_parts = [
    noaa_on_fire[noaa_on_fire['fire_size_class'] == size_class]
    .sample(n=100000, replace=True, random_state=11)
    for size_class in oversampled_classes
]
noaa_on_fire_btstrp = pd.concat([noaa_on_fire, *resampled_parts], axis=0)

# One Hot Encode Features

In [None]:
# One-hot encode 'state' and 'month' on the BOOTSTRAPPED frame, dropping the
# first level of each. Encoding `noaa_on_fire` here instead would overwrite
# noaa_on_fire_btstrp and silently discard the oversampled rows added in the
# Bootstrap section.
# Note: this cell rebinds its own input, so it is not re-runnable on its own;
# re-run the Bootstrap cell first if it needs to be executed again.
noaa_on_fire_btstrp = pd.get_dummies(noaa_on_fire_btstrp, columns=['state', 'month'], drop_first=True)
#noaa_on_fire = pd.get_dummies(noaa_on_fire, columns=['cluster']) # location clustered by lat/long

# Create Scaled X, y and Train, Test

In [None]:
# Build the feature matrix X and target y from the encoded frame.
# The features are the numeric columns listed below plus the month_* dummies
# created by the one-hot encoding step; the state_* dummies are not included.
feature_cols = [
    'pcp', 'tavg', 'pdsi', 'phdi', 'zndx', 'pmdi', 'sp02', 'sp03', 'sp06', 'sp09',  # 10
    'sp12', 'sp24', 'tmin', 'tmax', 'month_2', 'month_3', 'month_4', 'month_5', 'cdd', 'hdd',  # 20
    'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'tavg_t12m', 'tavg_t9m', 'tavg_t6m',  # 30
    'tavg_t3m', 'pcp_t12m', 'pcp_t9m', 'pcp_t6m', 'pcp_t3m', 'pmdi_t12m', 'pmdi_t9m', 'pmdi_t6m', 'pmdi_t3m', 'pdsi_t12m',  # 40
    'pdsi_t9m', 'pdsi_t6m', 'pdsi_t3m',  # 43
]
X = noaa_on_fire_btstrp[feature_cols]

y = noaa_on_fire_btstrp['fire_size_class']

# Stratified train/test split. The seed is fixed (same value used for the
# bootstrap sampling) so the split and the reported scores are reproducible
# across kernel restarts.
# NOTE(review): if noaa_on_fire_btstrp holds rows resampled with replacement
# before this split, identical rows can land in both train and test and
# inflate the test score -- consider splitting first and resampling only the
# training portion.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=11)

# Standardize features: fit the scaler on the training split only, then apply
# the same transformation to the test split.
stan = StandardScaler()
X_train = stan.fit_transform(X_train)
X_test = stan.transform(X_test)

#### Model Baseline

In [None]:
# What is the accuracy of our baseline model?
# Baseline: proportion of each class in the target. The largest proportion is
# the accuracy obtained by always predicting the most frequent class.
y.value_counts(normalize=True)

# Compile and Fit Support Vector Classifier

In [None]:
# Fit an SVC with its default hyperparameters on the scaled training data,
# then report its score on the train and test splits, rounded to 4 places.
svc = SVC()
svc.fit(X_train, y_train)

train_accuracy = round(svc.score(X_train, y_train), 4)
test_accuracy = round(svc.score(X_test, y_test), 4)
print(f"Support Vector Classifier achieves accuracy of {train_accuracy} on train data and {test_accuracy} on test data.")

# Model Evaluation

In [None]:
# Confusion matrix on the test split; normalize='true' scales each row (true
# class) to sum to 1. The trailing ';' suppresses the returned object's repr.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use
# ConfusionMatrixDisplay.from_estimator(svc, X_test, y_test, ...) instead --
# confirm against the installed scikit-learn version.
plot_confusion_matrix(svc, X_test, y_test,cmap = 'YlOrBr', normalize = 'true');