## Support Vector Machines

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [4]:
X_train = pd.read_csv("Data/X_train.csv", sep = ",", index_col = 0).iloc[:, :-1]
y_train = pd.read_csv("Data/Y_train.csv", sep = ",", index_col = 0)
X_test = pd.read_csv("Data/X_test.csv", sep = ",", index_col = 0).iloc[:, :-1]
y_test = pd.read_csv("Data/Y_test.csv", sep = ",", index_col = 0)

y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((17506, 92), (17506,), (4340, 92), (4340,))

In [8]:
## Undersample the Data to make it balanced
from imblearn.over_sampling import RandomOverSampler

# create the object.
over_sampler = RandomOverSampler()

# fit the object to the training data.
x_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train)

y_train_over1 = pd.DataFrame(y_train_over, columns = ["strategy"])
y_train_over1.strategy.value_counts()

y_train_over1 = np.ravel(y_train_over1)

In [None]:
# # get train and test sets (this will not be necessary in the final code)

# X_train = pd.read_csv("Data/X_train.csv").iloc[:, 1:]
# Y_train = pd.read_csv("Data/Y_train.csv").iloc[:, 1:]
# X_test = pd.read_csv("Data/X_test.csv").iloc[:, 1:]
# Y_test = pd.read_csv("Data/Y_test.csv").iloc[:, 1:]

# X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

# # other approach (test_train_split)
# train = pd.read_csv("Data/Clean Test Set.csv").iloc[:, 1:]
# test = pd.read_csv("Data/Clean Training Set.csv").iloc[:, 1:]
# df = pd.concat([test, train])
# # Assign feature columns to x and y
# x = df.iloc[:, :-1]
# y = df.iloc[:, -1:]
# y = np.ravel(y)
# # Train, test set split
# x_train, x_test, y_train, y_test = train_test_split(x, y, 
#                                                     test_size=0.2, 
#                                                     random_state=0, 
#                                                     stratify=y)

### 1 Polynomial Kernel Function

In [9]:
# function

def SVM_poly(X_train,Y_train):
    '''
    X_train: Training Set of X values
    y_train: Training Set of Y values
    '''
    
    # Create pipeline object with standard scaler and SVC estimator
    pipe = Pipeline([('scaler', StandardScaler()), 
                     ('svm_poly', SVC(kernel='poly', random_state=0, max_iter=100 ))])
    # Define parameter grid
    param_grid = {'svm_poly__C': [10], 
                  'svm_poly__degree': [20],
                  'svm_poly__gamma': [0.01],
                  'svm_poly__coef0':[0.1]} 
    # Run grid search
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid = grid.fit(X_train, Y_train)
    return(grid)

In [10]:
# output
print(datetime.datetime.now()) #computation time
poly = SVM_poly(x_train_over,y_train_over1)
print('Best CV accuracy: {:.2f}'.format(poly.best_score_))
print('Test score:       {:.2f}'.format(poly.score(X_test, y_test)))
print('Best parameters: {}'.format(poly.best_params_))
print(datetime.datetime.now())

# Predict classes
y_pred = poly.predict(X_test)



# Manual confusion matrix as pandas DataFrame
confm = pd.DataFrame({'Predicted': y_pred,
                      'True': y_test})
print('Polynomial Kernel Function yields the following confusion matrix:')
print(confm.groupby(['True','Predicted'], sort=True).size().unstack('Predicted')) 

2021-04-13 17:44:33.579053




Best CV accuracy: 0.32
Test score:       0.04
Best parameters: {'svm_poly__C': 10, 'svm_poly__coef0': 0.1, 'svm_poly__degree': 20, 'svm_poly__gamma': 0.01}
2021-04-13 17:50:44.178947
Polynomial Kernel Function yields the following confusion matrix:
Predicted   0.0     1.0    2.0
True                          
0.0        15.0  2952.0  155.0
1.0         NaN   141.0    7.0
2.0        14.0  1017.0   39.0


### 2 Radial Basis Function

In [11]:
# function

def SVM_rbf(X_train,Y_train):
    '''
    X_train: Training Set of X values
    Y_train: Training Set of Y values
    '''
    
    # Create pipeline object with standard scaler and SVC estimator
    pipe = Pipeline([('scaler', StandardScaler()), 
                     ('svm_rbf', SVC(kernel='rbf', random_state=0, max_iter=100))])
    # Define parameter grid
    param_grid = {'svm_rbf__C': [255], 
                  'svm_rbf__gamma': [0.45]} 
    # Run grid search
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid = grid.fit(X_train, Y_train)
    return(grid)

In [15]:
#######Radial Basis Kernel Function(rbf)#######
print(datetime.datetime.now()) #computation time
rbf = SVM_rbf(x_train_over,y_train_over1)
print('Best CV accuracy: {:.2f}'.format(rbf.best_score_))
print('Test score:       {:.2f}'.format(rbf.score(X_test, y_test)))
print('Best parameters: {}'.format(rbf.best_params_))
print(datetime.datetime.now()) #10min

# Predict classes
y_pred = rbf.predict(X_test)

# Manual confusion matrix as pandas DataFrame
confm = pd.DataFrame({'Predicted': y_pred,
                      'True': y_test})
print('Radial Basis Function Kernel yields the following confusion matrix:')
print(confm.groupby(['True','Predicted'], sort=True).size().unstack('Predicted'))

2021-04-13 17:53:49.667097




Best CV accuracy: 0.32
Test score:       0.20
Best parameters: {'svm_rbf__C': 255, 'svm_rbf__gamma': 0.45}
2021-04-13 17:53:54.252837
Radial Basis Function Kernel yields the following confusion matrix:
Predicted  1.0   2.0
True                
0.0        621  2501
1.0         22   126
2.0        232   838


### 3 Radial Basis Fuction (balanced)

In [16]:
# function

def SVM_rbf_bal(X_train,Y_train):
    '''
    X_train: Training Set of X values
    Y_train: Training Set of Y values
    '''
    
    # Create pipeline object with standard scaler and SVC estimator
    pipe = Pipeline([('scaler', StandardScaler()), 
                     ('svm_rbf', SVC(kernel='rbf', random_state=0, max_iter=100, class_weight='balanced'))])

    # Define parameter grid
    param_grid = {'svm_rbf__C': [200], 
                  'svm_rbf__gamma': [0.35]} 
    # Run grid search
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1) #cv=5 yields same accuracy
    grid = grid.fit(X_train, Y_train)
    return(grid)

In [17]:
#######Radial Basis Kernel Function(rbf) with Balanced class weights#######
print(datetime.datetime.now()) #computation time
rbf_bal = SVM_rbf_bal(x_train_over,y_train_over1)
print('Best CV accuracy: {:.2f}'.format(rbf_bal.best_score_))
print('Test score:       {:.2f}'.format(rbf_bal.score(X_test, y_test)))
print('Best parameters: {}'.format(rbf_bal.best_params_))
print(datetime.datetime.now())#10min

#looking at the confusion matrix of non-balanced rbf we see that the smaller classes don't get more wrong classification.
#Therefore balancing the weights should not influence the outcome greatly which it doesn't


# Predict classes
y_pred = rbf_bal.predict(X_test)

# Manual confusion matrix as pandas DataFrame
confm = pd.DataFrame({'Predicted': y_pred,
                      'True': y_test})
print('Radial Basis Function Kernel with Balanced class weights yields the following confusion matrix:')
print(confm.groupby(['True','Predicted'], sort=True).size().unstack('Predicted'))

# According to "https://stackoverflow.com/questions/21390570/scikit-learn-svc-coef0-parameter-range" the Sigmoid function
# does not fulfill the definition of a kernel as it is not positive semidefinite. Therefore we will not use it with Support
# Vector Machines.      

2021-04-13 17:55:01.347948




Best CV accuracy: 0.34
Test score:       0.34
Best parameters: {'svm_rbf__C': 200, 'svm_rbf__gamma': 0.35}
2021-04-13 17:55:05.791005
Radial Basis Function Kernel with Balanced class weights yields the following confusion matrix:
Predicted  0.0   2.0
True                
0.0        621  2501
1.0         22   126
2.0        232   838


### 4 Sigmoid Function

In [34]:
# function

def SVM_sig(X_train,Y_train):
    '''
    X_train: Training Set of X values
    Y_train: Training Set of Y values
    '''
    
    # Create pipeline object with standard scaler and SVC estimator
    pipe = Pipeline([('scaler', StandardScaler()), 
                     ('svm_sig', SVC(kernel='sigmoid', random_state=0, max_iter=100))])

    # Define parameter grid
    param_grid = {'svm_sig__C': [70], 
                  'svm_sig__gamma': [0.1]} 
    # Run grid search
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=10, n_jobs=-1) #cv=5 yields same accuracy
    grid = grid.fit(X_train, Y_train)
    return(grid)

In [35]:
#######Sigmoid function#######
print(datetime.datetime.now()) #computation time
sig = SVM_sig(x_train_over,y_train_over1)
print('Best CV accuracy: {:.2f}'.format(sig.best_score_))
print('Test score:       {:.2f}'.format(sig.score(X_test, y_test)))
print('Best parameters: {}'.format(sig.best_params_))
print(datetime.datetime.now())#10min


# Predict classes
y_pred = sig.predict(X_test)

# Manual confusion matrix as pandas DataFrame
confm = pd.DataFrame({'Predicted': y_pred,
                      'True': y_test})
print('Sigmoid Function Kernel yields the following confusion matrix:')
print(confm.groupby(['True','Predicted'], sort=True).size().unstack('Predicted'))


2021-04-13 18:05:31.243019




Best CV accuracy: 0.35
Test score:       0.63
Best parameters: {'svm_sig__C': 70, 'svm_sig__gamma': 0.1}
2021-04-13 18:05:39.356972
Sigmoid Function Kernel yields the following confusion matrix:
Predicted   0.0  1.0  2.0
True                     
0.0        2521   85  516
1.0         129    3   16
2.0         856   14  200
