## Support Vector Machines

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [17]:
# get train and test sets (this will not be necessary in the final code)
# The first column of every CSV is dropped on load
# (presumably a saved row index -- TODO confirm).

csv_paths = ("Data/X_train.csv", "Data/Y_train.csv", "Data/X_test.csv", "Data/Y_test.csv")
X_train, Y_train, X_test, Y_test = [pd.read_csv(path).iloc[:, 1:] for path in csv_paths]

X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((17506, 93), (17506, 1), (4340, 93), (4340, 1))

In [30]:
# Relative class frequencies of the target over train and test combined
Y = pd.concat((Y_train, Y_test))
Y.value_counts(normalize=True)

strategy
2.0         0.516250
0.0         0.435183
1.0         0.048567
dtype: float64

### 1 Polynomial Kernel Function

In [19]:
# function

def SVM_poly(X_train, Y_train, param_grid=None):
    '''
    Fit a Support Vector Machine with a polynomial kernel using
    grid-search cross-validation (5 folds).

    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values; a one-column DataFrame is
                    flattened to a 1-D array before fitting
    :param param_grid: Optional grid of parameters to optimize over; when
                       omitted the single-point grid below is used
    :return: Fitted GridSearchCV object
    '''

    # Create pipeline object with standard scaler and SVC estimator
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('svm_poly', SVC(kernel='poly', random_state=0, max_iter=10000))])

    # Define parameter grid (only when the caller did not supply one)
    if param_grid is None:
        param_grid = {'svm_poly__C': [10],
                      'svm_poly__degree': [20],
                      'svm_poly__gamma': [0.01],
                      'svm_poly__coef0': [0.1]}

    # Run grid search; the target is passed as a 1-D array because the
    # estimator expects shape (n_samples,), not a column vector
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid = grid.fit(X_train, np.ravel(Y_train))
    return grid

In [20]:
# output

# The two timestamps bracket the fit so the computation time can be read off
print(datetime.datetime.now())
poly = SVM_poly(X_train, Y_train)

cv_accuracy = poly.best_score_
print('Best CV accuracy: {:.2f}'.format(cv_accuracy))
test_accuracy = poly.score(X_test, Y_test)
print('Test score:       {:.2f}'.format(test_accuracy))
print('Best parameters: {}'.format(poly.best_params_))
print(datetime.datetime.now())

# Class predictions for the test set
y_pred = poly.predict(X_test)


# # Manual confusion matrix as pandas DataFrame (left disabled)
# confm = pd.DataFrame({'Predicted': y_pred,
#                       'True': y_test})
# print('Polynomial Kernel Function yields the following confusion matrix:')
# print(confm.groupby(['True','Predicted'], sort=True).size().unstack('Predicted')) 

2021-04-12 22:20:23.347280


  return f(**kwargs)


Best CV accuracy: 0.29
Test score:       0.69
Best parameters: {'svm_poly__C': 10, 'svm_poly__coef0': 0.1, 'svm_poly__degree': 20, 'svm_poly__gamma': 0.01}
2021-04-12 22:22:32.356191


### 2 Radial Basis Function

In [62]:
# function

def SVM_rbf(X_train, Y_train, param_grid=None):
    '''
    Fit a Support Vector Machine with a Radial Basis Function (rbf) kernel
    using grid-search cross-validation (5 folds).

    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values; a one-column DataFrame is
                    flattened to a 1-D array before fitting
    :param param_grid: Optional grid of parameters to optimize over; when
                       omitted the default grid below is used
    :return: Fitted GridSearchCV object
    '''

    # Create pipeline object with standard scaler and SVC estimator
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('svm_rbf', SVC(kernel='rbf', random_state=0, max_iter=10000))])

    # Define parameter grid (only when the caller did not supply one)
    if param_grid is None:
        param_grid = {'svm_rbf__C': [255, 260],
                      'svm_rbf__gamma': [0.45]}

    # Run grid search; the target is passed as a 1-D array because the
    # estimator expects shape (n_samples,), not a column vector
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid = grid.fit(X_train, np.ravel(Y_train))
    return grid

In [18]:
#######Radial Basis Kernel Function(rbf)#######
# The two timestamps bracket the fit so the computation time can be read off
print(datetime.datetime.now())
rbf = SVM_rbf(X_train, Y_train)

cv_accuracy = rbf.best_score_
print('Best CV accuracy: {:.2f}'.format(cv_accuracy))
test_accuracy = rbf.score(X_test, Y_test)
print('Test score:       {:.2f}'.format(test_accuracy))
print('Best parameters: {}'.format(rbf.best_params_))
print(datetime.datetime.now()) #10min

# Class predictions for the test set
y_pred = rbf.predict(X_test)

# # Manual confusion matrix as pandas DataFrame (left disabled)
# confm = pd.DataFrame({'Predicted': y_pred,
#                       'True': Y_test})
# print('Radial Basis Function Kernel yields the following confusion matrix:')
# print(confm.groupby(['True','Predicted'], sort=True).size().unstack('Predicted'))

2021-04-12 17:15:29.579859


  return f(**kwargs)


Best CV accuracy: 0.58
Test score:       0.25
Best parameters: {'svm_rbf__C': 260, 'svm_rbf__gamma': 0.45}
2021-04-12 17:17:05.661179


### 3 Radial Basis Function (balanced)

In [52]:
# function

def SVM_rbf_bal(X_train, Y_train, param_grid=None):
    '''
    Fit a Support Vector Machine with a Radial Basis Function (rbf) kernel
    and balanced class weights using grid-search cross-validation (5 folds).

    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values; a one-column DataFrame is
                    flattened to a 1-D array before fitting
    :param param_grid: Optional grid of parameters to optimize over; when
                       omitted the default grid below is used
    :return: Fitted GridSearchCV object
    '''

    # Create pipeline object with standard scaler and SVC estimator
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('svm_rbf', SVC(kernel='rbf', random_state=0, max_iter=1000, class_weight='balanced'))])

    # Define parameter grid (only when the caller did not supply one)
    if param_grid is None:
        param_grid = {'svm_rbf__C': [100, 200, 300],
                      'svm_rbf__gamma': [0.25, 0.3, 0.35]}

    # Run grid search; the target is passed as a 1-D array because the
    # estimator expects shape (n_samples,), not a column vector
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid = grid.fit(X_train, np.ravel(Y_train))
    return grid

In [53]:
#######Radial Basis Kernel Function(rbf) with Balanced class weights#######
# The two timestamps bracket the fit so the computation time can be read off
print(datetime.datetime.now())
rbf_bal = SVM_rbf_bal(X_train, Y_train)

cv_accuracy = rbf_bal.best_score_
print('Best CV accuracy: {:.2f}'.format(cv_accuracy))
test_accuracy = rbf_bal.score(X_test, Y_test)
print('Test score:       {:.2f}'.format(test_accuracy))
print('Best parameters: {}'.format(rbf_bal.best_params_))
print(datetime.datetime.now())#10min

# Looking at the confusion matrix of the non-balanced rbf model, the smaller classes are not
# misclassified more often. Balancing the class weights should therefore not influence the
# outcome greatly, which it doesn't.


# Class predictions for the test set
y_pred = rbf_bal.predict(X_test)

# # Manual confusion matrix as pandas DataFrame (left disabled)
# confm = pd.DataFrame({'Predicted': y_pred,
#                       'True': Y_test})
# print('Radial Basis Function Kernel with Balanced class weights yields the following confusion matrix:')
# print(confm.groupby(['True','Predicted'], sort=True).size().unstack('Predicted'))

# According to "https://stackoverflow.com/questions/21390570/scikit-learn-svc-coef0-parameter-range"
# the Sigmoid function does not fulfill the definition of a kernel as it is not positive
# semidefinite. Therefore we will not use it with Support Vector Machines.

2021-04-12 15:44:12.645225


  return f(**kwargs)


Best CV accuracy: 0.42
Test score:       0.72
Best parameters: {'svm_rbf__C': 300, 'svm_rbf__gamma': 0.25}
2021-04-12 15:47:23.795244


### 4 Sigmoid Function

In [58]:
# function

def SVM_sig(X_train, Y_train, param_grid=None):
    '''
    Fit a Support Vector Machine with a sigmoid kernel using
    grid-search cross-validation (10 folds).

    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values; a one-column DataFrame is
                    flattened to a 1-D array before fitting
    :param param_grid: Optional grid of parameters to optimize over; when
                       omitted the default grid below is used
    :return: Fitted GridSearchCV object
    '''

    # Create pipeline object with standard scaler and SVC estimator
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('svm_sig', SVC(kernel='sigmoid', random_state=0, max_iter=1000))])

    # Define parameter grid (only when the caller did not supply one)
    if param_grid is None:
        param_grid = {'svm_sig__C': [100, 200, 300],
                      'svm_sig__gamma': [0.25, 0.3, 0.35]}

    # Run grid search; the target is passed as a 1-D array because the
    # estimator expects shape (n_samples,), not a column vector
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=10, n_jobs=-1) #cv=5 yields same accuracy
    grid = grid.fit(X_train, np.ravel(Y_train))
    return grid

In [59]:
#######Sigmoid function#######
# The two timestamps bracket the fit so the computation time can be read off
print(datetime.datetime.now())
sig = SVM_sig(X_train, Y_train)

cv_accuracy = sig.best_score_
print('Best CV accuracy: {:.2f}'.format(cv_accuracy))
test_accuracy = sig.score(X_test, Y_test)
print('Test score:       {:.2f}'.format(test_accuracy))
print('Best parameters: {}'.format(sig.best_params_))
print(datetime.datetime.now())#10min


# Class predictions for the test set
y_pred = sig.predict(X_test)

2021-04-12 15:56:58.335954


  return f(**kwargs)


Best CV accuracy: 0.54
Test score:       0.34
Best parameters: {'svm_sig__C': 200, 'svm_sig__gamma': 0.25}
2021-04-12 16:00:58.453549


## main (2020)

In [None]:
def SVM_poly(X_train,Y_train,param_grid):
    '''
    This function uses Support Vector Machines on X_train and Y_train with a Polynomial Kernel Function.
    It uses Grid-Crossvalidation to find the best hyperparameters for the dataset
    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values(factorized); a one-column DataFrame
                    is flattened to a 1-D array before fitting
    :param param_grid: Grid of parameters to optimize over
    :return: Cross-Validated optimal hyperparameter model fit
    '''

    # Create pipeline object with standard scaler and SVC estimator
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('svm_poly', SVC(kernel='poly', random_state=0, max_iter=100000))])

    # Run grid search; the target is passed as a 1-D array because the
    # estimator expects shape (n_samples,), not a column vector
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid = grid.fit(X_train, np.ravel(Y_train))
    return grid

In [None]:
def SVM_rbf(X_train,Y_train,param_grid):
    '''
    This function uses Support Vector Machines on X_train and Y_train with a Radial Basis Kernel Function(rbf).
    It uses Grid-Crossvalidation to find the best hyperparameters for the dataset
    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values(factorized); a one-column DataFrame
                    is flattened to a 1-D array before fitting
    :param param_grid: Grid of parameters to optimize over
    :return: Cross-Validated optimal hyperparameter model fit
    '''

    # Create pipeline object with standard scaler and SVC estimator
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('svm_rbf', SVC(kernel='rbf', random_state=0, max_iter=100000))])

    # Run grid search; the target is passed as a 1-D array because the
    # estimator expects shape (n_samples,), not a column vector
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=10, n_jobs=-1) #cv=5 yields same accuracy
    grid = grid.fit(X_train, np.ravel(Y_train))
    return grid

In [None]:
def SVM_rbf_bal(X_train,Y_train,param_grid):
    '''
    This function uses Support Vector Machines on X_train and Y_train with a Radial Basis Kernel Function(rbf).
    It uses Grid-Crossvalidation to find the best hyperparameters for the dataset where we use balanced class weights.
    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values(factorized); a one-column DataFrame
                    is flattened to a 1-D array before fitting
    :param param_grid: Grid of parameters to optimize over
    :return: Cross-Validated optimal hyperparameter model fit
    '''

    # Create pipeline object with standard scaler and SVC estimator
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('svm_rbf', SVC(kernel='rbf', random_state=0, max_iter=100000, class_weight='balanced'))])

    # Run grid search; the target is passed as a 1-D array because the
    # estimator expects shape (n_samples,), not a column vector
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=10, n_jobs=-1) #cv=5 yields same accuracy
    grid = grid.fit(X_train, np.ravel(Y_train))
    return grid

In [None]:
def _fit_and_report(fit_fn, param_grid, label, X_train, Y_train, X_test, Y_test):
    '''
    Fit one SVM grid search, print its scores and confusion matrix, and return the results.

    :param fit_fn: Fitting function, called as fit_fn(X_train, Y_train, param_grid)
    :param param_grid: Grid of parameters to optimize over
    :param label: Kernel description used in the confusion-matrix headline
    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values
    :param X_test: Test Set of X values
    :param Y_test: Test Set of Y values; flattened to 1-D for the confusion matrix
    :return: Tuple of (fitted grid search, predicted test classes,
             DataFrame with one 'Predicted'/'True' row per test sample)
    '''
    print(datetime.datetime.now()) #computation time
    model = fit_fn(X_train, Y_train, param_grid)
    print('Best CV accuracy: {:.4f}'.format(model.best_score_))
    print('Test score:       {:.4f}'.format(model.score(X_test, Y_test)))
    print('Best parameters: {}'.format(model.best_params_))
    print(datetime.datetime.now())

    # Predict classes
    y_pred = model.predict(X_test)

    # Manual confusion matrix as pandas DataFrame.
    # Y_test is flattened because each column value must be 1-D; a one-column
    # DataFrame is not a valid column for the DataFrame constructor.
    confm = pd.DataFrame({'Predicted': y_pred,
                          'True': np.ravel(Y_test)})
    print('{} yields the following confusion matrix:'.format(label))
    print(confm.groupby(['True','Predicted'], sort=True).size().unstack('Predicted'))
    return model, y_pred, confm


# The SVM_* functions are the ones defined in the cells above; no `functions`
# module is imported in this notebook, so they are called directly.

#######Polynomial Kernel Function#######
param_grid = {'svm_poly__C': [1000],
              'svm_poly__degree': [4],
              'svm_poly__gamma': [0.05],
              'svm_poly__coef0':[0.6]}
poly, y_pred2, confm = _fit_and_report(
    SVM_poly, param_grid, 'Polynomial Kernel Function',
    X_train, Y_train, X_test, Y_test)


#######Radial Basis Kernel Function(rbf)#######
param_grid = {'svm_rbf__C': [150],
              'svm_rbf__gamma': [0.3]}
rbf, y_pred3, confm = _fit_and_report(
    SVM_rbf, param_grid, 'Radial Basis Function Kernel',
    X_train, Y_train, X_test, Y_test)


#######Radial Basis Kernel Function(rbf) with Balanced class weights#######
param_grid = {'svm_rbf__C': [200],
              'svm_rbf__gamma': [0.3]}
rbf_bal, y_pred4, confm = _fit_and_report(
    SVM_rbf_bal, param_grid, 'Radial Basis Function Kernel with Balanced class weights',
    X_train, Y_train, X_test, Y_test)
