## SVM (2020 copy)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Load the train/test splits from disk.
# The first column of each feature table is dropped right after reading
# (presumably a saved row index -- TODO confirm against the files).
# NOTE(review): X_train comes from an Excel file under Data/ while the other
# three files are CSVs in the working directory -- confirm this is intended.
X_train = pd.read_excel("Data/X_train.xlsx").iloc[:, 1:]
X_test = pd.read_csv("X_test.csv").iloc[:, 1:]
# The label tables are kept exactly as read (no column is dropped).
Y_train = pd.read_csv("y_train.csv")
Y_test = pd.read_csv("y_test.csv")

In [None]:
def SVM_poly(X_train, Y_train, param_grid=None):
    '''
    Grid-search a polynomial-kernel SVC on standardized features.

    X_train: Training Set of X values
    Y_train: Training Set of Y values
    param_grid: Optional dict of pipeline parameters to search. Keys must use
        the 'svm_poly__' step prefix (e.g. {'svm_poly__C': [1000]}). When
        None, the default grid defined below is searched.

    Returns the fitted GridSearchCV object (5-fold cross-validation).
    '''

    # Create pipeline object with standard scaler and SVC estimator.
    # The scaler is part of the pipeline, so it is re-fitted together with the
    # SVC for every candidate/fold evaluated by the grid search.
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('svm_poly', SVC(kernel='poly', random_state=0, max_iter=100000))])

    # Fall back to the original hard-coded grid when the caller gives none.
    if param_grid is None:
        # NOTE(review): gamma=0 removes the dot-product term from the
        # polynomial kernel (gamma * <x, x'> + coef0) ** degree, and some
        # scikit-learn releases reject gamma=0 outright -- consider dropping
        # it from the default grid.
        param_grid = {'svm_poly__C': [900, 1000, 1100],
                      'svm_poly__degree': [3, 4, 5],
                      'svm_poly__gamma': [0, 0.05, 0.1],
                      'svm_poly__coef0': [0.6]}

    # Run grid search (n_jobs=-1: let joblib use all available processors)
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, Y_train)
    return grid


def SVM_rbf(X_train, Y_train, param_grid=None):
    '''
    Grid-search an RBF-kernel SVC on standardized features.

    X_train: Training Set of X values
    Y_train: Training Set of Y values(factorized)
    param_grid: Optional dict of pipeline parameters to search. Keys must use
        the 'svm_rbf__' step prefix (e.g. {'svm_rbf__C': [150]}). When None,
        the default grid defined below is searched.

    Returns the fitted GridSearchCV object (10-fold cross-validation).
    '''

    # Create pipeline object with standard scaler and SVC estimator.
    # The scaler is part of the pipeline, so it is re-fitted together with the
    # SVC for every candidate/fold evaluated by the grid search.
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('svm_rbf', SVC(kernel='rbf', random_state=0, max_iter=100000))])

    # Fall back to the original hard-coded grid when the caller gives none.
    if param_grid is None:
        param_grid = {'svm_rbf__C': [100, 150, 200],
                      'svm_rbf__gamma': [0.25, 0.3, 0.35]}

    # Run grid search (n_jobs=-1: let joblib use all available processors)
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=10, n_jobs=-1)
    grid.fit(X_train, Y_train)
    return grid


def SVM_rbf_bal(X_train, Y_train, param_grid=None):
    '''
    Grid-search an RBF-kernel SVC with balanced class weights on
    standardized features.

    X_train: Training Set of X values
    Y_train: Training Set of Y values(factorized)
    param_grid: Optional dict of pipeline parameters to search. Keys must use
        the 'svm_rbf__' step prefix (e.g. {'svm_rbf__C': [200]}). When None,
        the default grid defined below is searched.

    Returns the fitted GridSearchCV object (10-fold cross-validation).
    '''

    # Create pipeline object with standard scaler and SVC estimator.
    # class_weight='balanced' is the only difference from SVM_rbf's estimator.
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('svm_rbf', SVC(kernel='rbf', random_state=0, max_iter=100000,
                                     class_weight='balanced'))])

    # Fall back to the original hard-coded grid when the caller gives none.
    if param_grid is None:
        param_grid = {'svm_rbf__C': [100, 200, 300],
                      'svm_rbf__gamma': [0.25, 0.3, 0.35]}

    # Run grid search (original author's note: cv=5 yields same accuracy)
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=10, n_jobs=-1)
    grid.fit(X_train, Y_train)
    return grid

In [1]:
# Evaluate the three SVM variants one after another. Each section prints a
# timestamp, runs the grid search, prints its scores, prints a second
# timestamp and finally prints a confusion table for the test set.


def _report_scores(search, features, labels):
    """Print CV accuracy, test accuracy and best parameters of a fitted search."""
    print('Best CV accuracy: {:.2f}'.format(search.best_score_))
    print('Test score:       {:.2f}'.format(search.score(features, labels)))
    print('Best parameters: {}'.format(search.best_params_))


def _report_confusion(title, pairs):
    """Print `title`, then the count of every (True, Predicted) label pair."""
    print(title)
    print(pairs.groupby(['True', 'Predicted'], sort=True).size().unstack('Predicted'))


# ---------- Polynomial kernel ----------
print(datetime.datetime.now())  # start of timing
poly = SVM_poly(X_train, Y_train)
_report_scores(poly, X_test, Y_test)
print(datetime.datetime.now())  # end of timing (original note: 20min)

# Predicted classes for the test set, paired with the true labels
y_pred = poly.predict(X_test)
confm = pd.DataFrame({'Predicted': y_pred,
                      'True': Y_test})
_report_confusion('Polynomial Kernel Function yields the following confusion matrix:', confm)


# ---------- Radial basis function (rbf) kernel ----------
print(datetime.datetime.now())  # start of timing
rbf = SVM_rbf(X_train, Y_train)
_report_scores(rbf, X_test, Y_test)
print(datetime.datetime.now())  # end of timing (original note: 10min)

# Predicted classes for the test set, paired with the true labels
y_pred = rbf.predict(X_test)
confm = pd.DataFrame({'Predicted': y_pred,
                      'True': Y_test})
_report_confusion('Radial Basis Function Kernel yields the following confusion matrix:', confm)


# ---------- Radial basis function (rbf) kernel, balanced class weights ----------
print(datetime.datetime.now())  # start of timing
rbf_bal = SVM_rbf_bal(X_train, Y_train)
_report_scores(rbf_bal, X_test, Y_test)
print(datetime.datetime.now())  # end of timing (original note: 10min)

# Author's observation: the confusion matrix of the non-balanced rbf model
# shows that the smaller classes are not misclassified more often, so
# balancing the class weights should not change the outcome much -- and it
# does not.

# Predicted classes for the test set, paired with the true labels
y_pred = rbf_bal.predict(X_test)
confm = pd.DataFrame({'Predicted': y_pred,
                      'True': Y_test})
_report_confusion('Radial Basis Function Kernel with Balanced class weights yields the following confusion matrix:', confm)

# According to "https://stackoverflow.com/questions/21390570/scikit-learn-svc-coef0-parameter-range", the sigmoid
# function does not fulfill the definition of a kernel because it is not positive semidefinite. Therefore we do not
# use it with Support Vector Machines.

NameError: name 'datetime' is not defined

## main (2020)

In [None]:
# Re-fit each SVM variant on a single-point parameter grid and report its
# scores and confusion table. `param_grid` is rebound before every section.


def _show_scores(search, features, labels):
    """Print CV accuracy, test accuracy (4 decimals) and the best parameters."""
    print('Best CV accuracy: {:.4f}'.format(search.best_score_))
    print('Test score:       {:.4f}'.format(search.score(features, labels)))
    print('Best parameters: {}'.format(search.best_params_))


def _show_confusion(title, pairs):
    """Print `title`, then the count of every (True, Predicted) label pair."""
    print(title)
    print(pairs.groupby(['True', 'Predicted'], sort=True).size().unstack('Predicted'))


# ---------- Polynomial kernel ----------
param_grid = {
    'svm_poly__C': [1000],
    'svm_poly__degree': [4],
    'svm_poly__gamma': [0.05],
    'svm_poly__coef0': [0.6],
}

print(datetime.datetime.now())  # start of timing
poly = functions.SVM_poly(X_train, Y_train, param_grid)
_show_scores(poly, X_test, Y_test)
print(datetime.datetime.now())  # end of timing

# Predicted classes for the test set, paired with the true labels
y_pred2 = poly.predict(X_test)
confm = pd.DataFrame({'Predicted': y_pred2,
                      'True': Y_test})
_show_confusion('Polynomial Kernel Function yields the following confusion matrix:', confm)


# ---------- Radial basis function (rbf) kernel ----------
param_grid = {
    'svm_rbf__C': [150],
    'svm_rbf__gamma': [0.3],
}
print(datetime.datetime.now())  # start of timing
rbf = functions.SVM_rbf(X_train, Y_train, param_grid)
_show_scores(rbf, X_test, Y_test)
print(datetime.datetime.now())  # end of timing

# Predicted classes for the test set, paired with the true labels
y_pred3 = rbf.predict(X_test)
confm = pd.DataFrame({'Predicted': y_pred3,
                      'True': Y_test})
_show_confusion('Radial Basis Function Kernel yields the following confusion matrix:', confm)


# ---------- Radial basis function (rbf) kernel, balanced class weights ----------
param_grid = {
    'svm_rbf__C': [200],
    'svm_rbf__gamma': [0.3],
}
print(datetime.datetime.now())  # start of timing
rbf_bal = functions.SVM_rbf_bal(X_train, Y_train, param_grid)
_show_scores(rbf_bal, X_test, Y_test)
print(datetime.datetime.now())  # end of timing

# Predicted classes for the test set, paired with the true labels
y_pred4 = rbf_bal.predict(X_test)
confm = pd.DataFrame({'Predicted': y_pred4,
                      'True': Y_test})
_show_confusion('Radial Basis Function Kernel with Balanced class weights yields the following confusion matrix:', confm)
