## Support Vector Machine

For the Support Vector Machine, this is the outline of what I will be doing:
<pre>
For each dataset:
    - read in the data and separate the labels from the data

Build the GridSearchCV by defining: regularization parameter, radial width, cv, pipeline for estimator, and params

Loop x3:
    - draw 5k random samples as training data, set aside rest for testing
    - GridSearchCV on training data to find the best hyperparameters
    - pick the best classifier, trained on all of the training data (GridSearchCV refits the best hyperparameters on the full training set once the cross-validation is done)
    - get the training and testing accuracy of this best classifier
    - record everything for the final analysis
</pre>

In [1]:
# MAKE SURE TO RUN THIS CELL BEFORE MAKING EDITS

# import libraries
import os.path
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [2]:
# load the four cleaned datasets used for the SVM experiments
adult, cover, letter_p1, letter_p2 = map(
    pd.read_csv,
    [
        '../Data/adult_clean.csv',
        '../Data/cover_clean.csv',
        '../Data/letter_p1.csv',
        '../Data/letter_p2.csv',
    ],
)

In [3]:
# split every dataset into its features (all columns except the last)
# and its labels (the last column)
adult_X, adult_Y = adult.iloc[:, :-1], adult.iloc[:, -1]
cover_X, cover_Y = cover.iloc[:, :-1], cover.iloc[:, -1]
letter_p1_X, letter_p1_Y = letter_p1.iloc[:, :-1], letter_p1.iloc[:, -1]
letter_p2_X, letter_p2_Y = letter_p2.iloc[:, :-1], letter_p2.iloc[:, -1]

In [4]:
# regularization parameter C: powers of ten from 1e-7 to 1e3.
# SVC requires C to be strictly positive, so 0 is deliberately not part of
# the grid (a C=0 candidate can never be fit and only wastes CV fits).
reg = list(np.geomspace(10**-7,10**3, 11))

# radial width values, passed to the RBF kernel as gamma
radial = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1,2]

# define the cross-validation: 5 stratified folds (no shuffling)
cv = StratifiedKFold(n_splits=5)

# build the pipeline: standardize the features, then fit the SVM.
# The scaler is inside the pipeline so it is re-fit on each CV training fold.
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', svm.SVC())])


# setting up the parameters: one sub-grid per kernel
#   linear: C only
#   poly:   polynomial degree 2 and 3, C
#           (the degree is what is searched here; the original grid put [2,3]
#            on gamma, which left the degree fixed at the SVC default)
#   rbf:    radial width (gamma), C
param = [
    {'classifier__kernel': ['linear'], 'classifier__C': reg},
    {'classifier__kernel': ['poly'], 'classifier__degree': [2,3], 'classifier__C': reg},
    {'classifier__kernel': ['rbf'], 'classifier__gamma': radial, 'classifier__C': reg}
]

# create grid search; n_jobs=-1 uses all available cores
clf = GridSearchCV(estimator=pipe, param_grid=param, cv=cv, n_jobs=-1, verbose=0)

### Run SVM on Adult dataset for 3 trials

In [None]:
%%time

# remove results file if it exists
if os.path.isfile("Results/best_adult_results.txt"):
    os.remove("Results/best_adult_results.txt")

# run for 3 trials
for trial in [1,2,3]:
    print("Running trial:",trial)
    
    # split the data into 5000 points for training and save the rest for testing
    adult_X_train, adult_X_test, adult_Y_train, adult_Y_test = train_test_split(adult_X,adult_Y,
                                                                              train_size=5000, random_state=trial)
    
    # fit the grid search on adult dataset and save the model
    adult_model = clf.fit(adult_X_train, adult_Y_train)
    joblib.dump(adult_model, 'Models/adult_model_'+str(trial)+'.pkl', compress=1)
    
    print("Fit on CV folds")
    
    # fit best parameters to training set and save the model
    best_adult_model = adult_model.fit(adult_X_train, adult_Y_train)
    joblib.dump(best_adult_model, 'Models/best_adult_model_'+str(trial)+'.pkl', compress=1)
    
    print("Fit on training set")
    
    # get the accuracy score for the testing and training data
    train_acc = accuracy_score(y_true=adult_Y_train, y_pred=best_adult_model.predict(adult_X_train))
    test_acc = accuracy_score(y_true=adult_Y_test, y_pred=best_adult_model.predict(adult_X_test))
    
    print("Predicted accuracy scores")
    
    # write the accuracy score into a file
    f = open("Results/best_adult_results.txt", "a")
    f.write('Trial '+str(trial)+":\n")
    f.write('\tAccuracy %.2f%% (average over CV test folds)\n' % (100 * best_adult_model.best_score_))
    f.write('\tBest Parameters: %s\n' % best_adult_model.best_params_)
    f.write('\tTraining Accuracy: %.2f%%\n' % (100 * train_acc))
    f.write('\tTest Accuracy: %.2f%%\n\n' % (100 * test_acc))
    f.close()

Running trial: 1


### Run SVM on Cover dataset for 3 trials

In [None]:
%%time

# remove results file if it exists
if os.path.isfile("Results/best_cover_results.txt"):
    os.remove("Results/best_cover_results.txt")

# run for 3 trials
for trial in [1,2,3]:
    print("Running trial:",trial)    # print the trial number

    # split the data into 5000 points for training and save the rest for testing
    cover_X_train, cover_X_test, cover_Y_train, cover_Y_test = train_test_split(cover_X,cover_Y,
                                                                              train_size=5000, random_state=trial)
    
    # fit the grid search on cover dataset and save the model
    cover_model = clf.fit(cover_X_train, cover_Y_train)
    joblib.dump(cover_model, 'Models/cover_model_'+str(trial)+'.pkl', compress=1)
    
    # fit best parameters to training set and save the model
    best_cover_model = cover_model.fit(cover_X_train, cover_Y_train)
    joblib.dump(best_cover_model, 'Models/best_cover_model_'+str(trial)+'.pkl', compress=1)
    
    train_acc = accuracy_score(y_true=cover_Y_train, y_pred=best_cover_model.predict(cover_X_train))
    test_acc = accuracy_score(y_true=cover_Y_test, y_pred=best_cover_model.predict(cover_X_test))
    
    f = open("Results/best_cover_results.txt", "a")
    f.write('Trial '+str(trial)+":\n")
    f.write('\tAccuracy %.2f%% (average over CV test folds)\n' % (100 * best_cover_model.best_score_))
    f.write('\tBest Parameters: %s\n' % best_cover_model.best_params_)
    f.write('\tTraining Accuracy: %.2f%%\n' % (100 * train_acc))
    f.write('\tTest Accuracy: %.2f%%\n\n' % (100 * test_acc))
    f.close()

### Run SVM on Letter_p1 dataset for 3 trials

In [None]:
%%time

# remove results file if it exists
if os.path.isfile("Results/best_letter_p1_results.txt"):
    os.remove("Results/best_letter_p1_results.txt")

# run for 3 trials
for trial in [1,2,3]:
    print("Running trial:",trial)

    # split the data into 5000 points for training and save the rest for testing   
    letter_p1_X_train, letter_p1_X_test, letter_p1_Y_train, letter_p1_Y_test = train_test_split(letter_p1_X,letter_p1_Y,
                                                                              train_size=5000, random_state=trial)
    
    # fit the grid search on letter_p1 dataset and save the model
    letter_p1_model = clf.fit(letter_p1_X_train, letter_p1_Y_train)
    joblib.dump(letter_p1_model, 'Models/letter_p1_model_'+str(trial)+'.pkl', compress=1)
    
    # fit best parameters to training set and save the model
    best_letter_p1_model = letter_p1_model.fit(letter_p1_X_train, letter_p1_Y_train)
    joblib.dump(best_letter_p1_model, 'Models/best_letter_p1_model_'+str(trial)+'.pkl', compress=1)
    
    train_acc = accuracy_score(y_true=letter_p1_Y_train, y_pred=best_letter_p1_model.predict(letter_p1_X_train))
    test_acc = accuracy_score(y_true=letter_p1_Y_test, y_pred=best_letter_p1_model.predict(letter_p1_X_test))
    
    f = open("Results/best_letter_p1_results.txt", "a")
    f.write('Trial '+str(trial)+":\n")
    f.write('\tAccuracy %.2f%% (average over CV test folds)\n' % (100 * best_letter_p1_model.best_score_))
    f.write('\tBest Parameters: %s\n' % best_letter_p1_model.best_params_)
    f.write('\tTraining Accuracy: %.2f%%\n' % (100 * train_acc))
    f.write('\tTest Accuracy: %.2f%%\n\n' % (100 * test_acc))
    f.close()

### Run SVM on Letter_p2 dataset for 3 trials

In [None]:
%%time

# remove results file if it exists
if os.path.isfile("Results/best_letter_p2_results.txt"):
    os.remove("Results/best_letter_p2_results.txt")
    
# run for 3 trials
for trial in [1,2,3]:
    print("Running trial:",trial)
    
    # split the data into 5000 points for training and save the rest for testing
    letter_p2_X_train, letter_p2_X_test, letter_p2_Y_train, letter_p2_Y_test = train_test_split(letter_p2_X,letter_p2_Y,
                                                                              train_size=5000, random_state=trial)

    # fit the grid search on letter_p2 dataset and save the model
    letter_p2_model = clf.fit(letter_p2_X_train, letter_p2_Y_train)
    joblib.dump(letter_p2_model, 'Models/letter_p2_model_'+str(trial)+'.pkl', compress=1)
    
    # fit best parameters to training set and save the model
    best_letter_p2_model = letter_p2_model.fit(letter_p2_X_train, letter_p2_Y_train)
    joblib.dump(best_letter_p2_model, 'Models/best_letter_p2_model_'+str(trial)+'.pkl', compress=1)
    
    train_acc = accuracy_score(y_true=letter_p2_Y_train, y_pred=best_letter_p2_model.predict(letter_p2_X_train))
    test_acc = accuracy_score(y_true=letter_p2_Y_test, y_pred=best_letter_p2_model.predict(letter_p2_X_test))
    
    f = open("Results/best_letter_p2_results.txt", "a")
    f.write('Trial '+str(trial)+":\n")
    f.write('\tAccuracy %.2f%% (average over CV test folds)\n' % (100 * best_letter_p2_model.best_score_))
    f.write('\tBest Parameters: %s\n' % best_letter_p2_model.best_params_)
    f.write('\tTraining Accuracy: %.2f%%\n' % (100 * train_acc))
    f.write('\tTest Accuracy: %.2f%%\n\n' % (100 * test_acc))
    f.close()