In [4]:
# To load pandas
import pandas as pd

# To load numpy
import numpy as np
import xlsxwriter
import os.path

# To import the classifiers

from sklearn.svm import SVC

# To measure accuracy
from sklearn import metrics
from sklearn import model_selection

# To import the scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer

# To support plots
import matplotlib.pyplot as plt

# Fixed seed reused for the SVC, NumPy's RNG and the K-fold shuffle so that
# repeated runs give the same results.
seed = 520
def create_scaler_dummy():
    """Build and return a new ``DummyScaler`` instance.

    NOTE(review): ``DummyScaler`` is neither imported nor defined in the
    visible part of this file -- confirm it is provided elsewhere (for
    example in an earlier notebook cell) before selecting this factory.
    """
    scaler = DummyScaler()
    return scaler
    
def create_scaler_standard():
    """Build and return a new, unfitted ``StandardScaler``."""
    scaler = StandardScaler()
    return scaler



# Scaler factory used by the rest of the script: this binds the function
# itself, so a new scaler is obtained by calling create_scaler().
create_scaler = create_scaler_standard



def create_model_svc():
    """Build and return a new, unfitted SVC classifier.

    The classifier is seeded with the module-level ``seed`` and is created
    with ``probability=True`` so that ``predict_proba`` is available after
    fitting.
    """
    return SVC(random_state=seed, probability=True)


# Bind the model factory itself (not the result of calling it), mirroring
# `create_scaler`, so every use below builds a fresh, unfitted estimator.
create_model = create_model_svc


np.set_printoptions(precision=3)


# ---------------------------------------------------------------------------
# Load the labeled data
# ---------------------------------------------------------------------------
file = 'labeleddataIEE520BMI555FinalProject2021.xlsx'
data = pd.read_excel(file, header=0)

# The last column is the target; the first column is skipped (presumably a
# row identifier -- TODO confirm); everything in between is a feature.
vals = data.values
y = vals[:, -1]
X = vals[:, 1:-1]

# ---------------------------------------------------------------------------
# Fit on the full labeled set and evaluate on the same data (train metrics)
# ---------------------------------------------------------------------------
print('Train the model and predict')
scaler = create_scaler()
scaler.fit(X)
# Keep `X` unscaled. Overwriting it here would leak full-data statistics
# into the cross-validation folds and the hold-out split built from `X`
# further down.
X_scaled = scaler.transform(X)

model = create_model()
model.fit(X_scaled, y)
y_hat = model.predict(X_scaled)
print(y_hat)

print('Model evaluation (train)')
print('Accuracy:')
print(metrics.accuracy_score(y, y_hat))
print('Classification report:')
print(metrics.classification_report(y, y_hat))
print('Confusion matrix (train)')
print(metrics.confusion_matrix(y, y_hat))

# ---------------------------------------------------------------------------
# 5-fold cross-validation with out-of-fold predictions
# ---------------------------------------------------------------------------
print('Cross-validation')
np.random.seed(seed)
y_prob = np.zeros(y.shape)
y_hat = np.zeros(y.shape)

print(y_prob)
print(y_hat)
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=seed)

for train, test in kfold.split(X, y):
    print(test)

    # Fit the scaler on the training fold only, then apply it to both folds.
    scaler = create_scaler()
    scaler.fit(X[train])
    X_train = scaler.transform(X[train])
    X_test = scaler.transform(X[test])

    # Fresh estimator per fold so no fitted state is shared between folds.
    model = create_model()
    model.fit(X_train, y[train])
    # NOTE(review): column 1 is the probability of the second entry of
    # model.classes_ only; `y_prob` is not read again in the visible code.
    y_prob[test] = model.predict_proba(X_test)[:, 1]
    y_hat[test] = model.predict(X_test)

print(y_hat)
print('Model evaluation (CV)')
print('Accuracy:')
print(metrics.accuracy_score(y, y_hat))
print('Classification report:')
print(metrics.classification_report(y, y_hat))
print('Confusion Matrix (CV)')
print(metrics.confusion_matrix(y, y_hat))

# ---------------------------------------------------------------------------
# Hold-out split + grid search
# ---------------------------------------------------------------------------
print('Grid Search for Hyperparameters')

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=seed)
# The scaler is fitted on the training part only; it is reused unchanged
# for the hold-out part and for the unlabeled data below.
scaler = create_scaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Each grid below holds a single value, so this fits one configuration
# (evaluated with 5-fold CV) and refits it on the whole training part.
model = model_selection.GridSearchCV(
    SVC(kernel='rbf', random_state=seed, probability=True,
        class_weight='balanced'),
    cv=5,
    n_jobs=-1,
    param_grid={
        'C': [10000],
        'gamma': [0.0001],
    })

model.fit(X_train, y_train)
print('Optimal parameters:', model.best_params_)
y_test_hat = model.predict(X_test)
# NOTE(review): as above, this keeps only the second class's probability.
y_test_prob = model.predict_proba(X_test)[:, 1]

balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_test_hat)

print('Model evaluation (Optimal Hyperparameters)')
print('Accuracy:')
print(metrics.accuracy_score(y_test, y_test_hat))
print('Balanced Accuracy:')
print(balanced_accuracy)
print('Balanced Error Rate:')
print(1 - balanced_accuracy)
print('Classification report:')
print(metrics.classification_report(y_test, y_test_hat))
print('Confusion matrix (Optimal Hyperparameters)')
print(metrics.confusion_matrix(y_test, y_test_hat))

# ---------------------------------------------------------------------------
# Predict the unlabeled data and write the predictions to disk
# ---------------------------------------------------------------------------
file_unl = 'unlabeleddataIEE520BMI555FinalProject2021.xlsx'
data_unl = pd.read_excel(file_unl, header=0)
vals_unl = data_unl.values

# Skip the first column, as for the labeled data, and take exactly as many
# feature columns as the model was trained on.
n_features = X_train.shape[1]
X_unl = vals_unl[:, 1:1 + n_features]

# Only transform here: the unlabeled rows must be scaled with the statistics
# learned from the training data, not with their own.
X_unl_scaled = scaler.transform(X_unl)
y_unl_hat = model.predict(X_unl_scaled)

df_unl = pd.DataFrame({'Predicted Values': y_unl_hat})
# Written as CSV text (row index, prediction) without a header row.
df_unl.to_csv(r'BMI555IEE520Results2021GouthamHariharanKumaran', header=False)

Train the model and predict
[4. 0. 1. ... 5. 1. 4.]
Model evaluation (train)
Accuracy:
0.9415
Classification report:
              precision    recall  f1-score   support

         0.0       0.96      0.94      0.95       791
         1.0       1.00      1.00      1.00       312
         2.0       0.95      0.96      0.96       981
         3.0       0.93      0.93      0.93      2079
         4.0       0.97      0.97      0.97      1114
         5.0       0.96      0.95      0.96      1183
         6.0       0.89      0.90      0.90      1540

    accuracy                           0.94      8000
   macro avg       0.95      0.95      0.95      8000
weighted avg       0.94      0.94      0.94      8000

Confusion matrix (train)
[[ 741    0   35    0    2    2   11]
 [   0  312    0    0    0    0    0]
 [  19    0  945    0   11    1    5]
 [   0    0    0 1941    2   29  107]
 [   1    0   13    8 1077    0   15]
 [   6    0    0   19    0 1128   30]
 [   4    0    0  117   18   13 1