# Model Comparisions

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler, minmax_scale, MaxAbsScaler, Normalizer, QuantileTransformer, PowerTransformer
from sklearn.linear_model import LogisticRegression

# Support vector machine linear classifier
from sklearn.svm import SVC
import joblib

from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import warnings
warnings.simplefilter('ignore', FutureWarning)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Define a decode function
def decode(datum):
    return np.argmax(datum)

def decodeResults(encoded_predictions):
    labeled_predictions =[]
    for i in range(encoded_predictions.shape[0]):
        decoded_datum = decode(encoded_predictions[i])
        labeled_predictions.append(decoded_datum)
    return labeled_predictions

def reshapeResults(predictions):
    return predictions.reshape(-1,1)

def convertToBinary(predictions):
    binary = []
    for prediction in predictions:
        if (prediction == 'buff'):
            binary.append(0)
        else:
            binary.append(1)
    return binary

In [3]:
def scatterPlot(index, population1, population2, label1, label2, title, ylabel):
    # Scatter Plot of Data
    plt.figure(figsize=(15,5))
    plt.subplot(1,1,1)
    plt.scatter(index, population1, marker='+', color='g', label=label1)
    plt.scatter(index, population2, marker='x', color='r', label=label2)
    plt.title(title,color='k', size=14, weight='bold')
    plt.xlabel("index")
    plt.ylabel(ylabel)
    plt.legend(loc="best")
    plt.grid(alpha=0.5)

## Load data

In [4]:
df = pd.read_csv('../ETL/cleveland_clean.csv')
df.head()

Unnamed: 0,age,sex,chestPain,trestBps,cholesterol,bloodSugar,ecg,maxHeartRate,exerciseInducedAngina,oldPeak,slope,vesselsColored,thal,buff
0,63.0,male,angina,145.0,233.0,true,hyp,150.0,fal,2.3,down,0.0,fix,buff
1,67.0,male,asympt,160.0,286.0,fal,hyp,108.0,true,1.5,flat,3.0,norm,sick
2,67.0,male,asympt,120.0,229.0,fal,hyp,129.0,true,2.6,flat,2.0,rev,sick
3,37.0,male,notang,130.0,250.0,fal,norm,187.0,fal,3.5,down,0.0,norm,buff
4,41.0,fem,abnang,130.0,204.0,fal,hyp,172.0,fal,1.4,up,0.0,norm,buff


In [5]:
X = df.drop('buff', axis =1)
y = df['buff']
X['vesselsColored']=X['vesselsColored'].astype(str)

In [6]:
X=pd.get_dummies(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
# Apply the MinMax Scaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Label encode the dependent variable, then Hot encode it
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
y_train_one_hot = to_categorical(y_train_encoded)
y_test_one_hot = to_categorical(y_test_encoded)

# Import Machine Learning Models

In [10]:
Results =[]
# Random Forests
RF = joblib.load('tree.sav')
RF_score = RF.score(X_test, y_test)

print(f" score {RF_score}")


 score 0.972972972972973


In [11]:
# K Nearest Neighbors
KNN = joblib.load('KNN.sav')
KNN_score = KNN.best_score_
print(f" score {KNN_score}")

 score 0.8783783783783784


In [12]:
# Support Vector Machines
SVM = joblib.load('SVM.sav')
SVM_score = SVM.score(X_test_scaled, y_test_encoded)
print(f" score {SVM_score}")

 score 0.8648648648648649


In [13]:
# Logistic Regression
LR = joblib.load('LR.sav')
LR_score = LR.score(X_test_scaled, y_test_encoded)
print(f" score {LR_score}")

 score 0.8513513513513513


In [14]:
# Neural Network
NN = load_model("NeuralNetwork.h5")
NN_loss, NN_score = NN.evaluate(X_test_scaled, y_test_one_hot)
print(f' score {NN_score}')

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
 score 0.8783783912658691


# Run Test Data Through Each Model

In [15]:
# Random Forests
RF_predict = RF.predict(X_test_scaled)

In [16]:
# K Nearest Neighbor
KNN_predict = KNN.predict(X_test_scaled)

In [17]:
# Support Vector Machines
SVM_predict = SVM.predict(X_test_scaled)

In [18]:
# Logistic Regression
LR_pred = LR.predict(X_test_scaled)

# Convert string result to 0, 1 
LR_predict=[]
for i in range(len(LR_pred)):
    if (LR_pred[i] == 'buff'):
        LR_prd= 0
    else:
        LR_prd=1
    LR_predict.append(LR_prd)
    

In [19]:
# Neural Network
NN_predict = NN.predict(X_test_scaled)

# Zip all Predictions into a Single Data Frame

In [20]:
# First we need to ensure the data is in the correct format
# NOTE:  SVM_predict, LR_predict and y_test_encoded

# y_test only requires label, no index 
y_test_label = y_test

# Need to convert from buff and sick to 1 and 0
RF_prediction = convertToBinary(RF_predict)

# Need to decode results
KNN_prediction = decodeResults(KNN_predict)
NN_prediction = decodeResults(KNN_predict)


In [21]:
results = pd.DataFrame(list(zip(y_test_label, y_test_encoded, SVM_predict, LR_predict, RF_prediction, KNN_prediction, NN_prediction)), 
                       columns = ['Label','Actual','SVM','LR', 'RF', 'KNN', 'NN'])
results

Unnamed: 0,Label,Actual,SVM,LR,RF,KNN,NN
0,buff,0,0,1,0,0,0
1,buff,0,1,1,1,1,1
2,buff,0,0,1,0,1,1
3,sick,1,0,1,0,0,0
4,sick,1,1,1,1,1,1
...,...,...,...,...,...,...,...
69,buff,0,0,1,0,0,0
70,sick,1,1,1,1,1,1
71,sick,1,1,1,1,1,1
72,sick,1,1,1,0,1,1


# Add a Column to Vote for the Final Prediction

In [22]:
results['Vote Sum'] = (results['RF'] + results['KNN'] + results['SVM'] + results['LR'] + results['NN'])
results['Vote Majority'] = np.where(results['Vote Sum'] >= 3, 1, 0)
results['Vote Conservative'] = np.where(results['Vote Sum'] > 0, 1, 0)
results['SVM Result'] = results['Actual'] - results['SVM']
results['LR Result'] = results['Actual'] - results['LR']
results['RF Result'] = results['Actual'] - results['RF']
results['KNN Result'] = results['Actual'] - results['KNN']
results['NN Result'] = results['Actual'] - results['NN']
results['Majority Result'] = results['Actual'] - results['Vote Majority']
results['Conservative Result'] = results['Actual'] - results['Vote Conservative']
results

Unnamed: 0,Label,Actual,SVM,LR,RF,KNN,NN,Vote Sum,Vote Majority,Vote Conservative,SVM Result,LR Result,RF Result,KNN Result,NN Result,Majority Result,Conservative Result
0,buff,0,0,1,0,0,0,1,0,1,0,-1,0,0,0,0,-1
1,buff,0,1,1,1,1,1,5,1,1,-1,-1,-1,-1,-1,-1,-1
2,buff,0,0,1,0,1,1,3,1,1,0,-1,0,-1,-1,-1,-1
3,sick,1,0,1,0,0,0,1,0,1,1,0,1,1,1,1,0
4,sick,1,1,1,1,1,1,5,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,buff,0,0,1,0,0,0,1,0,1,0,-1,0,0,0,0,-1
70,sick,1,1,1,1,1,1,5,1,1,0,0,0,0,0,0,0
71,sick,1,1,1,1,1,1,5,1,1,0,0,0,0,0,0,0
72,sick,1,1,1,0,1,1,4,1,1,0,0,1,0,0,0,0


# Determine Whether a Consensus is Better than a Single Model

In [23]:
results.sum()

Label                  buffbuffbuffsicksickbuffsicksicksicksickbuffsi...
Actual                                                                37
SVM                                                                   33
LR                                                                    74
RF                                                                    31
KNN                                                                   32
NN                                                                    32
Vote Sum                                                             202
Vote Majority                                                         32
Vote Conservative                                                     74
SVM Result                                                             4
LR Result                                                            -37
RF Result                                                              6
KNN Result                                         

In [24]:
svm_final = results['SVM Result'].value_counts()
lr_final = results['LR Result'].value_counts()
rf_final = results['RF Result'].value_counts()
knn_final = results['KNN Result'].value_counts()
nn_final = results['NN Result'].value_counts()
majority_final = results['Majority Result'].value_counts()
conservative_final = results['Conservative Result'].value_counts()
labels = ['Correct', 'False Positive (Incorrectly Found Sick)', 'False Negative (Incorrectly Found Healthy)']
summary = pd.DataFrame(list(zip(labels, svm_final, lr_final, rf_final, knn_final, 
                                nn_final, majority_final, conservative_final)), 
                                columns = ['Result','SVM', 'LR', 'RF', 'KNN', 'NN', 'Majority Vote', 'Conservative Votes'])
summary = summary.set_index('Result')

summary.loc['Model Scores'] = [SVM_score, LR_score, RF_score, KNN_score, NN_score, 'N/A', 'N/A']
summary

Unnamed: 0_level_0,SVM,LR,RF,KNN,NN,Majority Vote,Conservative Votes
Result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Correct,64.0,37.0,64.0,63.0,63.0,63.0,37.0
False Positive (Incorrectly Found Sick),7.0,37.0,8.0,8.0,8.0,8.0,37.0
Model Scores,0.864865,0.851351,0.972973,0.878378,0.878378,,
