In [168]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, classification_report,f1_score,accuracy_score,precision_score,recall_score
from sklearn.model_selection import KFold, LeaveOneOut,cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from time import time
import timeit #imports timeit module
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import random
random.seed(44)

In [103]:
df = pd.read_csv('data/Cataract Data 1.3.csv', na_values='?').dropna()

pd.set_option('display.max_rows', None)

In [153]:
def standardize(x): 
    
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df1 = pd.DataFrame(x_scaled)
    return(df1)

# Logistic Regression:

#### Function to find summaries:

In [185]:
def ML_logistic_regression_summaries(Vars):
    
    #define X and y variables
    X = df.loc[:, Vars] 
    y = df.loc[:, "positive_negative"] 
    
    #set the model
    model = LogisticRegression(solver= 'liblinear', class_weight = 'balanced')

    #set the CV
    kf = LeaveOneOut()
    
    #start timer for getting elapsed time
    from time import time
    import timeit #imports timeit module
    start_time = timeit.default_timer() #defines start time so computational time can be calculated

    #define lists
    acc_score = [];
    Truth = [];
    Output = [];

    #loop though each fold (so 40 times in our case)
    for train_index , test_index in kf.split(df):
        
        #split the data
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, shuffle=True)

        #print(X_train); print(X_test); print(y_train); print(y_test)
        
        #fit the model
        model.fit(X_train,y_train)
        
        #preduct values
        pred_values = model.predict(X_test)

        #append the accuracy score
        acc = accuracy_score(pred_values, y_test)
        acc_score.append(acc)

        #add to the truth and output
        Truth.extend(y_test.values.reshape(y_test.shape[0])); ## it is a list
        Output.extend(pred_values); ## it is a list  
        

 
    #determine the time elapesed
    elapsed = timeit.default_timer() - start_time #gives total computation time

    #return the dataframe
    test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    #set output to a dataframe
    dfOutput = pd.DataFrame(data={"Model": model,
                                  "X Variable(s)": [Vars,],
                                  "Avg Accuracy": [np.mean(acc_score),],
                                  "SD of Accuracy": [np.std(acc_score),],
                                  "Sensitivity": [recall_score(Truth,Output,pos_label="negative"),],
                                  "Precision": [precision_score(Truth,Output,pos_label="negative"),],
                                  "F1-Score": [f1_score(Truth,Output,pos_label="negative"),],
                                  "Runtime": [elapsed,],
                                  "Confusion_Matrix": str(confusion_matrix(Truth,Output))}) #,"Real Specificity": [1,]
    #return output
    return dfOutput



In [186]:
ML_logistic_regression_summaries(["pdel","iso_point"])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,"LogisticRegression(class_weight='balanced', so...","[pdel, iso_point]",0.8425,0.156345,0.845833,0.886463,0.865672,0.13701,[[203 37]\n [ 26 134]]


#### Function that compares variables:

In [187]:
def logistic_compare_variables(inputList):
    outputList = []

    for i in inputList:
        outputList.append(ML_logistic_regression_summaries(i))

    outputList = pd.concat(outputList)

    return outputList

logistic_compare_variables([["pdel","iso_point"], ["pdel","deldel_G3"], ["iso_point", "deldel_G3"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,"LogisticRegression(class_weight='balanced', so...","[pdel, iso_point]",0.835,0.135185,0.813492,0.915179,0.861345,0.134897,[[205 47]\n [ 19 129]]
0,"LogisticRegression(class_weight='balanced', so...","[pdel, deldel_G3]",0.6675,0.152295,0.571984,0.864706,0.688525,0.129592,[[147 110]\n [ 23 120]]
0,"LogisticRegression(class_weight='balanced', so...","[iso_point, deldel_G3]",0.4825,0.133954,0.566667,0.569038,0.56785,0.129324,[[136 104]\n [103 57]]


# K-Nearest Neighbors:

#### Function to find summaries:

In [189]:
from sklearn import neighbors

def ML_KNN_summaries(Vars,k):
    
    #define X and y variables
    X = df.loc[:, Vars] 
    y = df.loc[:, "positive_negative"] 

    #standardize data
    X = standardize(X)

    #set the model
    model = neighbors.KNeighborsClassifier(n_neighbors = k)

    #set the CV
    kf = LeaveOneOut()
    
    #start timer for getting elapsed time
    from time import time
    import timeit #imports timeit module
    start_time = timeit.default_timer() #defines start time so computational time can be calculated

    #define lists
    acc_score = [];
    Truth = [];
    Output = [];

    #loop though each fold (so 40 times in our case)
    for train_index , test_index in kf.split(df):
        
        #split the data
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, shuffle=True)

        #print(X_train); print(X_test); print(y_train); print(y_test)
        
        #fit the model
        model.fit(X_train,y_train)
        
        #preduct values
        pred_values = model.predict(X_test)

        #append the accuracy score
        acc = accuracy_score(pred_values, y_test)
        acc_score.append(acc)

        #add to the truth and output
        Truth.extend(y_test.values.reshape(y_test.shape[0])); ## it is a list
        Output.extend(pred_values); ## it is a list  
        

 
    #determine the time elapesed
    elapsed = timeit.default_timer() - start_time #gives total computation time

    #return the dataframe
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    #set output to a dataframe
    dfOutput = pd.DataFrame(data={"Model": model,
                                  "X Variable(s)": [Vars,],
                                  "Avg Accuracy": [np.mean(acc_score),],
                                  "SD of Accuracy": [np.std(acc_score),],
                                  "Sensitivity": [recall_score(Truth,Output,pos_label="negative"),],
                                  "Precision": [precision_score(Truth,Output,pos_label="negative"),],
                                  "F1-Score": [f1_score(Truth,Output,pos_label="negative"),],
                                  "Runtime": [elapsed,],
                                  "Confusion_Matrix": str(confusion_matrix(Truth,Output))}) #,"Real Specificity": [1,]
    #return output
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    return dfOutput



In [190]:
ML_KNN_summaries(["pdel","iso_point"], k = 3)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,KNeighborsClassifier(n_neighbors=3),"[pdel, iso_point]",0.9225,0.115081,1.0,0.893103,0.943534,0.164881,[[259 0]\n [ 31 110]]


#### Function to find best K:

In [192]:
def KNN_find_best_k(inputList,_max):
    
    outputList = [pd.DataFrame(data={"X Variable(s)": [inputList,],
                                      "Avg Accuracy": 0.0,
                                      "SD of Accuracy": 0.0,
                                      "Sensitivity": 0.0,
                                      "Precision": 0.0,
                                      "F1-Score": 0.0,
                                      "Runtime": 0.0,
                                      "Confusion_Matrix": "none"})] #,"Real Specificity": [1,]
    
    for i in range(1,_max):
        
        tempList = []
        
        for j in range(10):
            tempList.append(ML_KNN_summaries(inputList,i))
            
        tempList = pd.concat(tempList, ignore_index=True)

        outputList.append(tempList.head(1))

    outputList = pd.concat(outputList, ignore_index=True)
    
    return outputList.sort_values(by = "F1-Score",ascending=False)

KNN_find_best_k(["deldel_G3","iso_point"],5)

Unnamed: 0,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix,Model
3,"[deldel_G3, iso_point]",0.935,0.079215,1.0,0.908127,0.951852,0.158639,[[257 0]\n [ 26 117]],KNeighborsClassifier(n_neighbors=3)
2,"[deldel_G3, iso_point]",0.905,0.120312,1.0,0.868966,0.929889,0.160414,[[252 0]\n [ 38 110]],KNeighborsClassifier(n_neighbors=2)
4,"[deldel_G3, iso_point]",0.8975,0.079017,1.0,0.868167,0.929432,0.162376,[[270 0]\n [ 41 89]],KNeighborsClassifier(n_neighbors=4)
1,"[deldel_G3, iso_point]",0.88,0.071414,0.916981,0.903346,0.910112,0.165283,[[243 22]\n [ 26 109]],KNeighborsClassifier(n_neighbors=1)
0,"[deldel_G3, iso_point]",0.0,0.0,0.0,0.0,0.0,0.0,none,


#### Function to compare variables:

In [193]:
def KNN_compare_variables(inputList,k):
    outputList = []

    for i in inputList:
        outputList.append(ML_KNN_summaries(i,k))

    outputList = pd.concat(outputList, ignore_index=True)

    return outputList

KNN_compare_variables([["pdel","iso_point"], ["pdel","deldel_G3"], ["iso_point", "deldel_G3"]],3)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,KNeighborsClassifier(n_neighbors=3),"[pdel, iso_point]",0.935,0.085294,1.0,0.910653,0.953237,0.162231,[[265 0]\n [ 26 109]]
1,KNeighborsClassifier(n_neighbors=3),"[pdel, deldel_G3]",0.85,0.109545,0.9,0.873134,0.886364,0.159796,[[234 26]\n [ 34 106]]
2,KNeighborsClassifier(n_neighbors=3),"[iso_point, deldel_G3]",0.93,0.08124,1.0,0.897436,0.945946,0.160455,[[245 0]\n [ 28 127]]


# LDA:

#### Function to find summaries:

In [198]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def ML_LDA_summaries(Vars):
    
    #define X and y variables
    X = df.loc[:, Vars] 
    y = df.loc[:, "positive_negative"] 

    #standardize data
    X = standardize(X)

    #set the model
    model = LinearDiscriminantAnalysis()

    #set the CV
    kf = LeaveOneOut()
    
    #start timer for getting elapsed time
    from time import time
    import timeit #imports timeit module
    start_time = timeit.default_timer() #defines start time so computational time can be calculated

    #define lists
    acc_score = [];
    Truth = [];
    Output = [];

    #loop though each fold (so 40 times in our case)
    for train_index , test_index in kf.split(df):
        
        #split the data
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, shuffle=True)

        #print(X_train); print(X_test); print(y_train); print(y_test)
        
        #fit the model
        model.fit(X_train,y_train)
        
        #preduct values
        pred_values = model.predict(X_test)

        #append the accuracy score
        acc = accuracy_score(pred_values, y_test)
        acc_score.append(acc)

        #add to the truth and output
        Truth.extend(y_test.values.reshape(y_test.shape[0])); ## it is a list
        Output.extend(pred_values); ## it is a list  
        

 
    #determine the time elapesed
    elapsed = timeit.default_timer() - start_time #gives total computation time

    #return the dataframe
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    #set output to a dataframe
    dfOutput = pd.DataFrame(data={"Model": model,
                                  "X Variable(s)": [Vars,],
                                  "Avg Accuracy": [np.mean(acc_score),],
                                  "SD of Accuracy": [np.std(acc_score),],
                                  "Sensitivity": [recall_score(Truth,Output,pos_label="negative"),],
                                  "Precision": [precision_score(Truth,Output,pos_label="negative"),],
                                  "F1-Score": [f1_score(Truth,Output,pos_label="negative"),],
                                  "Runtime": [elapsed,],
                                  "Confusion_Matrix": str(confusion_matrix(Truth,Output))}) #,"Real Specificity": [1,]
    #return output
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    return dfOutput


In [199]:
ML_LDA_summaries(["pdel","iso_point"])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,LinearDiscriminantAnalysis(),"[pdel, iso_point]",0.835,0.127574,0.921933,0.846416,0.882562,0.143173,[[248 21]\n [ 45 86]]


#### Function to compare variables:

In [196]:
def LDA_compare_variables(inputList):
    outputList = []

    for i in inputList:
        outputList.append(ML_LDA_summaries(i))

    outputList = pd.concat(outputList)

    return outputList

LDA_compare_variables([["pdel","iso_point"], ["pdel","deldel_G3"], ["iso_point", "deldel_G3"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,LinearDiscriminantAnalysis(),"[pdel, iso_point]",0.79,0.117898,0.920152,0.793443,0.852113,0.142625,[[242 21]\n [ 63 74]]
0,LinearDiscriminantAnalysis(),"[pdel, deldel_G3]",0.8675,0.09324,0.900383,0.896947,0.898662,0.139074,[[235 26]\n [ 27 112]]
0,LinearDiscriminantAnalysis(),"[iso_point, deldel_G3]",0.645,0.111692,0.940075,0.665782,0.779503,0.137585,[[251 16]\n [126 7]]


# QDA

#### Function to find summaries:

In [212]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

def ML_QDA_summaries(Vars):
    
    #define X and y variables
    X = df.loc[:, Vars] 
    y = df.loc[:, "positive_negative"] 

    #standardize data
    X = standardize(X)

    #set the model
    model = QuadraticDiscriminantAnalysis()

    #set the CV
    kf = LeaveOneOut()
    
    #start timer for getting elapsed time
    from time import time
    import timeit #imports timeit module
    start_time = timeit.default_timer() #defines start time so computational time can be calculated

    #define lists
    acc_score = [];
    Truth = [];
    Output = [];

    #loop though each fold (so 40 times in our case)
    for train_index , test_index in kf.split(df):
        
        #split the data
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, shuffle=True)

        #print(X_train); print(X_test); print(y_train); print(y_test)
        
        #fit the model
        model.fit(X_train,y_train)
        
        #preduct values
        pred_values = model.predict(X_test)

        #append the accuracy score
        acc = accuracy_score(pred_values, y_test)
        acc_score.append(acc)

        #add to the truth and output
        Truth.extend(y_test.values.reshape(y_test.shape[0])); ## it is a list
        Output.extend(pred_values); ## it is a list  
        

 
    #determine the time elapesed
    elapsed = timeit.default_timer() - start_time #gives total computation time

    #return the dataframe
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    #set output to a dataframe
    dfOutput = pd.DataFrame(data={"Model": model,
                                  "X Variable(s)": [Vars,],
                                  "Avg Accuracy": [np.mean(acc_score),],
                                  "SD of Accuracy": [np.std(acc_score),],
                                  "Sensitivity": [recall_score(Truth,Output,pos_label="negative"),],
                                  "Precision": [precision_score(Truth,Output,pos_label="negative"),],
                                  "F1-Score": [f1_score(Truth,Output,pos_label="negative"),],
                                  "Runtime": [elapsed,],
                                  "Confusion_Matrix": str(confusion_matrix(Truth,Output))}) #,"Real Specificity": [1,]
    #return output
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    return dfOutput

In [213]:
ML_QDA_summaries(["pdel","iso_point"])

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.d

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,QuadraticDiscriminantAnalysis(),"[pdel, iso_point]",0.8925,0.140334,1.0,0.856187,0.922523,0.142837,[[256 0]\n [ 43 101]]
