In [2]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, classification_report,f1_score,accuracy_score,precision_score,recall_score
from sklearn.model_selection import KFold, LeaveOneOut,cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from time import time
import timeit #imports timeit module
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import random
random.seed(44)

In [3]:
df = pd.read_csv('Cataract Data 1.3.csv', na_values='?').dropna()

pd.set_option('display.max_rows', None)

In [4]:
def standardize(x): 
    
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df1 = pd.DataFrame(x_scaled)
    return(df1)

In [47]:
df
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["ogAA_letterCode"] = le.fit_transform(df.ogAA_letter)
df["mutAA_letterCode"] = le.fit_transform(df.mutAA_letter)
df["ogAA_charCode"] = le.fit_transform(df.ogAA_char)
df["mutAA_charCode"] = le.fit_transform(df.mutAA_char)

# Logistic Regression:

#### Function to find summaries:

In [17]:
def ML_logistic_regression_summaries(Vars):
    
    #define X and y variables
    X = df.loc[:, Vars] 
    y = df.loc[:, "positive_negative"] 
    
    #set the model
    model = LogisticRegression(solver= 'liblinear', class_weight = 'balanced')

    #set the CV
    kf = LeaveOneOut()
    
    #start timer for getting elapsed time
    from time import time
    import timeit #imports timeit module
    start_time = timeit.default_timer() #defines start time so computational time can be calculated

    #define lists
    acc_score = [];
    Truth = [];
    Output = [];

    #loop though each fold (so 40 times in our case)
    for train_index , test_index in kf.split(df):
        
        #split the data
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, shuffle=True)

        #print(X_train); print(X_test); print(y_train); print(y_test)
        
        #fit the model
        model.fit(X_train,y_train)
        
        #preduct values
        pred_values = model.predict(X_test)

        #append the accuracy score
        acc = accuracy_score(pred_values, y_test)
        acc_score.append(acc)

        #add to the truth and output
        Truth.extend(y_test.values.reshape(y_test.shape[0])); ## it is a list
        Output.extend(pred_values); ## it is a list  
        

 
    #determine the time elapesed
    elapsed = timeit.default_timer() - start_time #gives total computation time

    #return the dataframe
    test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    #set output to a dataframe
    dfOutput = pd.DataFrame(data={"Model": model,
                                  "X Variable(s)": [Vars,],
                                  "Avg Accuracy": [np.mean(acc_score),],
                                  "SD of Accuracy": [np.std(acc_score),],
                                  "Sensitivity": [recall_score(Truth,Output,pos_label="negative"),],
                                  "Precision": [precision_score(Truth,Output,pos_label="negative"),],
                                  "F1-Score": [f1_score(Truth,Output,pos_label="negative"),],
                                  "Runtime": [elapsed,],
                                  "Confusion_Matrix": str(confusion_matrix(Truth,Output))}) #,"Real Specificity": [1,]
    #return output
    return dfOutput



In [18]:
ML_logistic_regression_summaries(["pdel","iso_point"])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,"LogisticRegression(class_weight='balanced', so...","[pdel, iso_point]",0.84,0.128062,0.856589,0.891129,0.873518,0.116159,[[221 37]\n [ 27 115]]


#### Function that compares variables:

In [19]:
def logistic_compare_variables(inputList):
    outputList = []

    for i in inputList:
        outputList.append(ML_logistic_regression_summaries(i))

    outputList = pd.concat(outputList)

    return outputList

logistic_compare_variables([["pdel","iso_point"], ["pdel","deldel_G3"], ["iso_point", "deldel_G3"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,"LogisticRegression(class_weight='balanced', so...","[pdel, iso_point]",0.82,0.107703,0.817073,0.881579,0.848101,0.139682,[[201 45]\n [ 27 127]]
0,"LogisticRegression(class_weight='balanced', so...","[pdel, deldel_G3]",0.645,0.151575,0.529183,0.866242,0.657005,0.115049,[[136 121]\n [ 21 122]]
0,"LogisticRegression(class_weight='balanced', so...","[iso_point, deldel_G3]",0.495,0.151575,0.551181,0.614035,0.580913,0.114198,[[140 114]\n [ 88 58]]


# K-Nearest Neighbors:

#### Function to find summaries:

In [20]:
from sklearn import neighbors

def ML_KNN_summaries(Vars,k):
    
    #define X and y variables
    X = df.loc[:, Vars] 
    y = df.loc[:, "positive_negative"] 

    #standardize data
    X = standardize(X)

    #set the model
    model = neighbors.KNeighborsClassifier(n_neighbors = k)

    #set the CV
    kf = LeaveOneOut()
    
    #start timer for getting elapsed time
    from time import time
    import timeit #imports timeit module
    start_time = timeit.default_timer() #defines start time so computational time can be calculated

    #define lists
    acc_score = [];
    Truth = [];
    Output = [];

    #loop though each fold (so 40 times in our case)
    for train_index , test_index in kf.split(df):
        
        #split the data
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, shuffle=True)

        #print(X_train); print(X_test); print(y_train); print(y_test)
        
        #fit the model
        model.fit(X_train,y_train)
        
        #preduct values
        pred_values = model.predict(X_test)

        #append the accuracy score
        acc = accuracy_score(pred_values, y_test)
        acc_score.append(acc)

        #add to the truth and output
        Truth.extend(y_test.values.reshape(y_test.shape[0])); ## it is a list
        Output.extend(pred_values); ## it is a list  
        

 
    #determine the time elapesed
    elapsed = timeit.default_timer() - start_time #gives total computation time

    #return the dataframe
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    #set output to a dataframe
    dfOutput = pd.DataFrame(data={"Model": model,
                                  "X Variable(s)": [Vars,],
                                  "Avg Accuracy": [np.mean(acc_score),],
                                  "SD of Accuracy": [np.std(acc_score),],
                                  "Sensitivity": [recall_score(Truth,Output,pos_label="negative"),],
                                  "Precision": [precision_score(Truth,Output,pos_label="negative"),],
                                  "F1-Score": [f1_score(Truth,Output,pos_label="negative"),],
                                  "Runtime": [elapsed,],
                                  "Confusion_Matrix": str(confusion_matrix(Truth,Output))}) #,"Real Specificity": [1,]
    #return output
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    return dfOutput



In [21]:
ML_KNN_summaries(["pdel","iso_point"], k = 3)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,KNeighborsClassifier(n_neighbors=3),"[pdel, iso_point]",0.96,0.06245,1.0,0.942238,0.97026,0.172849,[[261 0]\n [ 16 123]]


In [42]:
ML_KNN_summaries(["pdel","iso_point"], k = 3)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,KNeighborsClassifier(n_neighbors=3),"[pdel, iso_point]",0.9725,0.049937,1.0,0.959854,0.979516,0.253619,[[263 0]\n [ 11 126]]


In [22]:
ML_KNN_summaries(["deldel_G3","iso_point"], k = 3)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,KNeighborsClassifier(n_neighbors=3),"[deldel_G3, iso_point]",0.9525,0.063196,1.0,0.931655,0.964618,0.179097,[[259 0]\n [ 19 122]]


In [51]:
ML_KNN_summaries(["solv_area", "ogAA_letterCode"], k = 3)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,KNeighborsClassifier(n_neighbors=3),"[solv_area, ogAA_letterCode]",0.6925,0.123263,0.819608,0.730769,0.772643,0.353957,[[209 46]\n [ 77 68]]


In [52]:
ML_KNN_summaries(["iso_point", "ogAA_letterCode"], k = 3)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,KNeighborsClassifier(n_neighbors=3),"[iso_point, ogAA_letterCode]",0.9575,0.062799,1.0,0.938628,0.968343,0.15466,[[260 0]\n [ 17 123]]


In [54]:
ML_KNN_summaries(["pdel", "ogAA_letterCode"], k = 3)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,KNeighborsClassifier(n_neighbors=3),"[pdel, ogAA_letterCode]",0.865,0.110793,0.918288,0.877323,0.897338,0.134503,[[236 21]\n [ 33 110]]


#### Function to find best K:

In [40]:
def KNN_find_best_k(inputList,_max):
    
    outputList = [pd.DataFrame(data={"X Variable(s)": [inputList,],
                                      "Avg Accuracy": 0.0,
                                      "SD of Accuracy": 0.0,
                                      "Sensitivity": 0.0,
                                      "Precision": 0.0,
                                      "F1-Score": 0.0,
                                      "Runtime": 0.0,
                                      "Confusion_Matrix": "none"})] #,"Real Specificity": [1,]
    
    for i in range(1,_max):
        
        tempList = []
        
        for j in range(10):
            tempList.append(ML_KNN_summaries(inputList,i))
            
        tempList = pd.concat(tempList, ignore_index=True)

        outputList.append(tempList.head(1))

    outputList = pd.concat(outputList, ignore_index=True)
    
    return outputList.sort_values(by = "F1-Score",ascending=False)

KNN_find_best_k(["deldel_G3","iso_point"],6)

Unnamed: 0,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix,Model
3,"[deldel_G3, iso_point]",0.9425,0.058683,1.0,0.917857,0.957169,0.215254,[[257 0]\n [ 23 120]],KNeighborsClassifier(n_neighbors=3)
5,"[deldel_G3, iso_point]",0.9175,0.080273,1.0,0.888889,0.941176,0.257126,[[264 0]\n [ 33 103]],KNeighborsClassifier()
4,"[deldel_G3, iso_point]",0.91,0.09434,1.0,0.883495,0.938144,0.249571,[[273 0]\n [ 36 91]],KNeighborsClassifier(n_neighbors=4)
2,"[deldel_G3, iso_point]",0.9125,0.078062,1.0,0.878049,0.935065,0.191945,[[252 0]\n [ 35 113]],KNeighborsClassifier(n_neighbors=2)
1,"[deldel_G3, iso_point]",0.8775,0.093508,0.934066,0.891608,0.912343,0.173488,[[255 18]\n [ 31 96]],KNeighborsClassifier(n_neighbors=1)
0,"[deldel_G3, iso_point]",0.0,0.0,0.0,0.0,0.0,0.0,none,


#### Function to compare variables:

In [55]:
def KNN_compare_variables(inputList,k):
    outputList = []

    for i in inputList:
        outputList.append(ML_KNN_summaries(i,k))

    outputList = pd.concat(outputList, ignore_index=True)

    return outputList

KNN_compare_variables([["pdel","iso_point"], ["pdel","deldel_G3"], ["iso_point", "deldel_G3"], ["iso_point", "pdel"]],3)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,KNeighborsClassifier(n_neighbors=3),"[pdel, iso_point]",0.925,0.11565,1.0,0.892086,0.942966,0.160094,[[248 0]\n [ 30 122]]
1,KNeighborsClassifier(n_neighbors=3),"[pdel, deldel_G3]",0.8525,0.107209,0.887597,0.88417,0.88588,0.14951,[[229 29]\n [ 30 112]]
2,KNeighborsClassifier(n_neighbors=3),"[iso_point, deldel_G3]",0.9225,0.075788,1.0,0.887273,0.94027,0.164008,[[244 0]\n [ 31 125]]
3,KNeighborsClassifier(n_neighbors=3),"[iso_point, pdel]",0.935,0.08231,1.0,0.909408,0.952555,0.146051,[[261 0]\n [ 26 113]]


# LDA:

#### Function to find summaries:

In [16]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def ML_LDA_summaries(Vars):
    
    #define X and y variables
    X = df.loc[:, Vars] 
    y = df.loc[:, "positive_negative"] 

    #standardize data
    X = standardize(X)

    #set the model
    model = LinearDiscriminantAnalysis()

    #set the CV
    kf = LeaveOneOut()
    
    #start timer for getting elapsed time
    from time import time
    import timeit #imports timeit module
    start_time = timeit.default_timer() #defines start time so computational time can be calculated

    #define lists
    acc_score = [];
    Truth = [];
    Output = [];

    #loop though each fold (so 40 times in our case)
    for train_index , test_index in kf.split(df):
        
        #split the data
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, shuffle=True)

        #print(X_train); print(X_test); print(y_train); print(y_test)
        
        #fit the model
        model.fit(X_train,y_train)
        
        #preduct values
        pred_values = model.predict(X_test)

        #append the accuracy score
        acc = accuracy_score(pred_values, y_test)
        acc_score.append(acc)

        #add to the truth and output
        Truth.extend(y_test.values.reshape(y_test.shape[0])); ## it is a list
        Output.extend(pred_values); ## it is a list  
        

 
    #determine the time elapesed
    elapsed = timeit.default_timer() - start_time #gives total computation time

    #return the dataframe
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    #set output to a dataframe
    dfOutput = pd.DataFrame(data={"Model": model,
                                  "X Variable(s)": [Vars,],
                                  "Avg Accuracy": [np.mean(acc_score),],
                                  "SD of Accuracy": [np.std(acc_score),],
                                  "Sensitivity": [recall_score(Truth,Output,pos_label="negative"),],
                                  "Precision": [precision_score(Truth,Output,pos_label="negative"),],
                                  "F1-Score": [f1_score(Truth,Output,pos_label="negative"),],
                                  "Runtime": [elapsed,],
                                  "Confusion_Matrix": str(confusion_matrix(Truth,Output))}) #,"Real Specificity": [1,]
    #return output
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    return dfOutput


In [17]:
ML_LDA_summaries(["pdel","iso_point"])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,LinearDiscriminantAnalysis(),"[pdel, iso_point]",0.7925,0.138542,0.917603,0.800654,0.855148,0.428621,[[245 22]\n [ 61 72]]


#### Function to compare variables:

In [18]:
def LDA_compare_variables(inputList):
    outputList = []

    for i in inputList:
        outputList.append(ML_LDA_summaries(i))

    outputList = pd.concat(outputList)

    return outputList

LDA_compare_variables([["pdel","iso_point"], ["pdel","deldel_G3"], ["iso_point", "deldel_G3"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,LinearDiscriminantAnalysis(),"[pdel, iso_point]",0.8225,0.115081,0.927757,0.824324,0.872987,0.413246,[[244 19]\n [ 52 85]]
0,LinearDiscriminantAnalysis(),"[pdel, deldel_G3]",0.875,0.106654,0.891051,0.912351,0.901575,0.396586,[[229 28]\n [ 22 121]]
0,LinearDiscriminantAnalysis(),"[iso_point, deldel_G3]",0.6225,0.127451,0.979592,0.621762,0.760697,0.393468,[[240 5]\n [146 9]]


# QDA

#### Function to find summaries:

In [20]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

def ML_QDA_summaries(Vars):
    
    #define X and y variables
    X = df.loc[:, Vars] 
    y = df.loc[:, "positive_negative"] 

    #standardize data
    X = standardize(X)

    #set the model
    model = QuadraticDiscriminantAnalysis()

    #set the CV
    kf = LeaveOneOut()
    
    #start timer for getting elapsed time
    from time import time
    import timeit #imports timeit module
    start_time = timeit.default_timer() #defines start time so computational time can be calculated

    #define lists
    acc_score = [];
    Truth = [];
    Output = [];

    #loop though each fold (so 40 times in our case)
    for train_index , test_index in kf.split(df):
        
        #split the data
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, shuffle=True)

        #print(X_train); print(X_test); print(y_train); print(y_test)
        
        #fit the model
        model.fit(X_train,y_train)
        
        #preduct values
        pred_values = model.predict(X_test)

        #append the accuracy score
        acc = accuracy_score(pred_values, y_test)
        acc_score.append(acc)

        #add to the truth and output
        Truth.extend(y_test.values.reshape(y_test.shape[0])); ## it is a list
        Output.extend(pred_values); ## it is a list  
        

 
    #determine the time elapesed
    elapsed = timeit.default_timer() - start_time #gives total computation time

    #return the dataframe
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    #set output to a dataframe
    dfOutput = pd.DataFrame(data={"Model": model,
                                  "X Variable(s)": [Vars,],
                                  "Avg Accuracy": [np.mean(acc_score),],
                                  "SD of Accuracy": [np.std(acc_score),],
                                  "Sensitivity": [recall_score(Truth,Output,pos_label="negative"),],
                                  "Precision": [precision_score(Truth,Output,pos_label="negative"),],
                                  "F1-Score": [f1_score(Truth,Output,pos_label="negative"),],
                                  "Runtime": [elapsed,],
                                  "Confusion_Matrix": str(confusion_matrix(Truth,Output))}) #,"Real Specificity": [1,]
    #return output
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    return dfOutput

In [22]:
ML_QDA_summaries(["pdel","iso_point"])

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.d

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,QuadraticDiscriminantAnalysis(),"[pdel, iso_point]",0.8275,0.158094,1.0,0.790909,0.883249,0.436242,[[261 0]\n [ 69 70]]


In [28]:
ML_QDA_summaries(["pdel","deldel_G3"])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,QuadraticDiscriminantAnalysis(),"[pdel, deldel_G3]",0.82,0.116619,0.879845,0.847015,0.863118,0.379911,[[227 31]\n [ 41 101]]


In [49]:
ML_QDA_summaries(["pdel", "positionAA_num"])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,QuadraticDiscriminantAnalysis(),"[pdel, positionAA_num]",0.83,0.095394,0.915663,0.829091,0.870229,0.37373,[[228 21]\n [ 47 104]]
