In [151]:
import pandas as pd
import numpy as np
import timeit #imports timeit module
from sklearn import preprocessing
from sklearn.model_selection import KFold, LeaveOneOut,cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report,f1_score,accuracy_score,precision_score,recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

def standardize(x):
    """Min-max scale the columns of ``x`` and return the result as a DataFrame.

    A new ``MinMaxScaler`` is fitted on ``x`` itself on every call. The
    returned frame is built from the scaler's output without passing any
    index or column labels.
    """
    scaled_values = preprocessing.MinMaxScaler().fit_transform(x)
    return pd.DataFrame(scaled_values)

def ML_summary(Vars, model = None, data = None, random_state = None):
    """Score ``model`` on the feature columns ``Vars`` using repeated random splits.

    The selected features are min-max scaled with ``standardize``. The model
    is then refitted once per row of the data set; each round draws a fresh
    shuffled 75/25 train/test split with ``train_test_split``. Predictions and
    true labels from every round are pooled to compute the pooled metrics.

    Parameters
    ----------
    Vars : list of str
        Names of the feature columns to use.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. Required. The same
        object is refitted in every round.
    data : pandas.DataFrame, optional
        Frame holding the ``Vars`` columns and a ``"positive_negative"`` label
        column. Defaults to the notebook-level ``df``.
    random_state : int or None, optional
        Seed for the sequence of random splits. ``None`` (default) leaves the
        splits unseeded. Any randomness inside ``model`` itself is not
        controlled by this argument.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``Model``, ``X Variable(s)``,
        ``Avg Accuracy`` and ``SD of Accuracy`` (``np.mean`` / ``np.std`` of
        the per-round accuracies), ``Sensitivity``, ``Precision`` and
        ``F1-Score`` (pooled over all rounds, with the label ``"negative"``
        as ``pos_label``), ``Runtime`` (seconds spent in the resampling loop)
        and ``Confusion_Matrix`` (pooled, stored as a string).

    Raises
    ------
    ValueError
        If ``model`` is ``None``.
    """
    # Validate before doing any work.
    if model is None:
        raise ValueError("Needs Model")

    # Fall back to the notebook-level frame when no data is passed explicitly.
    frame = df if data is None else data

    # Define X and y, then scale the features.
    # NOTE(review): the scaler is fitted on all rows before splitting, so the
    # test rows influence the scaling of the training rows -- confirm this is
    # acceptable for the analysis.
    X = standardize(frame.loc[:, Vars])
    y = frame.loc[:, "positive_negative"]

    # One shared generator so that consecutive splits differ from each other
    # while the whole sequence is reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Start the timer used for the reported runtime.
    start_time = timeit.default_timer()

    acc_score = []
    Truth = []
    Output = []

    # One round per row of the data set. Each round is an independent random
    # 75/25 split, not a leave-one-out fold.
    for _ in range(len(frame)):

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size = .25, shuffle = True, random_state = rng)

        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)

        # Per-round accuracy feeds the mean/SD columns.
        acc_score.append(accuracy_score(y_test, pred_values))

        # Pool labels and predictions for the metrics computed over all rounds.
        Truth.extend(y_test.tolist())
        Output.extend(pred_values)

    elapsed = timeit.default_timer() - start_time

    dfOutput = pd.DataFrame(data={"Model": [str(model),],
                                  "X Variable(s)": [str(Vars),],
                                  "Avg Accuracy": [np.mean(acc_score),],
                                  "SD of Accuracy": [np.std(acc_score),],
                                  "Sensitivity": [recall_score(Truth,Output,pos_label="negative"),],
                                  "Precision": [precision_score(Truth,Output,pos_label="negative"),],
                                  "F1-Score": [f1_score(Truth,Output,pos_label="negative"),],
                                  "Runtime": [elapsed,],
                                  "Confusion_Matrix": [str(confusion_matrix(Truth,Output)),]})

    return dfOutput


def ML_compare_variables(modelList,variableList):
    """Summarise every model / variable-set combination, best F1-Score first.

    ``ML_summary`` is called once for each pair, iterating over the models in
    the outer position and the variable sets in the inner position. The
    one-row results are stacked into a single frame with a fresh integer
    index and returned sorted by ``"F1-Score"`` in descending order.
    """
    summaries = [
        ML_summary(feature_set, estimator)
        for estimator in modelList
        for feature_set in variableList
    ]

    combined = pd.concat(summaries, ignore_index = True)

    return combined.sort_values(by = "F1-Score",ascending=False)

In [144]:
# Load the data set; '?' entries are read as missing values and any row
# containing a missing value is dropped.
df = pd.read_csv("data/Cataract Data 1.3.csv", na_values="?")
df = df.dropna()

# Do not truncate the number of rows shown when a frame is displayed.
pd.set_option("display.max_rows", None)

In [None]:
# Full set of candidate predictor columns, shared by both calls below.
all_features = ["pdel","iso_point","deldel_G1","deldel_G2","deldel_G3","positionAA_num","solv_area","evo_age"]

# Summary of all variables with a shallow random forest.
# NOTE(review): this is not the last expression of the cell, so its result is
# computed but not rendered.
ML_summary(all_features, RandomForestClassifier(max_depth=3))

# Compare every model family on the full variable set.
candidate_models = [
    LogisticRegression(solver= 'liblinear', class_weight = 'balanced'),
    neighbors.KNeighborsClassifier(n_neighbors = 3),
    LinearDiscriminantAnalysis(),
    DecisionTreeClassifier(max_depth=3),
    BaggingClassifier(),
    RandomForestClassifier(max_depth = 3),
    AdaBoostClassifier(n_estimators=500, learning_rate = 0.1, algorithm="SAMME.R"),
    GradientBoostingClassifier(n_estimators = 500, learning_rate = 0.1, max_depth = 4),
    SVC(C=1, kernel='linear'),
]
ML_compare_variables(candidate_models, [all_features])

# Logistic Regression:

In [157]:
# Balanced logistic regression on several small variable subsets.
ML_compare_variables(
    modelList=[LogisticRegression(solver= 'liblinear', class_weight = 'balanced')],
    variableList=[
        ["iso_point"],
        ["pdel"],
        ["iso_point","pdel"],
        ["solv_area"],
        ["iso_point","solv_area"],
        ["iso_point","pdel","deldel_G3"],
    ],
)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
5,"LogisticRegression(class_weight='balanced', so...","['iso_point', 'pdel', 'deldel_G3']",0.7075,0.11702,0.690037,0.85,0.761711,0.124207,[[187 84]\n [ 33 96]]
2,"LogisticRegression(class_weight='balanced', so...","['iso_point', 'pdel']",0.6925,0.147288,0.619048,0.852459,0.717241,0.122653,[[156 96]\n [ 27 121]]
1,"LogisticRegression(class_weight='balanced', so...",['pdel'],0.6575,0.197342,0.539623,0.905063,0.676123,0.12281,[[143 122]\n [ 15 120]]
0,"LogisticRegression(class_weight='balanced', so...",['iso_point'],0.5325,0.362896,0.625468,0.65748,0.641075,0.127373,[[167 100]\n [ 87 46]]
3,"LogisticRegression(class_weight='balanced', so...",['solv_area'],0.4925,0.147288,0.466667,0.681081,0.553846,0.123021,[[126 144]\n [ 59 71]]
4,"LogisticRegression(class_weight='balanced', so...","['iso_point', 'solv_area']",0.4725,0.151637,0.52,0.588235,0.552017,0.124636,[[130 120]\n [ 91 59]]


# KNN:

In [145]:
def KNN_find_best_k(inputList,_max):
    """Rank ``n_neighbors`` values for a KNN classifier on ``inputList``.

    Every k in ``1 .. _max - 1`` (``_max`` itself is excluded) is evaluated
    with one ``ML_summary`` call on a ``KNeighborsClassifier``.

    Parameters
    ----------
    inputList : list of str
        Feature column names passed to ``ML_summary``.
    _max : int
        Exclusive upper bound for k.

    Returns
    -------
    pandas.DataFrame
        One row per k plus a zero-filled placeholder row, sorted by
        ``"F1-Score"`` in descending order. The row label equals the k that
        produced the row; label 0 is the placeholder.
    """
    # Zero-filled placeholder that occupies position 0, so that after the
    # index is rebuilt below the row label i lines up with n_neighbors = i.
    placeholder = pd.DataFrame(data={"X Variable(s)": [inputList,],
                                     "Avg Accuracy": 0.0,
                                     "SD of Accuracy": 0.0,
                                     "Sensitivity": 0.0,
                                     "Precision": 0.0,
                                     "F1-Score": 0.0,
                                     "Runtime": 0.0,
                                     "Confusion_Matrix": "none"})

    rows = [placeholder]

    # A single evaluation per k: only one summary row per k ends up in the
    # result, so additional runs would be computed and then thrown away.
    for k in range(1,_max):
        rows.append(ML_summary(inputList,neighbors.KNeighborsClassifier(n_neighbors = k)))

    outputList = pd.concat(rows, ignore_index=True)

    return outputList.sort_values(by = "F1-Score",ascending=False)

# Try k = 1..4 on the two selected features.
KNN_find_best_k(inputList=["deldel_G3","iso_point"], _max=5)

Unnamed: 0,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix,Model
3,"['deldel_G3', 'iso_point']",0.93,0.09,1.0,0.906667,0.951049,0.156956,[[272 0]\n [ 28 100]],KNeighborsClassifier(n_neighbors=3)
2,"['deldel_G3', 'iso_point']",0.92,0.11,1.0,0.886926,0.940075,0.158927,[[251 0]\n [ 32 117]],KNeighborsClassifier(n_neighbors=2)
4,"['deldel_G3', 'iso_point']",0.8725,0.107209,1.0,0.837061,0.911304,0.157568,[[262 0]\n [ 51 87]],KNeighborsClassifier(n_neighbors=4)
1,"['deldel_G3', 'iso_point']",0.8675,0.081815,0.92803,0.878136,0.902394,0.164492,[[245 19]\n [ 34 102]],KNeighborsClassifier(n_neighbors=1)
0,"[deldel_G3, iso_point]",0.0,0.0,0.0,0.0,0.0,0.0,none,


In [None]:
# TODO: fill in the five variable subsets to compare for KNN with k = 3.
# NOTE(review): every entry is currently an empty list, so no feature columns
# would be selected -- presumably this cell fails if run as-is; confirm.
ML_compare_variables([neighbors.KNeighborsClassifier(n_neighbors = 3)],[[],[],[],[],[]])

# LDA:

In [162]:
# Linear discriminant analysis on the two strongest single features and their pair.
ML_compare_variables(
    modelList=[LinearDiscriminantAnalysis()],
    variableList=[
        ["iso_point","pdel"],
        ["iso_point"],
        ["pdel"],
    ],
)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
2,LinearDiscriminantAnalysis(),['pdel'],0.895,0.083516,0.938224,0.903346,0.920455,0.132942,[[243 16]\n [ 26 115]]
0,LinearDiscriminantAnalysis(),"['iso_point', 'pdel']",0.8225,0.119347,0.916,0.820789,0.865784,0.141594,[[229 21]\n [ 50 100]]
1,LinearDiscriminantAnalysis(),['iso_point'],0.625,0.119896,1.0,0.617347,0.763407,0.132677,[[242 0]\n [150 8]]


# Basic Decision Tree:

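Best `max_depth`: 6 (highest average accuracy in the run below; `max_depth=2` is marginally ahead on F1-score, so the difference is within run-to-run noise).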

In [170]:
def basticDT_find_best_k(inputList,_max):
    """Rank ``max_depth`` values for a decision tree on ``inputList``.

    Every depth in ``1 .. _max - 1`` (``_max`` itself is excluded) is
    evaluated with one ``ML_summary`` call on a ``DecisionTreeClassifier``.

    Parameters
    ----------
    inputList : list of str
        Feature column names passed to ``ML_summary``.
    _max : int
        Exclusive upper bound for ``max_depth``.

    Returns
    -------
    pandas.DataFrame
        One row per depth plus a zero-filled placeholder row, sorted by
        ``"F1-Score"`` in descending order. The row label equals the depth
        that produced the row; label 0 is the placeholder.
    """
    # Zero-filled placeholder that occupies position 0, so that after the
    # index is rebuilt below the row label i lines up with max_depth = i.
    placeholder = pd.DataFrame(data={"X Variable(s)": [inputList,],
                                     "Avg Accuracy": 0.0,
                                     "SD of Accuracy": 0.0,
                                     "Sensitivity": 0.0,
                                     "Precision": 0.0,
                                     "F1-Score": 0.0,
                                     "Runtime": 0.0,
                                     "Confusion_Matrix": "none"})

    rows = [placeholder]

    # A single evaluation per depth: only one summary row per depth ends up in
    # the result, so additional runs would be computed and then thrown away.
    for depth in range(1,_max):
        rows.append(ML_summary(inputList,DecisionTreeClassifier(max_depth=depth)))

    outputList = pd.concat(rows, ignore_index=True)

    return outputList.sort_values(by = "F1-Score",ascending=False)

# Try max_depth = 1..9 on the full variable set.
basticDT_find_best_k(
    inputList=["pdel","iso_point","deldel_G1","deldel_G2","deldel_G3","positionAA_num","solv_area","evo_age"],
    _max=10,
)

Unnamed: 0,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix,Model
2,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.87,0.11225,0.914498,0.894545,0.904412,0.124251,[[246 23]\n [ 29 102]],DecisionTreeClassifier(max_depth=2)
6,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.8775,0.101211,0.901575,0.905138,0.903353,0.124536,[[229 25]\n [ 24 122]],DecisionTreeClassifier(max_depth=6)
7,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.86,0.10198,0.903346,0.89011,0.896679,0.11977,[[243 26]\n [ 30 101]],DecisionTreeClassifier(max_depth=7)
8,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.8575,0.099718,0.906015,0.882784,0.894249,0.122381,[[241 25]\n [ 32 102]],DecisionTreeClassifier(max_depth=8)
4,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.865,0.115217,0.890196,0.897233,0.893701,0.120866,[[227 28]\n [ 26 119]],DecisionTreeClassifier(max_depth=4)
5,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.855,0.126392,0.896552,0.883019,0.889734,0.119316,[[234 27]\n [ 31 108]],DecisionTreeClassifier(max_depth=5)
3,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.855,0.113908,0.887597,0.887597,0.887597,0.123866,[[229 29]\n [ 29 113]],DecisionTreeClassifier(max_depth=3)
9,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.84,0.091652,0.86166,0.882591,0.872,0.120278,[[218 35]\n [ 29 118]],DecisionTreeClassifier(max_depth=9)
1,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.7625,0.101704,0.836735,0.788462,0.811881,0.127687,[[205 40]\n [ 55 100]],DecisionTreeClassifier(max_depth=1)
0,"[pdel, iso_point, deldel_G1, deldel_G2, deldel...",0.0,0.0,0.0,0.0,0.0,0.0,none,


In [179]:
# Summary for a depth-6 decision tree on the full variable set.
ML_summary(
    Vars=["pdel","iso_point","deldel_G1","deldel_G2","deldel_G3","positionAA_num","solv_area","evo_age"],
    model=DecisionTreeClassifier(max_depth=6),
)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,DecisionTreeClassifier(max_depth=6),"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.865,0.090967,0.88764,0.908046,0.897727,0.126963,[[237 30]\n [ 24 109]]


# Bagging Decision Tree:

# Random Forest:

# Ada Boosting Decision Tree:

# GradientBoosting Decision Tree:

# Support Vector Classifier:
Best kernel: "rbf" (the `SVC` default).

In [193]:
# Compare the four SVC kernels at C = 1 on the same two features.
ML_compare_variables(
    modelList=[SVC(C=1, kernel=kernel_name) for kernel_name in ('linear', 'poly', 'rbf', 'sigmoid')],
    variableList=[["iso_point","pdel"]],
)

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
2,SVC(C=1),"['iso_point', 'pdel']",0.9425,0.058683,1.0,0.918728,0.957643,0.124202,[[260 0]\n [ 23 117]]
1,"SVC(C=1, kernel='poly')","['iso_point', 'pdel']",0.905,0.083516,0.946154,0.911111,0.928302,0.125204,[[246 14]\n [ 24 116]]
3,"SVC(C=1, kernel='sigmoid')","['iso_point', 'pdel']",0.6875,0.122857,1.0,0.6875,0.814815,0.125045,[[275 0]\n [125 0]]
0,"SVC(C=1, kernel='linear')","['iso_point', 'pdel']",0.65,0.11619,0.988506,0.653165,0.786585,0.129091,[[258 3]\n [137 2]]


In [205]:
def SVC_find_best_c(inputList,_max):
    """Rank integer ``C`` values for an RBF-kernel SVC on ``inputList``.

    Every C in ``1 .. _max - 1`` (``_max`` itself is excluded) is evaluated
    with one ``ML_summary`` call on ``SVC(C=..., kernel='rbf')``.

    Parameters
    ----------
    inputList : list of str
        Feature column names passed to ``ML_summary``.
    _max : int
        Exclusive upper bound for C.

    Returns
    -------
    pandas.DataFrame
        One row per C plus a zero-filled placeholder row, sorted by
        ``"F1-Score"`` in descending order. The row label equals the C that
        produced the row; label 0 is the placeholder.
    """
    # Zero-filled placeholder that occupies position 0, so that after the
    # index is rebuilt below the row label i lines up with C = i.
    placeholder = pd.DataFrame(data={"X Variable(s)": [inputList,],
                                     "Avg Accuracy": 0.0,
                                     "SD of Accuracy": 0.0,
                                     "Sensitivity": 0.0,
                                     "Precision": 0.0,
                                     "F1-Score": 0.0,
                                     "Runtime": 0.0,
                                     "Confusion_Matrix": "none"})

    rows = [placeholder]

    # A single evaluation per C: only one summary row per C ends up in the
    # result, so additional runs would be computed and then thrown away.
    for c_value in range(1,_max):
        rows.append(ML_summary(inputList,SVC(C=c_value, kernel='rbf')))

    outputList = pd.concat(rows, ignore_index=True)

    return outputList.sort_values(by = "F1-Score",ascending=False)

# Try C = 1..9 with the RBF kernel on the two selected features.
SVC_find_best_c(inputList=["iso_point","pdel"], _max=10)

Unnamed: 0,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix,Model
7,"['iso_point', 'pdel']",0.9625,0.061998,0.99262,0.953901,0.972875,0.124815,[[269 2]\n [ 13 116]],SVC(C=7)
6,"['iso_point', 'pdel']",0.9525,0.063196,0.992218,0.9375,0.964083,0.128732,[[255 2]\n [ 17 126]],SVC(C=6)
2,"['iso_point', 'pdel']",0.95,0.070711,0.984791,0.941818,0.962825,0.128206,[[259 4]\n [ 16 121]],SVC(C=2)
9,"['iso_point', 'pdel']",0.945,0.063048,1.0,0.925926,0.961538,0.127473,[[275 0]\n [ 22 103]],SVC(C=9)
4,"['iso_point', 'pdel']",0.945,0.09206,0.992366,0.928571,0.95941,0.126926,[[260 2]\n [ 20 118]],SVC(C=4)
8,"['iso_point', 'pdel']",0.9425,0.09189,0.992188,0.923636,0.956685,0.126808,[[254 2]\n [ 21 123]],SVC(C=8)
3,"['iso_point', 'pdel']",0.94,0.066332,1.0,0.915789,0.956044,0.124866,[[261 0]\n [ 24 115]],SVC(C=3)
1,"['iso_point', 'pdel']",0.9275,0.089408,0.992395,0.90625,0.947368,0.130629,[[261 2]\n [ 27 110]],SVC(C=1)
5,"['iso_point', 'pdel']",0.9275,0.077419,0.977011,0.917266,0.946197,0.126839,[[255 6]\n [ 23 116]],SVC(C=5)
0,"[iso_point, pdel]",0.0,0.0,0.0,0.0,0.0,0.0,none,
