In [2]:
import pandas as pd
import numpy as np
import timeit #imports timeit module
from sklearn import preprocessing
from sklearn.model_selection import KFold, LeaveOneOut,cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report,f1_score,accuracy_score,precision_score,recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

def standardize(x): 
    
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df1 = pd.DataFrame(x_scaled)
    return(df1)

def ML_summary(Vars, model = None):
    
    #define X and y variables
    X = df.loc[:, Vars] 
    y = df.loc[:, "positive_negative"] 

    #standardize data
    X = standardize(X)

    #set the model
    if model == None:
        raise Exception("Needs Model")

    #set the CV
    kf = LeaveOneOut()
    
    #start timer for getting elapsed time
    from time import time
    import timeit #imports timeit module
    start_time = timeit.default_timer() #defines start time so computational time can be calculated

    #define lists
    acc_score = [];
    Truth = [];
    Output = [];

    #loop though each fold (so 40 times in our case)
    for train_index , test_index in kf.split(df):
        
        #split the data
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, shuffle=True)

        #print(X_train); print(X_test); print(y_train); print(y_test)
        
        #fit the model
        model.fit(X_train,y_train)
        
        #preduct values
        pred_values = model.predict(X_test)

        #append the accuracy score
        acc = accuracy_score(pred_values, y_test)
        acc_score.append(acc)

        #add to the truth and output
        Truth.extend(y_test.values.reshape(y_test.shape[0])); ## it is a list
        Output.extend(pred_values); ## it is a list  
        

 
    #determine the time elapesed
    elapsed = timeit.default_timer() - start_time #gives total computation time

    #return the dataframe
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    #set output to a dataframe
    dfOutput = pd.DataFrame(data={"Model": str(model),
                                  "X Variable(s)": [str(Vars),],
                                  "Avg Accuracy": [np.mean(acc_score),],
                                  "SD of Accuracy": [np.std(acc_score),],
                                  "Sensitivity": [recall_score(Truth,Output,pos_label="negative"),],
                                  "Precision": [precision_score(Truth,Output,pos_label="negative"),],
                                  "F1-Score": [f1_score(Truth,Output,pos_label="negative"),],
                                  "Runtime": [elapsed,],
                                  "Confusion_Matrix": str(confusion_matrix(Truth,Output))}) #,"Real Specificity": [1,]
    #return output
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    return dfOutput


def ML_compare_variables(modelList,variableList):
    
    outputList = []
    
    for j in modelList:
        
        outputList1 = []
        
        for i in variableList :
            outputList1.append(ML_summary(i,j))
            
        outputList1 = pd.concat(outputList1, ignore_index = True)
        
        outputList.append(outputList1)
    
    outputList = pd.concat(outputList, ignore_index = True)
    
    
    return outputList.sort_values(by = "F1-Score",ascending=False)

In [4]:
df = pd.read_csv('Cataract Data 1.3.csv', na_values='?').dropna()
pd.set_option('display.max_rows', None)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["ogAA_letterCode"] = le.fit_transform(df.ogAA_letter)
df["mutAA_letterCode"] = le.fit_transform(df.mutAA_letter)
df["ogAA_charCode"] = le.fit_transform(df.ogAA_char)
df["mutAA_charCode"] = le.fit_transform(df.mutAA_char)

In [5]:
#summery of all vars
ML_summary(["pdel","iso_point","deldel_G1","deldel_G2","deldel_G3","positionAA_num","solv_area","evo_age"],RandomForestClassifier(max_depth=3))

#Compare all variables
ML_compare_variables([LogisticRegression(solver= 'liblinear', class_weight = 'balanced'),
                      neighbors.KNeighborsClassifier(n_neighbors = 3),
                      LinearDiscriminantAnalysis(),
                      DecisionTreeClassifier(max_depth=3),
                      BaggingClassifier(),
                      RandomForestClassifier(max_depth = 3),
                      AdaBoostClassifier(n_estimators=500, learning_rate = 0.1, algorithm="SAMME.R"),
                      GradientBoostingClassifier(n_estimators = 500, learning_rate = 0.1, max_depth = 4),
                      SVC(C=1, kernel='linear')],
                           [["pdel","iso_point","deldel_G1","deldel_G2","deldel_G3","positionAA_num","solv_area","evo_age"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
5,RandomForestClassifier(max_depth=3),"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.91,0.073485,0.97479,0.885496,0.928,6.997925,[[232 6]\n [ 30 132]]
3,DecisionTreeClassifier(max_depth=3),"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.88,0.087178,0.90146,0.921642,0.911439,0.160103,[[247 27]\n [ 21 105]]
6,"AdaBoostClassifier(learning_rate=0.1, n_estima...","['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.8775,0.110651,0.912214,0.901887,0.907021,35.04351,[[239 23]\n [ 26 112]]
4,BaggingClassifier(),"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.875,0.096825,0.917323,0.889313,0.903101,0.99337,[[233 21]\n [ 29 117]]
7,"GradientBoostingClassifier(max_depth=4, n_esti...","['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.85,0.097468,0.900383,0.873606,0.886792,9.538576,[[235 26]\n [ 34 105]]
1,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.8175,0.118084,0.984674,0.788344,0.875639,0.235147,[[257 4]\n [ 69 70]]
8,"SVC(C=1, kernel='linear')","['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.8175,0.133954,0.976654,0.789308,0.873043,0.197954,[[251 6]\n [ 67 76]]
2,LinearDiscriminantAnalysis(),"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.7725,0.109516,0.862903,0.789668,0.824663,0.284243,[[214 34]\n [ 57 95]]
0,"LogisticRegression(class_weight='balanced', so...","['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.7425,0.132075,0.734615,0.848889,0.787629,0.213992,[[191 69]\n [ 34 106]]


# Logistic Regression:

In [6]:
ML_compare_variables([LogisticRegression(solver= 'liblinear', class_weight = 'balanced')],
                     [["iso_point"],
                      ["pdel"],
                      ["iso_point","pdel"],
                      ["solv_area"],
                      ["iso_point","solv_area"],
                      ["iso_point","pdel","deldel_G3"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,"LogisticRegression(class_weight='balanced', so...",['iso_point'],0.6825,0.294013,0.819853,0.740864,0.77836,0.179054,[[223 49]\n [ 78 50]]
2,"LogisticRegression(class_weight='balanced', so...","['iso_point', 'pdel']",0.7075,0.152295,0.678295,0.837321,0.749465,0.155631,[[175 83]\n [ 34 108]]
5,"LogisticRegression(class_weight='balanced', so...","['iso_point', 'pdel', 'deldel_G3']",0.695,0.135923,0.726908,0.770213,0.747934,0.182973,[[181 68]\n [ 54 97]]
1,"LogisticRegression(class_weight='balanced', so...",['pdel'],0.6825,0.173043,0.595588,0.905028,0.718404,0.170023,[[162 110]\n [ 17 111]]
4,"LogisticRegression(class_weight='balanced', so...","['iso_point', 'solv_area']",0.56,0.185472,0.598456,0.682819,0.63786,0.175806,[[155 104]\n [ 72 69]]
3,"LogisticRegression(class_weight='balanced', so...",['solv_area'],0.48,0.130767,0.409836,0.609756,0.490196,0.172417,[[100 144]\n [ 64 92]]


In [32]:
ML_compare_variables([LogisticRegression(solver= 'liblinear', class_weight = 'balanced')],[["iso_point", "pdel"],
                                                     ["iso_point", "solv_area"],
                                                     ["iso_point", "deldel_G1"],
                                                     ["iso_point", "deldel_G2"],
                                                     ["iso_point", "deldel_G3"],
                                                     ["iso_point", "ogAA_letterCode"],
                                                     ["pdel", "ogAA_letterCode"],
                                                     ["solv_area", "ogAA_letterCode"],
                                                     ["deldel_G3", "ogAA_letterCode"],
                                                     ["deldel_G2", "ogAA_letterCode"],
                                                     ["deldel_G1", "ogAA_letterCode"],
                                                     ["pdel", "deldel_G1"],
                                                     ["pdel", "deldel_G2"],
                                                     ["pdel", "deldel_G3"],
                                                     ["pdel", "solv_area"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
13,"LogisticRegression(class_weight='balanced', so...","['pdel', 'deldel_G3']",0.7275,0.104851,0.703297,0.872727,0.778905,0.111096,[[192 81]\n [ 28 99]]
0,"LogisticRegression(class_weight='balanced', so...","['iso_point', 'pdel']",0.72,0.12083,0.671937,0.854271,0.752212,0.128899,[[170 83]\n [ 29 118]]
11,"LogisticRegression(class_weight='balanced', so...","['pdel', 'deldel_G1']",0.7125,0.116592,0.665306,0.831633,0.739229,0.102095,[[163 82]\n [ 33 122]]
4,"LogisticRegression(class_weight='balanced', so...","['iso_point', 'deldel_G3']",0.575,0.197167,0.739777,0.665552,0.700704,0.111867,[[199 70]\n [100 31]]
6,"LogisticRegression(class_weight='balanced', so...","['pdel', 'ogAA_letterCode']",0.655,0.162711,0.555556,0.892857,0.684932,0.104066,[[150 120]\n [ 18 112]]
8,"LogisticRegression(class_weight='balanced', so...","['deldel_G3', 'ogAA_letterCode']",0.56,0.117898,0.67148,0.686347,0.678832,0.143116,[[186 91]\n [ 85 38]]
12,"LogisticRegression(class_weight='balanced', so...","['pdel', 'deldel_G2']",0.655,0.149917,0.543726,0.888199,0.674528,0.108414,[[143 120]\n [ 18 119]]
5,"LogisticRegression(class_weight='balanced', so...","['iso_point', 'ogAA_letterCode']",0.5575,0.192208,0.709302,0.642105,0.674033,0.104357,[[183 75]\n [102 40]]
14,"LogisticRegression(class_weight='balanced', so...","['pdel', 'solv_area']",0.64,0.157797,0.498069,0.902098,0.641791,0.124133,[[129 130]\n [ 14 127]]
1,"LogisticRegression(class_weight='balanced', so...","['iso_point', 'solv_area']",0.525,0.189407,0.596899,0.641667,0.618474,0.123548,[[154 104]\n [ 86 56]]


# KNN:

In [20]:
def KNN_find_best_k(inputList,_max):
    
    outputList = [pd.DataFrame(data={"X Variable(s)": [inputList,],
                                      "Avg Accuracy": 0.0,
                                      "SD of Accuracy": 0.0,
                                      "Sensitivity": 0.0,
                                      "Precision": 0.0,
                                      "F1-Score": 0.0,
                                      "Runtime": 0.0,
                                      "Confusion_Matrix": "none"})] #,"Real Specificity": [1,]
    
    for i in range(1,_max):
        
        tempList = []
        
        for j in range(10):
            tempList.append(ML_summary(inputList,neighbors.KNeighborsClassifier(n_neighbors = i)))
            
        tempList = pd.concat(tempList, ignore_index=True)

        outputList.append(tempList.head(1))

    outputList = pd.concat(outputList, ignore_index=True)
    
    return outputList.sort_values(by = "F1-Score",ascending=False)

KNN_find_best_k(["deldel_G3","iso_point"],5)

Unnamed: 0,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix,Model
3,"['deldel_G3', 'iso_point']",0.94,0.086023,1.0,0.916376,0.956364,0.130837,[[263 0]\n [ 24 113]],KNeighborsClassifier(n_neighbors=3)
2,"['deldel_G3', 'iso_point']",0.9275,0.083629,1.0,0.903654,0.949389,0.129888,[[272 0]\n [ 29 99]],KNeighborsClassifier(n_neighbors=2)
4,"['deldel_G3', 'iso_point']",0.8875,0.114428,1.0,0.851974,0.920071,0.143085,[[259 0]\n [ 45 96]],KNeighborsClassifier(n_neighbors=4)
1,"['deldel_G3', 'iso_point']",0.865,0.088176,0.918288,0.877323,0.897338,0.148689,[[236 21]\n [ 33 110]],KNeighborsClassifier(n_neighbors=1)
0,"[deldel_G3, iso_point]",0.0,0.0,0.0,0.0,0.0,0.0,none,


In [18]:
ML_compare_variables([neighbors.KNeighborsClassifier(n_neighbors = 3)],[["iso_point", "pdel"],
                                                                        ["iso_point", "solv_area"],
                                                                        ["iso_point", "deldel_G1"],
                                                                        ["iso_point", "deldel_G2"],
                                                                        ["iso_point", "deldel_G3"],
                                                                        ["iso_point", "ogAA_letterCode"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
5,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'ogAA_letterCode']",0.9575,0.058683,1.0,0.939716,0.968921,0.137418,[[265 0]\n [ 17 118]]
4,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'deldel_G3']",0.94,0.08,1.0,0.915789,0.956044,0.130572,[[261 0]\n [ 24 115]]
0,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'pdel']",0.9375,0.085696,1.0,0.911972,0.953959,0.155871,[[259 0]\n [ 25 116]]
2,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'deldel_G1']",0.9225,0.075788,1.0,0.888889,0.941176,0.127302,[[248 0]\n [ 31 121]]
1,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'solv_area']",0.9175,0.086277,1.0,0.883392,0.938086,0.128799,[[250 0]\n [ 33 117]]
3,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'deldel_G2']",0.8975,0.068875,1.0,0.862876,0.926391,0.16682,[[258 0]\n [ 41 101]]


In [13]:
ML_compare_variables([neighbors.KNeighborsClassifier(n_neighbors = 3)],[["pdel", "ogAA_letterCode"],
                                                                        ["solv_area", "ogAA_letterCode"],
                                                                        ["deldel_G3", "ogAA_letterCode"],
                                                                        ["deldel_G2", "ogAA_letterCode"],
                                                                        ["deldel_G1", "ogAA_letterCode"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,KNeighborsClassifier(n_neighbors=3),"['pdel', 'ogAA_letterCode']",0.855,0.104762,0.89434,0.88764,0.890977,0.13212,[[237 28]\n [ 30 105]]
4,KNeighborsClassifier(n_neighbors=3),"['deldel_G1', 'ogAA_letterCode']",0.745,0.10943,0.821429,0.784091,0.802326,0.146196,[[207 45]\n [ 57 91]]
1,KNeighborsClassifier(n_neighbors=3),"['solv_area', 'ogAA_letterCode']",0.7375,0.131696,0.802281,0.799242,0.800759,0.156527,[[211 52]\n [ 53 84]]
2,KNeighborsClassifier(n_neighbors=3),"['deldel_G3', 'ogAA_letterCode']",0.6925,0.121218,0.82397,0.743243,0.781528,0.191291,[[220 47]\n [ 76 57]]
3,KNeighborsClassifier(n_neighbors=3),"['deldel_G2', 'ogAA_letterCode']",0.6925,0.136725,0.755556,0.781609,0.768362,0.172911,[[204 66]\n [ 57 73]]


In [21]:
ML_compare_variables([neighbors.KNeighborsClassifier(n_neighbors = 3)],[["pdel", "iso_point"],
                                                                        ["pdel", "deldel_G1"],
                                                                        ["pdel", "deldel_G2"],
                                                                        ["pdel", "deldel_G3"],
                                                                        ["pdel", "solv_area"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point']",0.9475,0.067035,1.0,0.92446,0.960748,0.135181,[[257 0]\n [ 21 122]]
1,KNeighborsClassifier(n_neighbors=3),"['pdel', 'deldel_G1']",0.84,0.091652,0.905138,0.851301,0.877395,0.133344,[[229 24]\n [ 40 107]]
4,KNeighborsClassifier(n_neighbors=3),"['pdel', 'solv_area']",0.8225,0.096144,0.885375,0.842105,0.863198,0.137828,[[224 29]\n [ 42 105]]
3,KNeighborsClassifier(n_neighbors=3),"['pdel', 'deldel_G3']",0.8275,0.086566,0.85098,0.875,0.862823,0.123834,[[217 38]\n [ 31 114]]
2,KNeighborsClassifier(n_neighbors=3),"['pdel', 'deldel_G2']",0.815,0.096307,0.883721,0.838235,0.860377,0.17771,[[228 30]\n [ 44 98]]


In [51]:
ML_compare_variables([neighbors.KNeighborsClassifier(n_neighbors = 3)],[["pdel", "iso_point", "deldel_G3"],
                                                                        ["pdel", "iso_point", "solv_area"],
                                                                        ["pdel", "iso_point", "ogAA_letterCode"],
                                                                        ["pdel", "iso_point", "deldel_G1"],
                                                                        ["pdel", "iso_point", "deldel_G2"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
2,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point', 'ogAA_letterCode']",0.9475,0.059108,1.0,0.929054,0.963222,0.17863,[[275 0]\n [ 21 104]]
3,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point', 'deldel_G1']",0.9475,0.086566,1.0,0.928328,0.962832,0.143992,[[272 0]\n [ 21 107]]
0,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point', 'deldel_G3']",0.9425,0.080273,1.0,0.917857,0.957169,0.165192,[[257 0]\n [ 23 120]]
4,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point', 'deldel_G2']",0.9325,0.0787,1.0,0.906574,0.950998,0.140353,[[262 0]\n [ 27 111]]
1,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point', 'solv_area']",0.92,0.09,1.0,0.892617,0.943262,0.13849,[[266 0]\n [ 32 102]]


In [49]:
ML_compare_variables([neighbors.KNeighborsClassifier(n_neighbors = 3)],[["iso_point", "pdel"],
                                                     ["iso_point", "solv_area"],
                                                     ["iso_point", "deldel_G1"],
                                                     ["iso_point", "deldel_G2"],
                                                     ["iso_point", "deldel_G3"],
                                                     ["iso_point", "ogAA_letterCode"],
                                                     ["pdel", "ogAA_letterCode"],
                                                     ["solv_area", "ogAA_letterCode"],
                                                     ["deldel_G3", "ogAA_letterCode"],
                                                     ["deldel_G2", "ogAA_letterCode"],
                                                     ["deldel_G1", "ogAA_letterCode"],
                                                     ["pdel", "deldel_G1"],
                                                     ["pdel", "deldel_G2"],
                                                     ["pdel", "deldel_G3"],
                                                     ["pdel", "solv_area"],
                                                     ["pdel", "iso_point"],
                                                     ["solv_area", "deldel_G1"],
                                                     ["solv_area", "deldel_G2"],
                                                     ["solv_area", "deldel_G3"],
                                                     ["solv_area", "pdel"],
                                                     ["solv_area", "iso_point"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'pdel']",0.955,0.058949,1.0,0.93617,0.967033,0.151299,[[264 0]\n [ 18 118]]
4,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'deldel_G3']",0.945,0.070534,1.0,0.922261,0.959559,0.156136,[[261 0]\n [ 22 117]]
20,KNeighborsClassifier(n_neighbors=3),"['solv_area', 'iso_point']",0.935,0.096307,1.0,0.912162,0.954064,0.143282,[[270 0]\n [ 26 104]]
15,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point']",0.94,0.091652,1.0,0.911111,0.953488,0.148709,[[246 0]\n [ 24 130]]
2,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'deldel_G1']",0.935,0.072629,1.0,0.910345,0.953069,0.167611,[[264 0]\n [ 26 110]]
1,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'solv_area']",0.93,0.092736,1.0,0.905085,0.950178,0.13537,[[267 0]\n [ 28 105]]
5,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'ogAA_letterCode']",0.93,0.095394,1.0,0.903114,0.949091,0.153571,[[261 0]\n [ 28 111]]
3,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'deldel_G2']",0.9275,0.07412,1.0,0.902027,0.94849,0.165324,[[267 0]\n [ 29 104]]
14,KNeighborsClassifier(n_neighbors=3),"['pdel', 'solv_area']",0.86,0.07,0.890152,0.896947,0.893536,0.124204,[[235 29]\n [ 27 109]]
6,KNeighborsClassifier(n_neighbors=3),"['pdel', 'ogAA_letterCode']",0.85,0.097468,0.894942,0.874525,0.884615,0.151983,[[230 27]\n [ 33 110]]


In [52]:
ML_compare_variables([neighbors.KNeighborsClassifier(n_neighbors = 3)],[["iso_point", "pdel"],
                                                     ["iso_point", "solv_area"],
                                                     ["iso_point", "deldel_G1"],
                                                     ["iso_point", "deldel_G2"],
                                                     ["iso_point", "deldel_G3"],
                                                     ["iso_point", "ogAA_letterCode"],
                                                     ["pdel", "ogAA_letterCode"],
                                                     ["solv_area", "ogAA_letterCode"],
                                                     ["deldel_G3", "ogAA_letterCode"],
                                                     ["deldel_G2", "ogAA_letterCode"],
                                                     ["deldel_G1", "ogAA_letterCode"],
                                                     ["pdel", "deldel_G1"],
                                                     ["pdel", "deldel_G2"],
                                                     ["pdel", "deldel_G3"],
                                                     ["pdel", "solv_area"],
                                                     ["pdel", "iso_point"],
                                                     ["solv_area", "deldel_G1"],
                                                     ["solv_area", "deldel_G2"],
                                                     ["solv_area", "deldel_G3"],
                                                     ["solv_area", "pdel"],
                                                     ["solv_area", "iso_point"],
                                                     ["pdel", "iso_point", "deldel_G3"],
                                                     ["pdel", "iso_point", "solv_area"],
                                                     ["pdel", "iso_point", "ogAA_letterCode"],
                                                     ["pdel", "iso_point", "deldel_G1"],
                                                     ["pdel", "iso_point", "deldel_G2"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
15,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point']",0.955,0.054544,1.0,0.934307,0.966038,0.190341,[[256 0]\n [ 18 126]]
2,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'deldel_G1']",0.955,0.058949,1.0,0.934066,0.965909,0.155681,[[255 0]\n [ 18 127]]
20,KNeighborsClassifier(n_neighbors=3),"['solv_area', 'iso_point']",0.9525,0.067035,1.0,0.934028,0.965889,0.161909,[[269 0]\n [ 19 112]]
23,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point', 'ogAA_letterCode']",0.9475,0.054715,1.0,0.927083,0.962162,0.199744,[[267 0]\n [ 21 112]]
0,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'pdel']",0.9425,0.080273,1.0,0.920415,0.958559,0.152128,[[266 0]\n [ 23 111]]
4,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'deldel_G3']",0.94,0.083066,1.0,0.915493,0.955882,0.156344,[[260 0]\n [ 24 116]]
24,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point', 'deldel_G1']",0.94,0.05831,1.0,0.914286,0.955224,0.231688,[[256 0]\n [ 24 120]]
22,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point', 'solv_area']",0.9325,0.068511,1.0,0.905263,0.950276,0.192069,[[258 0]\n [ 27 115]]
5,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'ogAA_letterCode']",0.9325,0.064759,1.0,0.900369,0.947573,0.157122,[[244 0]\n [ 27 129]]
21,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point', 'deldel_G3']",0.9275,0.070666,1.0,0.899654,0.947177,0.166522,[[260 0]\n [ 29 111]]


# LDA:

In [162]:
ML_compare_variables([LinearDiscriminantAnalysis()],[["iso_point","pdel"],
                                                     ["iso_point"],
                                                     ["pdel"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
2,LinearDiscriminantAnalysis(),['pdel'],0.895,0.083516,0.938224,0.903346,0.920455,0.132942,[[243 16]\n [ 26 115]]
0,LinearDiscriminantAnalysis(),"['iso_point', 'pdel']",0.8225,0.119347,0.916,0.820789,0.865784,0.141594,[[229 21]\n [ 50 100]]
1,LinearDiscriminantAnalysis(),['iso_point'],0.625,0.119896,1.0,0.617347,0.763407,0.132677,[[242 0]\n [150 8]]


In [48]:
ML_compare_variables([LinearDiscriminantAnalysis()],[["iso_point", "pdel"],
                                                     ["iso_point", "solv_area"],
                                                     ["iso_point", "deldel_G1"],
                                                     ["iso_point", "deldel_G2"],
                                                     ["iso_point", "deldel_G3"],
                                                     ["iso_point", "ogAA_letterCode"],
                                                     ["pdel", "ogAA_letterCode"],
                                                     ["solv_area", "ogAA_letterCode"],
                                                     ["deldel_G3", "ogAA_letterCode"],
                                                     ["deldel_G2", "ogAA_letterCode"],
                                                     ["deldel_G1", "ogAA_letterCode"],
                                                     ["pdel", "deldel_G1"],
                                                     ["pdel", "deldel_G2"],
                                                     ["pdel", "deldel_G3"],
                                                     ["pdel", "solv_area"],
                                                     ["pdel", "iso_point"],
                                                     ["solv_area", "deldel_G1"],
                                                     ["solv_area", "deldel_G2"],
                                                     ["solv_area", "deldel_G3"],
                                                     ["solv_area", "pdel"],
                                                     ["solv_area", "iso_point"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
11,LinearDiscriminantAnalysis(),"['pdel', 'deldel_G1']",0.875,0.101858,0.891791,0.919231,0.905303,0.14661,[[239 29]\n [ 21 111]]
13,LinearDiscriminantAnalysis(),"['pdel', 'deldel_G3']",0.87,0.09,0.902985,0.902985,0.902985,0.137523,[[242 26]\n [ 26 106]]
12,LinearDiscriminantAnalysis(),"['pdel', 'deldel_G2']",0.8675,0.105801,0.903101,0.89272,0.897881,0.126438,[[233 25]\n [ 28 114]]
19,LinearDiscriminantAnalysis(),"['solv_area', 'pdel']",0.8625,0.104133,0.89272,0.896154,0.894434,0.129302,[[233 28]\n [ 27 112]]
6,LinearDiscriminantAnalysis(),"['pdel', 'ogAA_letterCode']",0.8375,0.12387,0.883721,0.86692,0.87524,0.130291,[[228 30]\n [ 35 107]]
14,LinearDiscriminantAnalysis(),"['pdel', 'solv_area']",0.825,0.117792,0.87395,0.83871,0.855967,0.155416,[[208 30]\n [ 40 122]]
15,LinearDiscriminantAnalysis(),"['pdel', 'iso_point']",0.8025,0.131315,0.920949,0.797945,0.855046,0.122178,[[233 20]\n [ 59 88]]
0,LinearDiscriminantAnalysis(),"['iso_point', 'pdel']",0.81,0.151327,0.925311,0.793594,0.854406,0.130255,[[223 18]\n [ 58 101]]
4,LinearDiscriminantAnalysis(),"['iso_point', 'deldel_G3']",0.675,0.11565,0.973485,0.676316,0.798137,0.110919,[[257 7]\n [123 13]]
9,LinearDiscriminantAnalysis(),"['deldel_G2', 'ogAA_letterCode']",0.6425,0.124273,0.921053,0.667575,0.774092,0.141547,[[245 21]\n [122 12]]


In [31]:
ML_compare_variables([LinearDiscriminantAnalysis()],[["pdel", "iso_point", "deldel_G3"],
                                                     ["pdel", "iso_point", "solv_area"],
                                                     ["pdel", "iso_point", "ogAA_letterCode"],
                                                     ["pdel", "iso_point", "deldel_G1"],
                                                     ["pdel", "iso_point", "deldel_G2"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
3,LinearDiscriminantAnalysis(),"['pdel', 'iso_point', 'deldel_G1']",0.8325,0.095884,0.885932,0.862963,0.874296,0.144218,[[233 30]\n [ 37 100]]
1,LinearDiscriminantAnalysis(),"['pdel', 'iso_point', 'solv_area']",0.81,0.128062,0.907749,0.828283,0.866197,0.118183,[[246 25]\n [ 51 78]]
4,LinearDiscriminantAnalysis(),"['pdel', 'iso_point', 'deldel_G2']",0.8175,0.118084,0.914397,0.821678,0.865562,0.147608,[[235 22]\n [ 51 92]]
2,LinearDiscriminantAnalysis(),"['pdel', 'iso_point', 'ogAA_letterCode']",0.805,0.135923,0.866935,0.826923,0.846457,0.157242,[[215 33]\n [ 45 107]]
0,LinearDiscriminantAnalysis(),"['pdel', 'iso_point', 'deldel_G3']",0.76,0.144568,0.897959,0.756014,0.820896,0.147014,[[220 25]\n [ 71 84]]


# Basic Decision Tree:

6 is best max_depth

In [170]:
def basticDT_find_best_k(inputList,_max):
    
    outputList = [pd.DataFrame(data={"X Variable(s)": [inputList,],
                                      "Avg Accuracy": 0.0,
                                      "SD of Accuracy": 0.0,
                                      "Sensitivity": 0.0,
                                      "Precision": 0.0,
                                      "F1-Score": 0.0,
                                      "Runtime": 0.0,
                                      "Confusion_Matrix": "none"})] #,"Real Specificity": [1,]
    
    for i in range(1,_max):
        
        tempList = []
        
        for j in range(20):
            tempList.append(ML_summary(inputList,DecisionTreeClassifier(max_depth=i)))
            
        tempList = pd.concat(tempList, ignore_index=True)

        outputList.append(tempList.head(1))

    outputList = pd.concat(outputList, ignore_index=True)
    
    return outputList.sort_values(by = "F1-Score",ascending=False)

basticDT_find_best_k(["pdel","iso_point","deldel_G1","deldel_G2","deldel_G3","positionAA_num","solv_area","evo_age"],10)

Unnamed: 0,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix,Model
2,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.87,0.11225,0.914498,0.894545,0.904412,0.124251,[[246 23]\n [ 29 102]],DecisionTreeClassifier(max_depth=2)
6,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.8775,0.101211,0.901575,0.905138,0.903353,0.124536,[[229 25]\n [ 24 122]],DecisionTreeClassifier(max_depth=6)
7,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.86,0.10198,0.903346,0.89011,0.896679,0.11977,[[243 26]\n [ 30 101]],DecisionTreeClassifier(max_depth=7)
8,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.8575,0.099718,0.906015,0.882784,0.894249,0.122381,[[241 25]\n [ 32 102]],DecisionTreeClassifier(max_depth=8)
4,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.865,0.115217,0.890196,0.897233,0.893701,0.120866,[[227 28]\n [ 26 119]],DecisionTreeClassifier(max_depth=4)
5,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.855,0.126392,0.896552,0.883019,0.889734,0.119316,[[234 27]\n [ 31 108]],DecisionTreeClassifier(max_depth=5)
3,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.855,0.113908,0.887597,0.887597,0.887597,0.123866,[[229 29]\n [ 29 113]],DecisionTreeClassifier(max_depth=3)
9,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.84,0.091652,0.86166,0.882591,0.872,0.120278,[[218 35]\n [ 29 118]],DecisionTreeClassifier(max_depth=9)
1,"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.7625,0.101704,0.836735,0.788462,0.811881,0.127687,[[205 40]\n [ 55 100]],DecisionTreeClassifier(max_depth=1)
0,"[pdel, iso_point, deldel_G1, deldel_G2, deldel...",0.0,0.0,0.0,0.0,0.0,0.0,none,


In [179]:
ML_summary(["pdel","iso_point","deldel_G1","deldel_G2","deldel_G3","positionAA_num","solv_area","evo_age"],DecisionTreeClassifier(max_depth=6))

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,DecisionTreeClassifier(max_depth=6),"['pdel', 'iso_point', 'deldel_G1', 'deldel_G2'...",0.865,0.090967,0.88764,0.908046,0.897727,0.126963,[[237 30]\n [ 24 109]]


# Bagging Decision Tree:

# Random Forest:

# Ada Boosting Decision Tree:

# GradientBoosting Decision Tree:

# Support Vector Classifier:
Best kernal = "rbf" (default), 6 is best C

In [193]:
ML_compare_variables([SVC(C=1, kernel='linear'),SVC(C=1, kernel='poly'),SVC(C=1, kernel='rbf'),SVC(C=1, kernel='sigmoid')],[["iso_point","pdel"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
2,SVC(C=1),"['iso_point', 'pdel']",0.9425,0.058683,1.0,0.918728,0.957643,0.124202,[[260 0]\n [ 23 117]]
1,"SVC(C=1, kernel='poly')","['iso_point', 'pdel']",0.905,0.083516,0.946154,0.911111,0.928302,0.125204,[[246 14]\n [ 24 116]]
3,"SVC(C=1, kernel='sigmoid')","['iso_point', 'pdel']",0.6875,0.122857,1.0,0.6875,0.814815,0.125045,[[275 0]\n [125 0]]
0,"SVC(C=1, kernel='linear')","['iso_point', 'pdel']",0.65,0.11619,0.988506,0.653165,0.786585,0.129091,[[258 3]\n [137 2]]


In [53]:
ML_compare_variables([SVC(C=1, kernel='rbf')],[["iso_point", "pdel"],
                                                     ["iso_point", "solv_area"],
                                                     ["iso_point", "deldel_G1"],
                                                     ["iso_point", "deldel_G2"],
                                                     ["iso_point", "deldel_G3"],
                                                     ["iso_point", "ogAA_letterCode"],
                                                     ["pdel", "ogAA_letterCode"],
                                                     ["solv_area", "ogAA_letterCode"],
                                                     ["deldel_G3", "ogAA_letterCode"],
                                                     ["deldel_G2", "ogAA_letterCode"],
                                                     ["deldel_G1", "ogAA_letterCode"],
                                                     ["pdel", "deldel_G1"],
                                                     ["pdel", "deldel_G2"],
                                                     ["pdel", "deldel_G3"],
                                                     ["pdel", "solv_area"],
                                                     ["pdel", "iso_point"],
                                                     ["solv_area", "deldel_G1"],
                                                     ["solv_area", "deldel_G2"],
                                                     ["solv_area", "deldel_G3"],
                                                     ["solv_area", "pdel"],
                                                     ["solv_area", "iso_point"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
15,SVC(C=1),"['pdel', 'iso_point']",0.9675,0.046837,1.0,0.951493,0.975143,0.209491,[[255 0]\n [ 13 132]]
0,SVC(C=1),"['iso_point', 'pdel']",0.9475,0.063196,1.0,0.922794,0.959847,0.126642,[[251 0]\n [ 21 128]]
5,SVC(C=1),"['iso_point', 'ogAA_letterCode']",0.94,0.076811,1.0,0.915789,0.956044,0.156829,[[261 0]\n [ 24 115]]
20,SVC(C=1),"['solv_area', 'iso_point']",0.9375,0.061998,1.0,0.912587,0.954296,0.176434,[[261 0]\n [ 25 114]]
1,SVC(C=1),"['iso_point', 'solv_area']",0.935,0.069101,1.0,0.905109,0.950192,0.109882,[[248 0]\n [ 26 126]]
3,SVC(C=1),"['iso_point', 'deldel_G2']",0.9325,0.068511,1.0,0.903915,0.949533,0.125987,[[254 0]\n [ 27 119]]
2,SVC(C=1),"['iso_point', 'deldel_G1']",0.9225,0.093508,0.984,0.901099,0.940727,0.116819,[[246 4]\n [ 27 123]]
4,SVC(C=1),"['iso_point', 'deldel_G3']",0.9125,0.087142,0.967742,0.898876,0.932039,0.140146,[[240 8]\n [ 27 125]]
19,SVC(C=1),"['solv_area', 'pdel']",0.88,0.074833,0.935115,0.887681,0.910781,0.179611,[[245 17]\n [ 31 107]]
6,SVC(C=1),"['pdel', 'ogAA_letterCode']",0.8475,0.111775,0.904059,0.875,0.889292,0.168662,[[245 26]\n [ 35 94]]


In [37]:
def SVC_find_best_c(inputList,_max):
    
    outputList = [pd.DataFrame(data={"X Variable(s)": [inputList,],
                                      "Avg Accuracy": 0.0,
                                      "SD of Accuracy": 0.0,
                                      "Sensitivity": 0.0,
                                      "Precision": 0.0,
                                      "F1-Score": 0.0,
                                      "Runtime": 0.0,
                                      "Confusion_Matrix": "none"})] #,"Real Specificity": [1,]
    
    for i in range(1,_max):
        
        tempList = []
        
        for j in range(10):
            tempList.append(ML_summary(inputList,SVC(C=i, kernel='rbf')))
            
        tempList = pd.concat(tempList, ignore_index=True)

        outputList.append(tempList.head(1))

    outputList = pd.concat(outputList, ignore_index=True)
    
    return outputList.sort_values(by = "F1-Score",ascending=False)

SVC_find_best_c(["iso_point","pdel"],10)

Unnamed: 0,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix,Model
2,"['iso_point', 'pdel']",0.95,0.067082,0.992395,0.935484,0.9631,0.110021,[[261 2]\n [ 18 119]],SVC(C=2)
1,"['iso_point', 'pdel']",0.9475,0.049937,1.0,0.923077,0.96,0.120349,[[252 0]\n [ 21 127]],SVC(C=1)
4,"['iso_point', 'pdel']",0.9425,0.083329,0.984436,0.930147,0.956522,0.199236,[[253 4]\n [ 19 124]],SVC(C=4)
5,"['iso_point', 'pdel']",0.94,0.05831,0.992424,0.922535,0.956204,0.18054,[[262 2]\n [ 22 114]],SVC(C=5)
9,"['iso_point', 'pdel']",0.945,0.073993,0.983539,0.929961,0.956,0.192021,[[239 4]\n [ 18 139]],SVC(C=9)
3,"['iso_point', 'pdel']",0.9375,0.065907,0.992248,0.917563,0.953445,0.224562,[[256 2]\n [ 23 119]],SVC(C=3)
6,"['iso_point', 'pdel']",0.9375,0.065907,0.984314,0.922794,0.952562,0.192807,[[251 4]\n [ 21 124]],SVC(C=6)
7,"['iso_point', 'pdel']",0.935,0.069101,0.97561,0.923077,0.948617,0.141654,[[240 6]\n [ 20 134]],SVC(C=7)
8,"['iso_point', 'pdel']",0.9275,0.094835,0.977695,0.91958,0.947748,0.191243,[[263 6]\n [ 23 108]],SVC(C=8)
0,"[iso_point, pdel]",0.0,0.0,0.0,0.0,0.0,0.0,none,


# Everything

In [54]:
ML_compare_variables([SVC(C=1, kernel='rbf'),LinearDiscriminantAnalysis(), neighbors.KNeighborsClassifier(n_neighbors = 3)],[["iso_point", "pdel"],
                                                     ["iso_point", "solv_area"],
                                                     ["iso_point", "deldel_G1"],
                                                     ["iso_point", "deldel_G2"],
                                                     ["iso_point", "deldel_G3"],
                                                     ["iso_point", "ogAA_letterCode"],
                                                     ["pdel", "ogAA_letterCode"],
                                                     ["solv_area", "ogAA_letterCode"],
                                                     ["deldel_G3", "ogAA_letterCode"],
                                                     ["deldel_G2", "ogAA_letterCode"],
                                                     ["deldel_G1", "ogAA_letterCode"],
                                                     ["pdel", "deldel_G1"],
                                                     ["pdel", "deldel_G2"],
                                                     ["pdel", "deldel_G3"],
                                                     ["pdel", "solv_area"],
                                                     ["pdel", "iso_point"],
                                                     ["solv_area", "deldel_G1"],
                                                     ["solv_area", "deldel_G2"],
                                                     ["solv_area", "deldel_G3"],
                                                     ["solv_area", "pdel"],
                                                     ["solv_area", "iso_point"]])

Unnamed: 0,Model,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
47,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'ogAA_letterCode']",0.975,0.048734,1.0,0.964413,0.981884,0.237967,[[271 0]\n [ 10 119]]
1,SVC(C=1),"['iso_point', 'solv_area']",0.96,0.04899,1.0,0.94386,0.971119,0.18836,[[269 0]\n [ 16 115]]
3,SVC(C=1),"['iso_point', 'deldel_G2']",0.96,0.04899,1.0,0.942652,0.97048,0.182837,[[263 0]\n [ 16 121]]
4,SVC(C=1),"['iso_point', 'deldel_G3']",0.955,0.054544,0.985294,0.950355,0.967509,0.167048,[[268 4]\n [ 14 114]]
42,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'pdel']",0.955,0.058949,1.0,0.935484,0.966667,0.25917,[[261 0]\n [ 18 121]]
0,SVC(C=1),"['iso_point', 'pdel']",0.95,0.059161,1.0,0.928315,0.962825,0.168524,[[259 0]\n [ 20 121]]
20,SVC(C=1),"['solv_area', 'iso_point']",0.945,0.063048,1.0,0.921708,0.959259,0.248481,[[259 0]\n [ 22 119]]
46,KNeighborsClassifier(n_neighbors=3),"['iso_point', 'deldel_G3']",0.945,0.063048,1.0,0.921708,0.959259,0.232985,[[259 0]\n [ 22 119]]
2,SVC(C=1),"['iso_point', 'deldel_G1']",0.945,0.054544,0.980989,0.938182,0.959108,0.198216,[[258 5]\n [ 17 120]]
57,KNeighborsClassifier(n_neighbors=3),"['pdel', 'iso_point']",0.9425,0.077096,1.0,0.919298,0.957952,0.23498,[[262 0]\n [ 23 115]]
