In [184]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, classification_report,f1_score,accuracy_score,precision_score,recall_score
from sklearn.model_selection import KFold, LeaveOneOut,cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from time import time
import timeit #imports timeit module
from sklearn import preprocessing

import random
random.seed(44)

In [186]:
df = pd.read_csv('Cataract Data 1.3.csv', na_values='?').dropna()
# df = pd.read_csv('Cataract Data 1.3.csv', index_col=0, parse_dates=True)
print(df.head())

pd.set_option('display.max_rows', None)

  index_name ogAA_letter  positionAA_num mutAA_letter  iso_point ogAA_char  \
0      A171T           A             171            T       6.76         n   
1      D140N           D             140            N       7.15         -   
2      D109H           D             109            H       7.18         -   
3      D109A           D             109            A       7.15         -   
4       P20S           P              20            S       6.76         P   

  mutAA_char  deldel_G1  solv_area  deldel_G2  deldel_G3  evo_age  pdel  \
0          l      -0.47       85.3  -0.550085  -0.005249      324  0.50   
1          l      -0.96       59.0  -1.864259  -0.180476      750  0.74   
2        H,+      -0.42       65.2  -0.661944  -0.509662      750  0.74   
3          n      -0.11       65.2  -0.550471  -0.670734      750  0.74   
4          l      -0.78       35.9  -0.703610  -0.456211      750  0.74   

  positive_negative  
0          positive  
1          positive  
2          pos

In [187]:
def ML_logistic_regression_summaries(Vars):
    
    #define X and y variables
    X = df.loc[:, Vars] 
    y = df.loc[:, "positive_negative"] 
    
    #set the model
    model = LogisticRegression(solver= 'liblinear', class_weight = 'balanced')

    #set the CV
    kf = LeaveOneOut()
    
    #start timer for getting elapsed time
    from time import time
    import timeit #imports timeit module
    start_time = timeit.default_timer() #defines start time so computational time can be calculated

    #define lists
    acc_score = [];
    Truth = [];
    Output = [];

    #loop though each fold (so 40 times in our case)
    for train_index , test_index in kf.split(df):
        
        #split the data
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, shuffle=True)

        #print(X_train); print(X_test); print(y_train); print(y_test)
        
        #fit the model
        model.fit(X_train,y_train)
        
        #preduct values
        pred_values = model.predict(X_test)

        #append the accuracy score
        acc = accuracy_score(pred_values, y_test)
        acc_score.append(acc)

        #add to the truth and output
        Truth.extend(y_test.values.reshape(y_test.shape[0])); ## it is a list
        Output.extend(pred_values); ## it is a list  
        

 
    #determine the time elapesed
    elapsed = timeit.default_timer() - start_time #gives total computation time

    #return the dataframe
    test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    #set output to a dataframe
    dfOutput = pd.DataFrame(data={"X Variable(s)": [Vars,],
                                  "Avg Accuracy": [np.mean(acc_score),],
                                  "SD of Accuracy": [np.std(acc_score),],
                                  "Sensitivity": [recall_score(Truth,Output,pos_label="negative"),],
                                  "Precision": [precision_score(Truth,Output,pos_label="negative"),],
                                  "F1-Score": [f1_score(Truth,Output,pos_label="negative"),],
                                  "Runtime": [elapsed,],
                                  "Confusion_Matrix": str(confusion_matrix(Truth,Output))}) #,"Real Specificity": [1,]
    #return output
    return dfOutput

    

In [188]:
ML_logistic_regression_summaries(["pdel","iso_point"])

Unnamed: 0,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,"[pdel, iso_point]",0.85,0.161245,0.861423,0.909091,0.884615,0.112095,[[230 37]\n [ 23 110]]


In [189]:
def compare_variables_logistic(inputList):
    outputList = []

    for i in inputList:
        outputList.append(ML_logistic_regression_summaries(i))

    outputList = pd.concat(outputList)

    return outputList

compare_variables_logistic([["pdel","iso_point"], ["pdel","deldel_G3"], ["iso_point", "deldel_G3"]])

Unnamed: 0,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,"[pdel, iso_point]",0.8475,0.13036,0.844106,0.917355,0.879208,0.132654,[[222 41]\n [ 20 117]]
0,"[pdel, deldel_G3]",0.64,0.13,0.54321,0.8,0.647059,0.11246,[[132 111]\n [ 33 124]]
0,"[iso_point, deldel_G3]",0.505,0.154839,0.593023,0.621951,0.607143,0.132087,[[153 105]\n [ 93 49]]


In [199]:
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler

def ML_KNN_summaries(Vars):
    
    #define X and y variables
    X = df.loc[:, Vars] 
    y = df.loc[:, "positive_negative"] 
    
    #standardize X --> Elaine doesn't think this is the right place to do so, so check the for loop
#     scaler = preprocessing.StandardScaler().fit(X_train)
#     X_scaled = scaler.transform(X_train)
#     print(X_scaled)    
    
    #set the model
    model = neighbors.KNeighborsClassifier(n_neighbors = 3)

    #set the CV
    kf = LeaveOneOut()
    
    #start timer for getting elapsed time
    from time import time
    import timeit #imports timeit module
    start_time = timeit.default_timer() #defines start time so computational time can be calculated

    #define lists
    acc_score = [];
    Truth = [];
    Output = [];

    #loop though each fold (so 40 times in our case)
    for train_index , test_index in kf.split(df):
        
        #split the data
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .25, shuffle=True)

        #print(X_train); print(X_test); print(y_train); print(y_test)
        
        #standardizing the X (you change line 44 to model.fit(X_scaled, y_train))
#         scaler = preprocessing.StandardScaler().fit(X_train)
#         X_scaled = scaler.transform(X_train)
        
        #fit the model
        model.fit(X,y_train)
        
        #preduct values
        pred_values = model.predict(X_test)

        #append the accuracy score
        acc = accuracy_score(pred_values, y_test)
        acc_score.append(acc)

        #add to the truth and output
        Truth.extend(y_test.values.reshape(y_test.shape[0])); ## it is a list
        Output.extend(pred_values); ## it is a list  
        

 
    #determine the time elapesed
    elapsed = timeit.default_timer() - start_time #gives total computation time

    #return the dataframe
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    
    #set output to a dataframe
    dfOutput = pd.DataFrame(data={"X Variable(s)": [Vars,],
                                  "Avg Accuracy": [np.mean(acc_score),],
                                  "SD of Accuracy": [np.std(acc_score),],
                                  "Sensitivity": [recall_score(Truth,Output,pos_label="negative"),],
                                  "Precision": [precision_score(Truth,Output,pos_label="negative"),],
                                  "F1-Score": [f1_score(Truth,Output,pos_label="negative"),],
                                  "Runtime": [elapsed,],
                                  "Confusion_Matrix": str(confusion_matrix(Truth,Output))}) #,"Real Specificity": [1,]
    #return output
    #test = pd.DataFrame(data={"Predicted:": Output, "Real": Truth})
    return dfOutput

ML_KNN_summaries(["pdel","iso_point"])

Unnamed: 0,X Variable(s),Avg Accuracy,SD of Accuracy,Sensitivity,Precision,F1-Score,Runtime,Confusion_Matrix
0,"[pdel, iso_point]",0.955,0.058949,1.0,0.935714,0.96679,0.430984,[[262 0]\n [ 18 120]]
