In [None]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_selection import f_classif
from sklearn.model_selection import KFold
from itertools import compress
import random
from collections import Counter

In [None]:
# Load the two pickled inputs for this analysis.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files written by a trusted earlier step of this analysis.
# NOTE(review): these absolute, machine-specific paths will not resolve on
# another machine; consider a configurable data directory.
file_Name1 = "C:/Users/Charity Faith/Desktop/Data/_02_xVar"
# `with` closes the handle even when unpickling raises.
with open(file_Name1, 'rb') as fileObject1:
    # xVar is used below as the column selector for the predictors (df[xVar]).
    xVar = pickle.load(fileObject1)

# Name of the outcome column; used below as df[yVar].
yVar = 'Outcome'

file_Name2 = "C:/Users/Charity Faith/Desktop/Data/_02_subsetDF_some"
with open(file_Name2, 'rb') as fileObject2:
    subsetDF_some = pickle.load(fileObject2)

# Last expression of the cell: show the first rows as a quick sanity check.
subsetDF_some.head()

In [None]:
def chooseParam(df_xVar, df_yVar, alpha=0.05):
    '''
    NAME: chooseParam
    DESCRIPTION: choose significant parameters, based on the ANOVA F-test
        computed by f_classif (one p-value per column of df_xVar).
    INPUT:
        df_xVar: DataFrame of candidate predictors, one column per variable
        df_yVar: class labels, aligned row-for-row with df_xVar
        alpha: significance threshold; a column is kept when its p-value < alpha
    OUTPUT: list of the column labels of df_xVar that passed, in column order
    '''
    _scores, pvalues = f_classif(df_xVar, df_yVar)
    # Take the names from the frame that was actually tested rather than from
    # the notebook-level xVar, so the result stays correct for any column
    # subset or ordering the caller passes in.
    # A NaN p-value compares False against alpha, so such a column is dropped.
    return [column for column, p in zip(df_xVar.columns, pvalues) if p < alpha]

### 10-FOLD CROSS-VALIDATION ###
### REPEAT 20 TIMES ###
def kFoldCV(df, xVar, yVar, alpha, k=10):
    '''
    NAME: kFoldCV
    DESCRIPTION: k-Fold Cross-Validation of a logistic regression whose
        predictors are re-selected with chooseParam inside every training fold.
    INPUT:
        df: DataFrame holding the predictor columns and the outcome column
        xVar: list of candidate predictor column names
        yVar: name of the outcome column (must hold exactly two classes)
        alpha: significance threshold forwarded to chooseParam
        k: number of folds
    OUTPUT: list [confMatrix, summary, xVar_count]
        confMatrix: 2x2 array of counts summed over the k test folds
        summary: array of [misclassification rate, accuracy rate, sensitivity,
            specificity, precision, false positive rate]
        xVar_count: Counter of how many folds selected each predictor
    RAISES: ValueError if the outcome column does not hold exactly two classes,
        or if no predictor is selected in some training fold
    '''
    classLabels = np.unique(df[yVar])
    if len(classLabels) != 2:
        raise ValueError(
            "kFoldCV expects exactly two outcome classes, found %d" % len(classLabels)
        )

    confMatrix = np.zeros((2, 2), dtype=int)
    xVar_count = Counter()

    # KFold is built without shuffling, so the folds follow the row order of
    # df and are fully deterministic; no seeding is needed here.
    kf = KFold(n_splits=k)
    for train_index, test_index in kf.split(df):
        train_rows, test_rows = df.iloc[train_index], df.iloc[test_index]
        xVar_train, yVar_train = train_rows[xVar], train_rows[yVar]
        xVar_test, yVar_test = test_rows[xVar], test_rows[yVar]

        # Select predictors on the training fold only, so the test fold does
        # not influence which variables enter the model.
        sig_xVar = chooseParam(xVar_train, yVar_train, alpha=alpha)
        if not sig_xVar:
            raise ValueError(
                "no predictor reached significance at alpha=%s in a training fold" % alpha
            )
        xVar_count.update(sig_xVar)

        # A very large C means very weak regularisation in scikit-learn.
        model = LogisticRegression(C=1e9)
        model.fit(xVar_train[sig_xVar], yVar_train)
        testPred = model.predict(xVar_test[sig_xVar])

        # Pin the label order: without `labels`, a test fold containing a
        # single class yields a 1x1 matrix that would broadcast into all four
        # cells of the running total.
        confMatrix += metrics.confusion_matrix(yVar_test, testPred, labels=classLabels)

    # NOTE(review): rows are true labels and columns are predictions, in
    # sorted label order, so this unpacking treats the FIRST (smallest) class
    # label as the "positive" class. Confirm that matches how the outcome is
    # coded; if the larger label is the positive class, sensitivity and
    # specificity below are swapped.
    TP, FN, FP, TN = confMatrix.ravel()
    # Every row is tested exactly once, so the matrix total is the sample
    # size of the frame that was passed in (not of a notebook-level frame).
    N = confMatrix.sum()

    summary = np.array([(FP+FN)/N, (TP+TN)/N, TP/(TP+FN), TN/(TN+FP), TP/(TP+FP), FP/(TN+FP)])
    return [confMatrix, summary, xVar_count]

def repeatkFoldCV(df, xVar, yVar, alpha, k=10, n=20, seed=1231):
    '''
    NAME: repeatkFoldCV
    DESCRIPTION: repeat k-Fold Cross-Validation n times, each time on a
        differently ordered copy of the rows, and average the results.
    INPUT:
        df: DataFrame holding the predictor columns and the outcome column
        xVar: list of candidate predictor column names
        yVar: name of the outcome column
        alpha: significance threshold forwarded to kFoldCV
        k: number of folds per repetition
        n: number of repetitions (at least 1)
        seed: base seed; repetition i shuffles with seed + i. Pass None for
            non-reproducible shuffles.
    OUTPUT: tuple (avg_confMatrix, avg_summary, finalCounter)
        avg_confMatrix: confusion matrix averaged over the n repetitions
        avg_summary: one-column DataFrame of the averaged rates, indexed by name
        finalCounter: Counter of predictor selections summed over all runs
    RAISES: ValueError if n < 1
    '''
    if n < 1:
        raise ValueError("n must be at least 1")

    total_confMatrix = np.zeros((2, 2), dtype=int)
    total_summary = np.zeros(6)
    finalCounter = Counter()

    for i in range(n):
        # kFoldCV builds its folds from the row order of the frame it gets,
        # so repeating it on the same frame would give n identical runs.
        # Reordering the rows with a per-repetition seed gives each run a
        # different, reproducible partition.
        run_seed = None if seed is None else seed + i
        shuffled = df.sample(frac=1, random_state=run_seed)

        tempRun = kFoldCV(shuffled, xVar, yVar, alpha, k)
        total_confMatrix += tempRun[0]
        total_summary += tempRun[1]
        finalCounter += tempRun[2]

    avg_confMatrix = total_confMatrix / n
    avg_summary = pd.DataFrame(
        total_summary / n,
        index=['Misclassification Rate', 'Accuracy Rate', 'Sensitivity Rate',
               'Specificity Rate', 'Precision', 'False Positive Rate'],
    )

    return (avg_confMatrix, avg_summary, finalCounter)

In [None]:
def plotBestCount(df, xVar, yVar, k=10, n=20, alphaLevels=None):
    '''
    NAME: plotBestCount
    DESCRIPTION: show plot of variable count vs. (test) accuracy rate, with
        one point per alpha level.
    INPUT:
        df: DataFrame holding the predictor columns and the outcome column
        xVar: list of candidate predictor column names
        yVar: name of the outcome column
        k: number of folds per repetition
        n: number of repetitions
        alphaLevels: significance thresholds to sweep; defaults to
            [0.00001, 0.0001, 0.001, 0.005, 0.01, 0.05]
    OUTPUT: NA (draws the figure)
    '''
    # TODO: decide whether the sweep should be by alpha level or by variable count.
    if alphaLevels is None:
        alphaLevels = [0.00001, 0.0001, 0.001, 0.005, 0.01, 0.05]

    xCount = []
    yAccuracy = []
    for a in alphaLevels:
        result = repeatkFoldCV(df, xVar, yVar, alpha=a, k=k, n=n)
        summary, selectionCounts = result[1], result[2]

        # Count the variables selected as often as the most-selected one.
        # max() must run over the counts: max() over the Counter itself
        # would return the alphabetically last variable name instead.
        topCount = max(selectionCounts.values(), default=0)
        xCount.append(sum(1 for c in selectionCounts.values() if c == topCount))

        # The summary row is a one-element row; plot it as a plain float.
        yAccuracy.append(float(np.ravel(summary.loc['Accuracy Rate'])[0]))

    plt.plot(xCount, yAccuracy, 'ro')
    plt.title('Variable Selection')
    plt.xlabel('Number of Variables Included in the Model')
    plt.ylabel('(Test) Accuracy Rate')
    plt.show()

# Sweep the alpha levels on the loaded data: 10 folds, 20 repetitions.
plotBestCount(df=subsetDF_some, xVar=xVar, yVar=yVar, k=10, n=20)