Hello Ben,

I enjoyed solving the problem and I want to utilise the 24 hours you've given me. Kindly view this notebook as a supplement to the main submission. I start with the cleaned version of the data.

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# Load the cleaned diabetes data set that the rest of the notebook works on.
DATA_FILE = 'clean_diabetes.csv'
D = pd.read_csv(DATA_FILE)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the loaded frame.
D.describe()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,383.5,3.785156,120.946615,69.140625,20.94401,77.072917,31.980469,0.467483,33.769531,0.348958
std,221.846794,3.291875,30.990784,19.068969,15.527097,110.316623,7.735926,0.323716,15.626516,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078,-1.0,0.0
25%,191.75,1.0,100.0,64.0,0.0,0.0,27.5,0.251,24.0,0.0
50%,383.5,3.0,117.0,72.0,23.0,36.0,32.0,0.3725,29.0,0.0
75%,575.25,6.0,139.0,80.0,32.0,122.0,36.225,0.602,40.0,1.0
max,767.0,17.0,198.0,122.0,99.0,846.0,67.1,2.42,200.0,1.0


In [9]:
# 'Unnamed: 0' is not a model input.
# NOTE(review): it looks like a row index saved along with the CSV - confirm.
D = D.drop('Unnamed: 0', axis=1)
# Keep only the rows whose Insulin reading is below 400; higher readings are
# treated as outliers.
D = D[D['Insulin'] < 400]
D.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0
mean,3.796,120.124,69.016,20.616,66.373333,31.8916,0.466344,33.782667,0.342667
std,3.29457,30.675429,19.213987,15.493976,85.567449,7.740028,0.32041,15.686157,0.474918
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,-1.0,0.0
25%,1.0,100.0,64.0,0.0,0.0,27.5,0.251,24.0,0.0
50%,3.0,117.0,72.0,23.0,36.0,32.0,0.3725,29.0,0.0
75%,6.0,138.0,78.0,32.0,115.0,36.1,0.6,40.0,1.0
max,17.0,198.0,122.0,99.0,392.0,67.1,2.42,200.0,1.0


We redefine the helper functions here. It would have been nice if I could have imported them as a module!

In [25]:
def trn_tst_split(data, trn_pct):
    """Split a data frame into a training part and a test part.

    Rows are chosen by position. The same fixed seed is used on every call,
    so equal inputs always give the same split.

    Args:
        data: the data frame to be split.
        trn_pct: fraction of the rows that goes into the training part.
            Must be between 0 and 1 (inclusive).

    Returns:
        A list of two data frames: [training, test].

    Raises:
        ValueError: if trn_pct is not between 0 and 1.
    """
    if not 0 <= trn_pct <= 1:
        raise ValueError(f'trn_pct must be between 0 and 1, got {trn_pct}')

    # A private generator keeps the split reproducible without resetting the
    # state of the shared `random` module for the rest of the program.
    rng = random.Random(12111842)
    n_rows = data.shape[0]
    n_train = int(n_rows * trn_pct)
    trn_indices = set(rng.sample(range(n_rows), n_train))
    # Every position not drawn for training goes to the test part, so the two
    # parts are disjoint and together cover all rows by construction.
    tst_indices = set(range(n_rows)) - trn_indices

    trn_data = data.iloc[list(trn_indices), :]
    tst_data = data.iloc[list(tst_indices), :]

    return [trn_data, tst_data]


def print_cm_results(cm):
    """Print recall, precision, specificity, F1 score and accuracy.

    The matrix is read with the layout that `pd.crosstab(actual, predicted)`
    gives for classes 0 and 1 (the layout `build_cm` in this notebook
    returns): rows are the actual class, columns the predicted class, and
    class 0 comes first:

        cm[0, 0] = true negatives    cm[0, 1] = false positives
        cm[1, 0] = false negatives   cm[1, 1] = true positives

    Args:
        cm: 2x2 confusion matrix (numpy array or nested sequence).

    Raises:
        ValueError: if cm is not 2x2.
    """
    cm = np.asarray(cm)
    if cm.shape != (2, 2):
        raise ValueError(f'expected a 2x2 confusion matrix, got shape {cm.shape}')

    tn, fp = float(cm[0, 0]), float(cm[0, 1])
    fn, tp = float(cm[1, 0]), float(cm[1, 1])

    def ratio(numerator, denominator):
        # A metric whose denominator is zero is undefined; report it as nan
        # instead of raising.
        return numerator / denominator if denominator else float('nan')

    recall = ratio(tp, tp + fn)            # share of actual positives found
    precision = ratio(tp, tp + fp)         # share of predicted positives that are right
    specificity = ratio(tn, tn + fp)       # share of actual negatives found
    f1_score = ratio(2*recall*precision, recall + precision)
    accuracy = ratio(tp + tn, tp + tn + fp + fn)

    print(f'% +ves correctly predicted (recall) = {round(recall * 100, 2)}')
    print(f'% +ves detected out of all (precision) = {round(precision * 100, 2)}')
    print(f'% -ves detected out of all (specificity) = {round(specificity * 100, 2)}')
    print(f'f1 score = {round(f1_score, 2)}')
    print(f'accuracy = {round(accuracy * 100, 2)}')
    
def get_model_matrices(D, formula):
    """Build the model matrices for `formula` from the data frame `D`.

    Thin wrapper that forwards to `dmatrices` with `return_type='dataframe'`.
    Callers in this notebook unpack the result as `y, X`.

    NOTE(review): `dmatrices` is not imported in the visible cells; presumably
    it is patsy's `dmatrices`, imported earlier in the session - confirm.
    """
    matrices = dmatrices(formula, data=D, return_type='dataframe')
    return matrices

def prob_to_outcome(y, threshold=0.5):
    """Map a probability to a class: 0 when `y` is below `threshold`, else 1."""
    return 0 if y < threshold else 1
    
def build_cm(yt, yp, threshold=0.5):
    """Build a 2x2 confusion matrix from actual outcomes and predicted probabilities.

    Args:
        yt: single-column data frame with the actual outcomes (values that
            `int()` turns into 0 or 1). It is left unmodified.
        yp: iterable of predicted probabilities, in the same row order as yt.
        threshold: probabilities below this value are classed as 0, all
            others as 1 (the same rule `prob_to_outcome` applies).

    Returns:
        A 2x2 numpy array `cm` in which `cm[i, j]` is the number of rows whose
        actual class is `i` and whose predicted class is `j`. A class that
        never occurs still gets its row/column, filled with zeros.

    Raises:
        ValueError: if yt does not have exactly one column, or if yt and yp
            have different lengths.
    """
    if yt.shape[1] != 1:
        raise ValueError(f'yt must have exactly one column, got {yt.shape[1]}')

    # Work on fresh Series so the caller's frame is neither extended nor renamed.
    actual = pd.Series([int(v) for v in yt.iloc[:, 0]], name='actual')
    probabilities = np.asarray(list(yp), dtype=float)
    predicted = pd.Series(np.where(probabilities < threshold, 0, 1), name='predicted')

    if len(actual) != len(predicted):
        raise ValueError(
            f'yt has {len(actual)} rows but yp has {len(predicted)} values'
        )

    table = pd.crosstab(actual, predicted)
    # crosstab only lists the classes it has seen; force both classes onto
    # both axes so the result is always 2x2 and the diagonal holds the hits.
    table = table.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

    return table.to_numpy().astype(int)

Perhaps there is a library in Python to carry out cross-validation, but I don't yet know about it. Therefore, I implemented a few functions to carry out 25-fold cross-validation. We have 750 observations. In each iteration I build a model using 720 of them and use the remaining 30 to test. I choose the 30 test observations sequentially: in the first iteration rows 0-29 are the test rows, in the second one rows 30-59, and so on. In each iteration, I compute the accuracy of the model and the model coefficients, and I save them in two lists.

Finally, I will take an average of the model coefficients.


In [34]:
# 26 evenly spaced boundary points delimit 25 consecutive folds.
n_rows = D.shape[0]
boundaries = np.linspace(0, n_rows, 26)
all_params = []      # fitted coefficients, one entry per fold
all_accuracies = []  # hold-out accuracy, one entry per fold

for lower, upper in zip(boundaries[:-1], boundaries[1:]):
    l, r = int(lower), int(upper)
    print(f'{l}, {r}')
    # Rows [l, r) are held out for testing; every other row is used for fitting.
    D_test = D.iloc[l:r, :]
    D_train = pd.concat([D.iloc[:l, :], D.iloc[r:, :]], axis=0)
    # NOTE(review): f_v2 (the model formula) and sm (presumably
    # statsmodels.api) are not defined in the visible cells - confirm they
    # are in scope from the main submission.
    yt, Xt = get_model_matrices(D_test, f_v2)
    yn, Xn = get_model_matrices(D_train, f_v2)
    model = sm.Logit(yn, Xn)
    results = model.fit()
    yp = results.predict(Xt)
    cm = build_cm(yt, yp)

    # Accuracy = diagonal of the confusion matrix / total number of test rows.
    all_accuracies.append(np.trace(cm)/np.sum(cm))
    all_params.append(results.params)
    

0, 30
Optimization terminated successfully.
         Current function value: 0.470162
         Iterations 6
30, 60
Optimization terminated successfully.
         Current function value: 0.479003
         Iterations 6
60, 90
Optimization terminated successfully.
         Current function value: 0.484953
         Iterations 6
90, 120
Optimization terminated successfully.
         Current function value: 0.483766
         Iterations 6
120, 150
Optimization terminated successfully.
         Current function value: 0.479860
         Iterations 6
150, 180
Optimization terminated successfully.
         Current function value: 0.485337
         Iterations 6
180, 210
Optimization terminated successfully.
         Current function value: 0.478708
         Iterations 6
210, 240
Optimization terminated successfully.
         Current function value: 0.476786
         Iterations 6
240, 270
Optimization terminated successfully.
         Current function value: 0.477413
         Iterations 6
270, 300


In [36]:
# Per-fold accuracies, rounded to two decimals for display.
rounded_accuracies = [round(acc, 2) for acc in all_accuracies]
print(rounded_accuracies)

[0.6, 0.7, 0.87, 0.83, 0.77, 0.9, 0.7, 0.67, 0.7, 0.67, 0.7, 0.8, 0.77, 0.8, 0.77, 0.7, 0.87, 0.77, 0.8, 0.93, 0.77, 0.63, 0.77, 0.77, 0.77]


In [39]:
# Put the per-fold coefficient Series side by side (one column per fold), then
# move the coefficient names out of the index into a regular 'index' column.
params_by_fold = pd.concat(all_params, axis=1)
all_params_df = params_by_fold.reset_index()

In [40]:
# Display the coefficient table: one row per coefficient, one column per fold.
all_params_df

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,15,16,17,18,19,20,21,22,23,24
0,Intercept,-8.904226,-8.743452,-8.459655,-8.458774,-8.554322,-8.486079,-8.615895,-8.696815,-8.685007,...,-8.813801,-8.571501,-8.508374,-8.425819,-8.277207,-8.644413,-8.794412,-8.755106,-8.539364,-8.544492
1,Pregnancies,0.132456,0.131429,0.132501,0.131689,0.131788,0.126393,0.127783,0.126877,0.128656,...,0.135763,0.127942,0.119695,0.142992,0.127544,0.122811,0.139749,0.125484,0.125112,0.143938
2,Glucose,0.035006,0.036191,0.034707,0.034253,0.035055,0.034378,0.034825,0.035551,0.035793,...,0.036372,0.036113,0.035323,0.033959,0.033001,0.035289,0.034824,0.03384,0.035002,0.033835
3,BMI,0.087112,0.079521,0.077771,0.07856,0.079265,0.080717,0.082737,0.080584,0.080995,...,0.0816,0.076312,0.078556,0.078875,0.078307,0.081696,0.086256,0.088747,0.079983,0.081103
4,DiabetesPedigreeFunction,1.034731,1.064587,0.933277,0.981074,0.92896,0.911971,0.871549,1.012282,0.924514,...,0.992899,0.955082,0.95591,0.939139,0.982489,0.981492,0.954487,1.050669,0.926435,0.966304


In [62]:
from scipy.stats import sem, t

def get_mean_with_ci(X, conf=0.95):
    """Return the mean of `X` together with a two-sided Student-t confidence interval.

    Args:
        X: one-dimensional sequence of numbers (list, Series or array; an
            object-typed array of numbers is accepted as well).
        conf: confidence level, strictly between 0 and 1. Defaults to 0.95.

    Returns:
        A tuple (mean, lower bound, upper bound).

    Raises:
        ValueError: if conf is not strictly between 0 and 1.
    """
    if not 0 < conf < 1:
        raise ValueError(f'conf must be strictly between 0 and 1, got {conf}')

    # Coerce to a float array so that object-typed input (for example a slice
    # of a mixed-type data frame row) is handled as plain numbers.
    values = np.asarray(X, dtype=float)
    m = np.mean(values)
    N = len(values)
    # Half-width = standard error of the mean * t quantile with N - 1 degrees
    # of freedom.
    h = sem(values) * t.ppf((1 + conf)/2, N - 1)

    return (m, m - h, m + h)

The final results of the exercise are the outputs of the two cells below. In particular, when we put the model in production, we will use the mean values of the coefficients printed in the last cell. The 95% confidence interval of the model's accuracy is printed in the cell below.

In [77]:
# Mean hold-out accuracy over the folds, with its confidence interval.
acc_mean, acc_low, acc_high = get_mean_with_ci(all_accuracies)
acc_summary = f'Accuracies mean = {round(acc_mean, 2)}, C.I. = [{round(acc_low, 2)}, {round(acc_high, 2)}]'
print(acc_summary)

Accuracies mean = 0.76, C.I. = [0.73, 0.79]


In [74]:
all_intervals = []  # NOTE(review): initialised here but never filled in this cell
all_rows = []

# Each row of all_params_df describes one coefficient: its name comes first,
# followed by the value fitted in every fold.
for _, record in all_params_df.iterrows():
    row = record.to_numpy()
    name, fold_values = row[0], row[1:]
    all_rows.append(row)
    info = get_mean_with_ci(fold_values)
    print(f'{name}: coeff-mean = {round(info[0], 3)}, C.I. = [{round(info[1], 3)}, {round(info[2], 3)}]')

Intercept: coeff-mean = -8.59, C.I. = [-8.651, -8.529]
Pregnancies: coeff-mean = 0.132, C.I. = [0.129, 0.134]
Glucose: coeff-mean = 0.035, C.I. = [0.034, 0.035]
BMI: coeff-mean = 0.081, C.I. = [0.079, 0.082]
DiabetesPedigreeFunction: coeff-mean = 0.972, C.I. = [0.946, 0.997]
