Hello Ben,

I enjoyed solving the problem and I want to utilise the 24 hours you've given me. Kindly view this notebook as a supplement to the main submission. I start with the cleaned version of the data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# Load the cleaned diabetes data set (a CSV file expected in the working
# directory) into the data frame D used by every later cell.
D = pd.read_csv('clean_diabetes.csv')

In [None]:
# Display summary statistics of the freshly loaded data.
D.describe()

In [None]:
# Discard the 'Unnamed: 0' column (presumably a row index left over from the
# CSV export - confirm), then remove outliers by keeping only the rows whose
# Insulin value is below 400.
D = D.drop(columns=['Unnamed: 0']).loc[lambda frame: frame['Insulin'] < 400]
D.describe()

We redefine the helper functions. It would have been nice if I could have imported them as a module!

In [None]:
def trn_tst_split(data, trn_pct):
    """Split a data frame into random training and test subsets.

    The split is reproducible: a fixed seed drives a private random number
    generator, so repeated calls on data with the same number of rows always
    select the same rows.

    Args:
        data: the data frame to be split.
        trn_pct: fraction of rows used for training. Must be between 0 and 1.

    Returns:
        A list of two data frames - training and test. Every row of `data`
        lands in exactly one of them, and rows keep their original relative
        order within each subset.

    Raises:
        ValueError: if `trn_pct` is not between 0 and 1.
    """
    if not 0 <= trn_pct <= 1:
        raise ValueError(f'trn_pct must be between 0 and 1, got {trn_pct}')

    n_rows = data.shape[0]
    n_train = int(n_rows * trn_pct)

    # A dedicated generator picks the same rows as seeding the module-level
    # generator would, but leaves the global `random` state untouched for
    # the rest of the program.
    rng = random.Random(12111842)
    trn_indices = sorted(rng.sample(range(n_rows), n_train))

    # Everything not drawn for training is test data. Building both index
    # lists in ascending order makes the row order independent of set
    # iteration order.
    chosen = set(trn_indices)
    tst_indices = [i for i in range(n_rows) if i not in chosen]

    trn_data = data.iloc[trn_indices, :]
    tst_data = data.iloc[tst_indices, :]

    return [trn_data, tst_data]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def print_cm_results(cm):
    """Print diagnostics from a 2x2 binary confusion matrix.

    The matrix is read in the layout that build_cm produces with
    pd.crosstab(actual, predicted): rows are the actual class, columns the
    predicted class, and class 0 (negative) comes first:

        [[TN, FP],
         [FN, TP]]

    Args:
        cm: 2x2 array-like of counts in the layout above.

    Raises:
        ValueError: if `cm` is not 2x2.

    A metric whose denominator is zero is printed as nan.
    """
    counts = np.asarray(cm, dtype=float)
    if counts.shape != (2, 2):
        raise ValueError(f'expected a 2x2 confusion matrix, got shape {counts.shape}')

    (tn, fp), (fn, tp) = counts

    recall = _safe_ratio(tp, tp + fn)        # share of actual +ves that were found
    precision = _safe_ratio(tp, tp + fp)     # share of predicted +ves that are right
    specificity = _safe_ratio(tn, tn + fp)   # share of actual -ves that were found
    f1_score = _safe_ratio(2 * recall * precision, recall + precision)
    accuracy = _safe_ratio(tn + tp, counts.sum())

    print(f'% +ves correctly predicted (recall) = {round(recall * 100, 2)}')
    print(f'% +ves detected out of all (precision) = {round(precision * 100, 2)}')
    print(f'% -ves detected out of all (specificity) = {round(specificity * 100, 2)}')
    print(f'f1 score = {round(f1_score, 2)}')
    print(f'accuracy = {round(accuracy * 100, 2)}')
    
def get_model_matrices(D, formula):
    """Build the model matrices described by `formula` from the data frame `D`.

    Forwards to `dmatrices`, asking for data frames, and returns its result
    unchanged.
    """
    # NOTE(review): `dmatrices` is not imported in the visible part of this
    # file - presumably patsy.dmatrices; confirm it is in scope.
    matrices = dmatrices(formula, data=D, return_type='dataframe')
    return matrices

def prob_to_outcome(y, threshold=0.5):
    """Map a probability to a binary class label.

    Returns 0 when `y` is strictly below `threshold` and 1 otherwise, so a
    value exactly equal to the threshold is classified as 1.
    """
    return 0 if y < threshold else 1
    
def build_cm(yt, yp, threshold=0.5):
    """Generate a 2x2 confusion matrix from actual and predicted values.

    Args:
        yt: actual binary outcomes (0/1) - a single-column data frame, a
            Series or any one-dimensional array-like. It is not modified.
        yp: predicted probabilities, one per actual outcome.
        threshold: probabilities strictly below this value are classified
            as 0, everything else as 1 (the same rule as prob_to_outcome).

    Returns:
        A 2x2 numpy array with rows = actual class and columns = predicted
        class, class 0 first:

            [[TN, FP],
             [FN, TP]]

        The shape is always 2x2, even when one class never occurs among the
        actual or the predicted values.

    Raises:
        ValueError: if `yt` is not one-dimensional (or a single column), if
            it holds values other than 0 and 1, or if the lengths differ.
    """
    actual = np.asarray(yt)
    if actual.ndim == 2 and actual.shape[1] == 1:
        actual = actual[:, 0]
    if actual.ndim != 1:
        raise ValueError('yt must be one-dimensional or a single column')
    actual = actual.astype(int)
    if not np.isin(actual, (0, 1)).all():
        raise ValueError('yt must contain only the classes 0 and 1')

    # `< threshold` (rather than `>=`) mirrors prob_to_outcome exactly,
    # including for NaN probabilities, which end up in class 1.
    predicted = np.where(np.asarray(yp, dtype=float) < threshold, 0, 1)
    if predicted.shape != actual.shape:
        raise ValueError('yt and yp must have the same length')

    # Work on fresh Series so the caller's `yt` is left untouched.
    table = pd.crosstab(pd.Series(actual, name='actual'),
                        pd.Series(predicted, name='predicted'))

    # crosstab only emits labels that occur; force both classes onto both
    # axes so the diagonal always holds the correct predictions.
    table = table.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

    return table.to_numpy()

Perhaps there is a Python library that carries out cross-validation, but I don't yet know of one. Therefore, I implemented a few functions to carry out 30-fold cross-validation. We have 750 observations. In each iteration I build a model using 725 of them and test it on the remaining 25. I choose the 25 test observations sequentially: in the first iteration rows 0-24 are the test rows, in the second iteration rows 25-49 are the test rows, and so on. In each iteration, I compute the accuracy of the model and the model coefficients, and I save them in two lists.

Finally, I will take an average of the model coefficients.


In [None]:
# Number of cross-validation folds. The accompanying text describes 30 folds
# of 25 test rows each (for 750 observations).
N_FOLDS = 30

n_rows = D.shape[0]
# Fold boundaries computed with integer arithmetic, so no float rounding can
# shift a row between neighbouring folds. Fold k tests rows
# boundaries[k] .. boundaries[k + 1] - 1.
boundaries = np.arange(N_FOLDS + 1) * n_rows // N_FOLDS
all_params = []
all_accuracies = []

# NOTE(review): `f_v2` (the model formula) and `sm` are not defined in the
# visible part of this file - presumably they come from the main submission
# (statsmodels.api as sm?); confirm they are in scope before running.
for lo, hi in zip(boundaries[:-1], boundaries[1:]):
    print(f'{lo}, {hi}')
    # Rows lo..hi-1 are held out for testing; all other rows train the model.
    D_test = D.iloc[lo:hi, :]
    D_train = pd.concat([D.iloc[:lo, :], D.iloc[hi:, :]], axis=0)
    yt, Xt = get_model_matrices(D_test, f_v2)
    yn, Xn = get_model_matrices(D_train, f_v2)
    model = sm.Logit(yn, Xn)
    results = model.fit()
    yp = results.predict(Xt)
    cm = build_cm(yt, yp)

    # Accuracy = correctly classified rows (the diagonal) / all test rows.
    all_accuracies.append(np.trace(cm)/np.sum(cm))
    all_params.append(results.params)
    

In [None]:
# The accuracies are NumPy scalars; convert them to plain floats so the list
# prints as bare numbers regardless of how NumPy renders scalar reprs.
print([round(float(a), 2) for a in all_accuracies])

In [None]:
# Put the per-fold coefficient vectors side by side, one column per fold ...
all_params_df = pd.concat(all_params, axis=1)
# ... then move the index (presumably the coefficient names - confirm) into
# an ordinary first column.
all_params_df = all_params_df.reset_index()

In [None]:
# Display the table of coefficients collected from every fold.
all_params_df

In [None]:
from scipy.stats import sem, t

def get_mean_with_ci(X, conf=0.95):
    """Return the sample mean of X with a two-sided Student-t confidence interval.

    Args:
        X: one-dimensional sequence of numbers (list, Series or array).
            Object-dtype arrays holding numbers are accepted; the values
            are converted to floats first.
        conf: confidence level of the interval, strictly between 0 and 1.
            Defaults to 0.95.

    Returns:
        A tuple (mean, lower bound, upper bound). With fewer than two
        observations the interval is undefined and both bounds are NaN;
        for empty input the mean is NaN as well.
    """
    values = np.asarray(X, dtype=float)
    n = len(values)
    nan = float('nan')

    if n == 0:
        return (nan, nan, nan)

    m = np.mean(values)
    if n < 2:
        # The standard error needs at least two observations.
        return (m, nan, nan)

    # Half-width = standard error of the mean times the t quantile with
    # n - 1 degrees of freedom.
    h = sem(values) * t.ppf((1 + conf) / 2, n - 1)

    return (m, m - h, m + h)

The final results of the exercise are the outputs of the two cells below. In particular, when we put the model in production, we will use the mean values of the coefficients printed in the last cell. The 95% confidence interval of the model's accuracy is printed in the cell below.

In [None]:
# Mean cross-validation accuracy with its confidence interval:
# get_mean_with_ci returns (mean, lower bound, upper bound), in that order.
(m, l, r) = get_mean_with_ci(all_accuracies)
print(f'Accuracies mean = {round(m, 2)}, C.I. = [{round(l, 2)}, {round(r, 2)}]')

In [None]:
all_intervals = []
all_rows = []

# Each row of all_params_df describes one coefficient: its name in the first
# position, followed by its estimate from every fold.
for _, series in all_params_df.iterrows():
    row = series.to_numpy()
    name = row[0]
    all_rows.append(row)
    # The row mixes the name with numbers, so cast the estimates to a float
    # array before handing them to the statistics helper.
    estimates = row[1:].astype(float)
    info = get_mean_with_ci(estimates)
    # Keep (mean, lower, upper) for later use, in the same order as all_rows.
    all_intervals.append(info)
    print(f'{name}: coeff-mean = {round(info[0], 3)}, C.I. = [{round(info[1], 3)}, {round(info[2], 3)}]')