In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold 
from sklearn.metrics import roc_curve, classification_report, confusion_matrix
from sklearn.metrics import precision_score, accuracy_score, recall_score
from sklearn.metrics import precision_recall_curve, f1_score, roc_auc_score

In [70]:
data = pd.read_csv('model_data.csv')

In [71]:
# This choice comes from cross-validation using R's glmnet libraries.
var_list = ["SMS_received", "VisitNum", "CumNoShow", "Age.Grp", "Gap.Grp"] 
X = data[var_list]

In [72]:
y = data['Outcome']

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=15081947)
X_train, y_train = np.array(X_train), np.array(y_train) 
kf = KFold(n_splits=13, shuffle=True, random_state = 26011950)

In [74]:
accuracy = []
recall = []
precision = []
roc_auc = []
coeffs = []
intercept = []

for train, val in kf.split(X_train, y_train):
    X_trainfold, y_trainfold = X_train[train], y_train[train]
    X_valfold, y_valfold = X_train[val], y_train[val] 
    
    lr = LogisticRegression(solver='saga', penalty='l1')
    
    lr.fit(X_trainfold, y_trainfold)
    y_predict = lr.predict(X_valfold)
    
    # Get the model parameters.
    coeffs.append(lr.coef_)
    intercept.append(lr.intercept_)
    # Get the model performance.
    accuracy.append(accuracy_score(y_valfold, y_predict))
    recall.append(recall_score(y_valfold, y_predict))
    precision.append(precision_score(y_valfold, y_predict))
    roc_auc.append(roc_auc_score(y_valfold,y_predict))

In [75]:
print(f'Training set accuracy  = {round(np.mean(accuracy), 4)}.')
print(f'Training set recall    = {round(np.mean(recall), 4)}.')
print(f'Training set precision = {round(np.mean(precision), 4)}')
print(f'Training set ROC-AUC   = {round(np.mean(roc_auc), 4)}')

Training set accuracy  = 0.9357.
Training set recall    = 0.81.
Training set precision = 0.8629
Training set ROC-AUC   = 0.8887


In [76]:
print(var_list)
print(f'Model coefficients = {np.mean(coeffs, axis=0)}')

['SMS_received', 'VisitNum', 'CumNoShow', 'Age.Grp', 'Gap.Grp']
Model coefficients = [[ 0.3345824  -1.33860047  4.34529029 -0.03214357  0.31404746]]


Recall that <code>No-show == 1</code> means that the patient misses an appointment. The model predicts the probability of this event.

Chances of a patient missing an appointment increase with <code>CumNoShow, Gap.Grp</code>, they improve with <code>VisitNum</code> and <code>Age.Grp</code>. <code>CumNoShow</code> and <code>Gap.Grp</code> are the strongest predictor of a person not showing up. On the other hand, a patient tends **not** to miss an appointment if he/she has visited the hospital several times before. Older patients too tend not to miss an appointment.

In [77]:
print(f'Model intercept = {np.mean(intercept, axis=0)}')

Model intercept = [-1.45611994]


In [78]:
y_predict = lr.predict(X_test)

In [79]:
print(f'Test set accuracy  = {round(accuracy_score(y_test, y_predict), 4)}')
print(f'Test set recall    = {round(recall_score(y_test, y_predict), 4)}')
print(f'Test set precision = {round(precision_score(y_test, y_predict), 4)}')
print(f'Test set ROC-AUC   = {round(roc_auc_score(y_test,y_predict), 4)}')

Test set accuracy  = 0.9348
Test set recall    = 0.8181
Test set precision = 0.8564
Test set ROC-AUC   = 0.8914


In [80]:
labels = np.unique(y_test)
cm = pd.DataFrame(confusion_matrix(y_predict, y_test), index=labels, columns=labels)
cm # Prediction along y-axis, actual along x-axis

Unnamed: 0,0,1
0,8483,411
1,310,1849
