In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold 
from sklearn.metrics import roc_curve, classification_report, confusion_matrix
from sklearn.metrics import precision_score, accuracy_score, recall_score
from sklearn.metrics import precision_recall_curve, f1_score, roc_auc_score

In [2]:
data = pd.read_csv('model_data.csv')

In [20]:
# This choice comes from cross-validation using R's glmnet libraries.
X = data[["SMS_received", "VisitNum", "CumNoShow", "Age.Grp", "Gap.Grp"]]

In [21]:
y = data['Outcome']

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=15081947)
X_train, y_train = np.array(X_train), np.array(y_train) 
kf = KFold(n_splits=13, shuffle=True, random_state = 26011950)

In [52]:
accuracy = []
recall = []
precision = []
roc_auc = []
coeffs = []
intercept = []

for train, val in kf.split(X, y):
    X_trainfold, y_trainfold = X_train[train], y_train[train]
    X_valfold, y_valfold = X_train[val], y_train[val] 
    
    lr = LogisticRegression(solver='saga', penalty='l1')
    
    lr.fit(X_trainfold, y_trainfold)
    y_predict = lr.predict(X_valfold)
    
    # Get the model parameters.
    coeffs.append(reg.coef_)
    intercept.append(reg.intercept_)
    # Get the model performance.
    accuracy.append(accuracy_score(y_valfold, y_predict))
    recall.append(recall_score(y_valfold, y_predict))
    precision.append(precision_score(y_valfold, y_predict))
    roc_auc.append(roc_auc_score(y_valfold,y_predict))

In [60]:
print(f'Training set accuracy = {np.mean(accuracy)}.')
print(f'Training set recall = {np.mean(recall)}.')
print(f'Training set precision = {np.mean(precision)}')
print(f'Training set ROC-AUC = {np.mean(roc_auc)}')

Training set accuracy = 0.9358387964679834.
Training set recall = 0.8129856532279033.
Training set precision = 0.8615509443083692
Training set ROC-AUC = 0.8899606376342433


In [64]:
print('["SMS_received", "VisitNum", "CumNoShow", "Age.Grp", "Gap.Grp"]')
print(f'Model coefficients = {np.mean(coeffs, axis=0)}')

["SMS_received", "VisitNum", "CumNoShow", "Age.Grp", "Gap.Grp"]
Model coefficients = [[ 0.35307603 -1.3291846   4.3236461  -0.0299768   0.30989835]]


Recall that <code>No-show == 1</code> means that the patient misses an appointment. The model predicts the probability of this event.

Chances of a patient missing an appointment increase with <code>SMS_received, CumNoShow, Gap.Grp</code>, they improve with <code>VisitNum</code> and <code>Age.Grp</code>. <code>CumNoShow</code> and <code>Gap.Grp</code> are the strongest predictor of a person not showing up. On the other hand, a patient tends **not** to miss an appointment if he/she has visited the hospital several times before. Older patients too tend not to miss an appointment.

In [62]:
print(f'Model intercept = {np.mean(intercept, axis=0)}')

Model intercept = [-1.45915179]


In [56]:
y_predict = lr.predict(X_test)

In [63]:
print(f'Test set accuracy = {accuracy_score(y_test, y_predict)}')
print(f'Test set recall = {recall_score(y_test, y_predict)}')
print(f'Test set precision = {precision_score(y_test, y_predict)}')
print(f'Test set ROC-AUC = {roc_auc_score(y_test,y_predict)}')

Test set accuracy = 0.9341355288157062
Test set recall = 0.8132743362831858
Test set precision = 0.8572761194029851
Test set ROC-AUC = 0.8892369634333023
