In [1]:
#import libraries
import pandas as pd

import imblearn
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [2]:
#import data
df = pd.read_csv('med_appt_encoded.csv')
df.sample(5)

Unnamed: 0,patientID,apptID,gender,sched_date,appt_date,age,neighborhood,scholarship,hypertension,diabetes,...,sms_received,noshow,waiting,sched_day,appt_day,sched_weekday,appt_weekday,weather_cloudy,weather_partly sunny,weather_sunny
69788,15392583116467,5712280,1,2016-05-18,2016-05-18,69,2.0,0,1,0,...,0,0,0,2,2,1,1,0,0,1
50576,835193285199876,5619886,0,2016-04-26,2016-05-05,51,5.2,1,0,1,...,1,0,9,1,3,1,1,0,0,1
3912,372245268797748,5680566,1,2016-05-10,2016-05-11,6,2.0,0,0,0,...,0,0,1,1,2,1,1,0,0,1
8049,671452879114991,5714022,0,2016-05-18,2016-05-24,39,3.8,0,0,0,...,1,0,6,2,1,1,1,1,0,0
9139,631652118625267,5704659,1,2016-05-16,2016-05-24,30,3.9,0,0,0,...,0,0,8,0,1,1,1,1,0,0


In [3]:
# delete unnecessary columns
df.drop(columns=['patientID', 'apptID', 'sched_date', 'appt_date'], inplace=True)

In [4]:
# check data correlation to no-show
df.corr()['noshow'].sort_values(ascending=False)[1:]

waiting                 0.234944
sms_received            0.134966
scholarship             0.029709
sched_weekday           0.007205
weather_partly sunny    0.007187
appt_day                0.006144
sched_day               0.001425
alcoholism              0.000109
appt_weekday           -0.000188
neighborhood           -0.001088
weather_cloudy         -0.001214
weather_sunny          -0.003628
handicap               -0.005735
gender                 -0.007588
diabetes               -0.011860
hypertension           -0.032342
age                    -0.050129
Name: noshow, dtype: float64

In [5]:
# drop columns with low correlation
df.drop(columns=['appt_weekday', 'alcoholism', 'sched_day', 'neighborhood', 'appt_day', 'weather_cloudy', 'weather_sunny', 'handicap'], inplace=True)

In [6]:
# split data

X = df.drop(columns='noshow')
y = df['noshow']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=0.8, random_state=42)

In [7]:
# imbalance handling
sm = SMOTE()
X_sm, y_sm = sm.fit_sample(X_train, y_train)

# Logistic Regression

### Base Model

In [8]:
# building model
LR = LogisticRegression()
LR.fit(X_sm, y_sm)
y_pred_LR = LR.predict(X_test)

In [10]:
# metrics
acc_LR = accuracy_score(y_test, y_pred_LR)
pre_LR = precision_score(y_test, y_pred_LR)
rec_LR = recall_score(y_test, y_pred_LR)
f1_LR = f1_score(y_test, y_pred_LR)

In [11]:
# classification report
print(classification_report(y_test, y_pred_LR))

              precision    recall  f1-score   support

           0       0.96      0.71      0.82     17981
           1       0.11      0.57      0.18      1080

    accuracy                           0.70     19061
   macro avg       0.54      0.64      0.50     19061
weighted avg       0.92      0.70      0.78     19061



In [12]:
# actual-prediction table
cm_LR = confusion_matrix(y_test, y_pred_LR, labels = [1,0])
df_LR = pd.DataFrame(data = cm_LR , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_LR

Unnamed: 0,Pred 1,Pred 0
Act 1,617,463
Act 0,5227,12754


### Hyperparameter Tuning

In [13]:
LR_tuning = LogisticRegression(random_state=42)
para_LR = {'C': [1, 0.01, 0.015,  0.1, 0.5, 1.2, 2],
           'penalty': ['l2', 'l1', 'elasticnet'], }
LR_tuning = GridSearchCV(estimator=LR_tuning, param_grid=para_LR, cv=3, n_jobs=-1 , verbose=1, scoring = 'recall')

In [14]:
LR_tuning.fit(X_sm, y_sm)
LR_tuned = LR_tuning.best_estimator_
y_pred_LR_tuned = LR_tuned.predict(X_test)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:    4.4s finished


In [15]:
LR_tuning.best_params_

{'C': 1.2, 'penalty': 'l2'}

In [16]:
# metrics
acc_LR_tuned = accuracy_score(y_test, y_pred_LR_tuned)
pre_LR_tuned = precision_score(y_test, y_pred_LR_tuned)
rec_LR_tuned = recall_score(y_test, y_pred_LR_tuned)
f1_LR_tuned = f1_score(y_test, y_pred_LR_tuned)

In [17]:
# classification report
print(classification_report(y_test, y_pred_LR_tuned))

              precision    recall  f1-score   support

           0       0.96      0.71      0.82     17981
           1       0.11      0.57      0.18      1080

    accuracy                           0.70     19061
   macro avg       0.54      0.64      0.50     19061
weighted avg       0.92      0.70      0.78     19061



In [18]:
# actual-prediction table
cm_LR_tuned = confusion_matrix(y_test, y_pred_LR_tuned, labels = [1,0])
df_LR_tuned = pd.DataFrame(data = cm_LR_tuned, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_LR

Unnamed: 0,Pred 1,Pred 0
Act 1,617,463
Act 0,5227,12754


# K-Nearest Neighbors

### Base Model

In [39]:
from sklearn.neighbors import KNeighborsClassifier

In [40]:
KNN = KNeighborsClassifier()
KNN.fit(X_sm, y_sm)
y_pred_KNN = KNN.predict(X_test)

In [41]:
acc_KNN = accuracy_score(y_test, y_pred_KNN)
pre_KNN = precision_score(y_test, y_pred_KNN)
rec_KNN = recall_score(y_test, y_pred_KNN)
f1_KNN = f1_score(y_test, y_pred_KNN)

In [42]:
print(classification_report(y_test, y_pred_KNN))

              precision    recall  f1-score   support

           0       0.87      0.67      0.75     15427
           1       0.28      0.56      0.38      3634

    accuracy                           0.65     19061
   macro avg       0.57      0.61      0.57     19061
weighted avg       0.75      0.65      0.68     19061



In [43]:
cm_KNN = confusion_matrix(y_test, y_pred_KNN, labels = [1,0])
df_KNN = pd.DataFrame(data = cm_KNN, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_KNN

Unnamed: 0,Pred 1,Pred 0
Act 1,2024,1610
Act 0,5106,10321


### Hyperparameter Tuning

In [83]:
KNN_tuning = KNeighborsClassifier()
param_KNN = {'n_neighbors': [5, 1, 10, 20],
             'weights': ['uniform', 'distance'], 
             'leaf_size': [30, 10, 50, 70],
             'p': [2,1]}
KNN_tuning = GridSearchCV(estimator=KNN_tuning, param_grid=param_KNN, cv=3, n_jobs=-1 , verbose=1, scoring='recall')

In [84]:
KNN_tuning.fit(X_sm, y_sm)
KNN_tuned = model_KNN_tuned.best_estimator_
y_pred_KNN_tuned = KNN_tuned.predict(X_test)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:  1.6min finished


In [85]:
KNN_tuning.best_params_

{'leaf_size': 50, 'n_neighbors': 20, 'p': 2, 'weights': 'uniform'}

In [86]:
acc_KNN_tuned = accuracy_score(y_test, y_pred_KNN_tuned)
rec_KNN_tuned = recall_score(y_test, y_pred_KNN_tuned)
pre_KNN_tuned = precision_score(y_test, y_pred_KNN_tuned)
f1_KNN_tuned = f1_score(y_test, y_pred_KNN_tuned)

In [87]:
print(classification_report(y_test, y_pred_KNN_tuned))

              precision    recall  f1-score   support

           0       0.88      0.65      0.75     15427
           1       0.30      0.63      0.41      3634

    accuracy                           0.65     19061
   macro avg       0.59      0.64      0.58     19061
weighted avg       0.77      0.65      0.68     19061



In [88]:
cm_KNN_tuned = confusion_matrix(y_test, y_pred_KNN_tuned, labels = [1,0])
df_KNN_tuned = pd.DataFrame(data = cm_KNN_tuned , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_KNN_tuned

Unnamed: 0,Pred 1,Pred 0
Act 1,2301,1333
Act 0,5392,10035


# Random Forest

### Base Model

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
RFC = RandomForestClassifier(random_state=99)
RFC.fit(X_sm, y_sm)
y_pred_RFC = RFC.predict(X_test)

In [21]:
acc_RFC = accuracy_score(y_test, y_pred_RFC)
pre_RFC = precision_score(y_test, y_pred_RFC)
rec_RFC = recall_score(y_test, y_pred_RFC)
f1_RFC = f1_score(y_test, y_pred_RFC)

In [22]:
print(classification_report(y_test, y_pred_RFC))

              precision    recall  f1-score   support

           0       0.97      0.64      0.77     17981
           1       0.09      0.63      0.16      1080

    accuracy                           0.64     19061
   macro avg       0.53      0.63      0.47     19061
weighted avg       0.92      0.64      0.74     19061



In [23]:
cm_RFC = confusion_matrix(y_test, y_pred_RFC, labels = [1,0])
df_RFC = pd.DataFrame(data = cm_RFC, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_RFC

Unnamed: 0,Pred 1,Pred 0
Act 1,676,404
Act 0,6484,11497


### Hyperparameter Tuning

In [71]:
RFC_tuning = RandomForestClassifier(random_state=42)
para_RFC = {'n_estimators': [100, 50, 80, 120, 200],
           'max_depth': [None, 5, 10, 15],
           'min_samples_split': [2, 5, 10, 20],
           'min_samples_leaf': [1, 7, 15]}
RFC_tuning = GridSearchCV(estimator=RFC_tuning, param_grid=para_RFC, cv=3, n_jobs=-1, verbose=1, scoring='recall')

In [72]:
RFC_tuning.fit(X_sm, y_sm)
RFC_tuned = RFC_tuning.best_estimator_
y_pred_RFC_tuned = RFC_tuned.predict(X_test)

Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 15.1min finished


In [73]:
RFC_tuning.best_params_

{'max_depth': 5,
 'min_samples_leaf': 7,
 'min_samples_split': 2,
 'n_estimators': 50}

In [74]:
acc_RFC_tuned = accuracy_score(y_test, y_pred_RFC_tuned)
pre_RFC_tuned = precision_score(y_test, y_pred_RFC_tuned)
rec_RFC_tuned = recall_score(y_test, y_pred_RFC_tuned)
f1_RFC_tuned = f1_score(y_test, y_pred_RFC_tuned)

In [75]:
print(classification_report(y_test, y_pred_RFC_tuned))

              precision    recall  f1-score   support

           0       0.92      0.52      0.67     15427
           1       0.29      0.81      0.42      3634

    accuracy                           0.58     19061
   macro avg       0.60      0.67      0.54     19061
weighted avg       0.80      0.58      0.62     19061



In [76]:
cm_RFC_tuned = confusion_matrix(y_test, y_pred_RFC_tuned, labels = [1,0])
df_RFC_tuned = pd.DataFrame(data = cm_RFC_tuned , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_RFC_tuned

Unnamed: 0,Pred 1,Pred 0
Act 1,2944,690
Act 0,7373,8054
