In [1]:
#import libraries
import pandas as pd

import imblearn
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [2]:
#import data
df = pd.read_csv('med_appt_ml.csv')
df.sample(5)

Unnamed: 0,waiting,sms_received,scholarship,diabetes,hypertension,age,noshow
37822,5,0,1,0,0,5,1
38290,5,1,0,0,0,45,0
75946,22,1,0,0,0,25,0
15051,4,1,0,0,0,20,0
78539,7,1,0,0,0,34,0


In [3]:
# split data

X = df.drop(columns='noshow')
y = df['noshow']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=0.8, random_state=42)

In [4]:
# imbalance handling
sm = SMOTE()
X_sm, y_sm = sm.fit_sample(X_train, y_train)

# Logistic Regression

### Base Model

In [5]:
# building model
LR = LogisticRegression()
LR.fit(X_sm, y_sm)
y_pred_LR = LR.predict(X_test)

In [6]:
# metrics
acc_LR = accuracy_score(y_test, y_pred_LR)
pre_LR = precision_score(y_test, y_pred_LR)
rec_LR = recall_score(y_test, y_pred_LR)
f1_LR = f1_score(y_test, y_pred_LR)

In [7]:
# classification report
print(classification_report(y_test, y_pred_LR))

              precision    recall  f1-score   support

           0       0.87      0.73      0.79     15427
           1       0.32      0.53      0.40      3634

    accuracy                           0.69     19061
   macro avg       0.59      0.63      0.59     19061
weighted avg       0.76      0.69      0.72     19061



In [8]:
# actual-prediction table
cm_LR = confusion_matrix(y_test, y_pred_LR, labels = [1,0])
df_LR = pd.DataFrame(data = cm_LR , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_LR

Unnamed: 0,Pred 1,Pred 0
Act 1,1935,1699
Act 0,4198,11229


### Hyperparameter Tuning

In [9]:
LR_tuning = LogisticRegression(random_state=42)
para_LR = {'C': [1, 0.01, 0.015,  0.1, 0.5, 1.2, 2],
           'penalty': ['l2', 'l1', 'elasticnet'], }
LR_tuning = GridSearchCV(estimator=LR_tuning, param_grid=para_LR, cv=3, n_jobs=-1 , verbose=1, scoring = 'recall')

In [10]:
LR_tuning.fit(X_sm, y_sm)
LR_tuned = LR_tuning.best_estimator_
y_pred_LR_tuned = LR_tuned.predict(X_test)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:    4.2s finished


In [11]:
LR_tuning.best_params_

{'C': 1.2, 'penalty': 'l2'}

In [12]:
# metrics
acc_LR_tuned = accuracy_score(y_test, y_pred_LR_tuned)
pre_LR_tuned = precision_score(y_test, y_pred_LR_tuned)
rec_LR_tuned = recall_score(y_test, y_pred_LR_tuned)
f1_LR_tuned = f1_score(y_test, y_pred_LR_tuned)

In [13]:
# classification report
print(classification_report(y_test, y_pred_LR_tuned))

              precision    recall  f1-score   support

           0       0.87      0.73      0.79     15427
           1       0.32      0.53      0.40      3634

    accuracy                           0.69     19061
   macro avg       0.59      0.63      0.59     19061
weighted avg       0.76      0.69      0.72     19061



In [14]:
# actual-prediction table
cm_LR_tuned = confusion_matrix(y_test, y_pred_LR_tuned, labels = [1,0])
df_LR_tuned = pd.DataFrame(data = cm_LR_tuned, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_LR

Unnamed: 0,Pred 1,Pred 0
Act 1,1935,1699
Act 0,4198,11229


# K-Nearest Neighbors

### Base Model

In [15]:
from sklearn.neighbors import KNeighborsClassifier

In [16]:
KNN = KNeighborsClassifier()
KNN.fit(X_sm, y_sm)
y_pred_KNN = KNN.predict(X_test)

In [17]:
acc_KNN = accuracy_score(y_test, y_pred_KNN)
pre_KNN = precision_score(y_test, y_pred_KNN)
rec_KNN = recall_score(y_test, y_pred_KNN)
f1_KNN = f1_score(y_test, y_pred_KNN)

In [18]:
print(classification_report(y_test, y_pred_KNN))

              precision    recall  f1-score   support

           0       0.87      0.65      0.74     15427
           1       0.28      0.58      0.38      3634

    accuracy                           0.64     19061
   macro avg       0.57      0.61      0.56     19061
weighted avg       0.76      0.64      0.67     19061



In [19]:
cm_KNN = confusion_matrix(y_test, y_pred_KNN, labels = [1,0])
df_KNN = pd.DataFrame(data = cm_KNN, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_KNN

Unnamed: 0,Pred 1,Pred 0
Act 1,2094,1540
Act 0,5375,10052


### Hyperparameter Tuning

In [20]:
KNN_tuning = KNeighborsClassifier()
param_KNN = {'n_neighbors': [5, 1, 10, 20],
             'weights': ['uniform', 'distance'], 
             'leaf_size': [30, 10, 50, 70],
             'p': [2,1]}
KNN_tuning = GridSearchCV(estimator=KNN_tuning, param_grid=param_KNN, cv=3, n_jobs=-1 , verbose=1, scoring='recall')

In [22]:
KNN_tuning.fit(X_sm, y_sm)
KNN_tuned = KNN_tuning.best_estimator_
y_pred_KNN_tuned = KNN_tuned.predict(X_test)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:  1.5min finished


In [23]:
KNN_tuning.best_params_

{'leaf_size': 50, 'n_neighbors': 20, 'p': 1, 'weights': 'uniform'}

In [24]:
acc_KNN_tuned = accuracy_score(y_test, y_pred_KNN_tuned)
rec_KNN_tuned = recall_score(y_test, y_pred_KNN_tuned)
pre_KNN_tuned = precision_score(y_test, y_pred_KNN_tuned)
f1_KNN_tuned = f1_score(y_test, y_pred_KNN_tuned)

In [25]:
print(classification_report(y_test, y_pred_KNN_tuned))

              precision    recall  f1-score   support

           0       0.88      0.64      0.74     15427
           1       0.29      0.64      0.40      3634

    accuracy                           0.64     19061
   macro avg       0.59      0.64      0.57     19061
weighted avg       0.77      0.64      0.67     19061



In [26]:
cm_KNN_tuned = confusion_matrix(y_test, y_pred_KNN_tuned, labels = [1,0])
df_KNN_tuned = pd.DataFrame(data = cm_KNN_tuned , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_KNN_tuned

Unnamed: 0,Pred 1,Pred 0
Act 1,2335,1299
Act 0,5629,9798


# Random Forest

### Base Model

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
RFC = RandomForestClassifier(random_state=99)
RFC.fit(X_sm, y_sm)
y_pred_RFC = RFC.predict(X_test)

In [29]:
acc_RFC = accuracy_score(y_test, y_pred_RFC)
pre_RFC = precision_score(y_test, y_pred_RFC)
rec_RFC = recall_score(y_test, y_pred_RFC)
f1_RFC = f1_score(y_test, y_pred_RFC)

In [30]:
print(classification_report(y_test, y_pred_RFC))

              precision    recall  f1-score   support

           0       0.87      0.68      0.76     15427
           1       0.29      0.57      0.39      3634

    accuracy                           0.66     19061
   macro avg       0.58      0.62      0.57     19061
weighted avg       0.76      0.66      0.69     19061



In [31]:
cm_RFC = confusion_matrix(y_test, y_pred_RFC, labels = [1,0])
df_RFC = pd.DataFrame(data = cm_RFC, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_RFC

Unnamed: 0,Pred 1,Pred 0
Act 1,2057,1577
Act 0,4965,10462


### Hyperparameter Tuning

In [32]:
RFC_tuning = RandomForestClassifier(random_state=42)
para_RFC = {'n_estimators': [100, 50, 80, 120, 200],
           'max_depth': [None, 5, 10, 15],
           'min_samples_split': [2, 5, 10, 20],
           'min_samples_leaf': [1, 7, 15]}
RFC_tuning = GridSearchCV(estimator=RFC_tuning, param_grid=para_RFC, cv=3, n_jobs=-1, verbose=1, scoring='recall')

In [33]:
RFC_tuning.fit(X_sm, y_sm)
RFC_tuned = RFC_tuning.best_estimator_
y_pred_RFC_tuned = RFC_tuned.predict(X_test)

Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 17.2min finished


In [34]:
RFC_tuning.best_params_

{'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 100}

In [35]:
acc_RFC_tuned = accuracy_score(y_test, y_pred_RFC_tuned)
pre_RFC_tuned = precision_score(y_test, y_pred_RFC_tuned)
rec_RFC_tuned = recall_score(y_test, y_pred_RFC_tuned)
f1_RFC_tuned = f1_score(y_test, y_pred_RFC_tuned)

In [36]:
print(classification_report(y_test, y_pred_RFC_tuned))

              precision    recall  f1-score   support

           0       0.92      0.53      0.67     15427
           1       0.29      0.81      0.42      3634

    accuracy                           0.58     19061
   macro avg       0.60      0.67      0.55     19061
weighted avg       0.80      0.58      0.62     19061



In [37]:
cm_RFC_tuned = confusion_matrix(y_test, y_pred_RFC_tuned, labels = [1,0])
df_RFC_tuned = pd.DataFrame(data = cm_RFC_tuned , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_RFC_tuned

Unnamed: 0,Pred 1,Pred 0
Act 1,2939,695
Act 0,7293,8134


# Machine Learning Summary

In [38]:
evaluation={'accuracy': [acc_LR, acc_LR_tuned, acc_KNN, acc_KNN_tuned, acc_RFC, acc_RFC_tuned],
            'precision': [pre_LR, pre_LR_tuned, pre_KNN, pre_KNN_tuned, pre_RFC, pre_RFC_tuned],
            'recall': [rec_LR, rec_LR_tuned, rec_KNN, rec_KNN_tuned, rec_RFC, rec_RFC_tuned],
            'f1': [f1_LR, f1_LR_tuned, f1_KNN, f1_KNN_tuned, f1_RFC, f1_RFC_tuned]}

In [41]:
pd.DataFrame(evaluation, index=['Logistic Regression', 'Logistic Reg. Tuned', 'KNN', 'KNN Tuned', 'Random Forest', 'Random Forest Tuned'])

Unnamed: 0,accuracy,precision,recall,f1
Logistic Regression,0.690625,0.315506,0.532471,0.396232
Logistic Reg. Tuned,0.690625,0.315506,0.532471,0.396232
KNN,0.637217,0.280359,0.576225,0.377195
KNN Tuned,0.636535,0.293194,0.642543,0.402656
Random Forest,0.656786,0.292936,0.566043,0.386074
Random Forest Tuned,0.580924,0.287236,0.808751,0.423915


# Financial Calculation