In [1]:
#import libraries
import pandas as pd

import imblearn
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [2]:
#import data
df = pd.read_csv('med_appt_ml.csv')
df.sample(5)

Unnamed: 0,waiting,sms_received,scholarship,diabetes,hypertension,age,noshow
58565,0,0,0,0,0,64,0
88457,25,1,0,0,0,3,0
68295,9,0,0,0,0,10,1
18648,7,0,0,0,1,58,0
23798,28,1,0,0,1,34,0


In [3]:
# scale data to make the range become 0-1
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
df['age'] = mm.fit_transform(df[['age']])
df.sample(5)

Unnamed: 0,waiting,sms_received,scholarship,diabetes,hypertension,age,noshow
55021,6,0,0,0,0,0.386667,0
88213,15,1,0,0,0,0.16,0
49781,0,0,0,0,0,0.253333,0
21960,19,0,0,0,0,0.373333,1
51588,0,0,0,0,0,0.48,0


In [4]:
# split data

X = df.drop(columns='noshow')
y = df['noshow']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=0.8, random_state=42)

In [5]:
# imbalance handling
sm = SMOTE()
X_sm, y_sm = sm.fit_sample(X_train, y_train)

# Logistic Regression

### Base Model

In [6]:
# building model
LR = LogisticRegression()
LR.fit(X_sm, y_sm)
y_pred_LR = LR.predict(X_test)

In [7]:
# metrics
acc_LR = accuracy_score(y_test, y_pred_LR)
pre_LR = precision_score(y_test, y_pred_LR)
rec_LR = recall_score(y_test, y_pred_LR)
f1_LR = f1_score(y_test, y_pred_LR)

In [8]:
# classification report
print(classification_report(y_test, y_pred_LR))

              precision    recall  f1-score   support

           0       0.87      0.72      0.79     15427
           1       0.32      0.56      0.41      3634

    accuracy                           0.69     19061
   macro avg       0.60      0.64      0.60     19061
weighted avg       0.77      0.69      0.72     19061



In [9]:
# actual-prediction table
cm_LR = confusion_matrix(y_test, y_pred_LR, labels = [1,0])
df_LR = pd.DataFrame(data = cm_LR , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_LR

Unnamed: 0,Pred 1,Pred 0
Act 1,2022,1612
Act 0,4318,11109


### Hyperparameter Tuning

In [10]:
LR_tuning = LogisticRegression(random_state=42)
para_LR = {'C': [1, 0.01, 0.015,  0.1, 0.5, 1.2, 2],
           'penalty': ['l2', 'l1', 'elasticnet'], }
LR_tuning = GridSearchCV(estimator=LR_tuning, param_grid=para_LR, cv=3, n_jobs=-1 , verbose=1, scoring = 'recall')

In [11]:
LR_tuning.fit(X_sm, y_sm)
LR_tuned = LR_tuning.best_estimator_
y_pred_LR_tuned = LR_tuned.predict(X_test)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:    2.7s finished


In [12]:
LR_tuning.best_params_

{'C': 1, 'penalty': 'l2'}

In [13]:
# metrics
acc_LR_tuned = accuracy_score(y_test, y_pred_LR_tuned)
pre_LR_tuned = precision_score(y_test, y_pred_LR_tuned)
rec_LR_tuned = recall_score(y_test, y_pred_LR_tuned)
f1_LR_tuned = f1_score(y_test, y_pred_LR_tuned)

In [14]:
# classification report
print(classification_report(y_test, y_pred_LR_tuned))

              precision    recall  f1-score   support

           0       0.87      0.72      0.79     15427
           1       0.32      0.56      0.41      3634

    accuracy                           0.69     19061
   macro avg       0.60      0.64      0.60     19061
weighted avg       0.77      0.69      0.72     19061



In [15]:
# actual-prediction table
cm_LR_tuned = confusion_matrix(y_test, y_pred_LR_tuned, labels = [1,0])
df_LR_tuned = pd.DataFrame(data = cm_LR_tuned, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_LR

Unnamed: 0,Pred 1,Pred 0
Act 1,2022,1612
Act 0,4318,11109


# K-Nearest Neighbors

### Base Model

In [16]:
from sklearn.neighbors import KNeighborsClassifier

In [17]:
KNN = KNeighborsClassifier()
KNN.fit(X_sm, y_sm)
y_pred_KNN = KNN.predict(X_test)

In [18]:
acc_KNN = accuracy_score(y_test, y_pred_KNN)
pre_KNN = precision_score(y_test, y_pred_KNN)
rec_KNN = recall_score(y_test, y_pred_KNN)
f1_KNN = f1_score(y_test, y_pred_KNN)

In [19]:
print(classification_report(y_test, y_pred_KNN))

              precision    recall  f1-score   support

           0       0.85      0.78      0.82     15427
           1       0.31      0.41      0.36      3634

    accuracy                           0.71     19061
   macro avg       0.58      0.60      0.59     19061
weighted avg       0.75      0.71      0.73     19061



In [20]:
cm_KNN = confusion_matrix(y_test, y_pred_KNN, labels = [1,0])
df_KNN = pd.DataFrame(data = cm_KNN, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_KNN

Unnamed: 0,Pred 1,Pred 0
Act 1,1505,2129
Act 0,3331,12096


### Hyperparameter Tuning

In [21]:
KNN_tuning = KNeighborsClassifier()
param_KNN = {'n_neighbors': [5, 1, 10, 20],
             'weights': ['uniform', 'distance'], 
             'leaf_size': [30, 10, 50, 70],
             'p': [2,1]}
KNN_tuning = GridSearchCV(estimator=KNN_tuning, param_grid=param_KNN, cv=3, n_jobs=-1 , verbose=1, scoring='recall')

In [22]:
KNN_tuning.fit(X_sm, y_sm)
KNN_tuned = KNN_tuning.best_estimator_
y_pred_KNN_tuned = KNN_tuned.predict(X_test)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:  7.6min finished


In [23]:
KNN_tuning.best_params_

{'leaf_size': 30, 'n_neighbors': 20, 'p': 1, 'weights': 'uniform'}

In [24]:
acc_KNN_tuned = accuracy_score(y_test, y_pred_KNN_tuned)
rec_KNN_tuned = recall_score(y_test, y_pred_KNN_tuned)
pre_KNN_tuned = precision_score(y_test, y_pred_KNN_tuned)
f1_KNN_tuned = f1_score(y_test, y_pred_KNN_tuned)

In [25]:
print(classification_report(y_test, y_pred_KNN_tuned))

              precision    recall  f1-score   support

           0       0.87      0.71      0.78     15427
           1       0.31      0.55      0.40      3634

    accuracy                           0.68     19061
   macro avg       0.59      0.63      0.59     19061
weighted avg       0.76      0.68      0.71     19061



In [26]:
cm_KNN_tuned = confusion_matrix(y_test, y_pred_KNN_tuned, labels = [1,0])
df_KNN_tuned = pd.DataFrame(data = cm_KNN_tuned , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_KNN_tuned

Unnamed: 0,Pred 1,Pred 0
Act 1,1990,1644
Act 0,4434,10993


# Random Forest

### Base Model

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
RFC = RandomForestClassifier(random_state=99)
RFC.fit(X_sm, y_sm)
y_pred_RFC = RFC.predict(X_test)

In [29]:
acc_RFC = accuracy_score(y_test, y_pred_RFC)
pre_RFC = precision_score(y_test, y_pred_RFC)
rec_RFC = recall_score(y_test, y_pred_RFC)
f1_RFC = f1_score(y_test, y_pred_RFC)

In [30]:
print(classification_report(y_test, y_pred_RFC))

              precision    recall  f1-score   support

           0       0.85      0.75      0.80     15427
           1       0.30      0.45      0.36      3634

    accuracy                           0.69     19061
   macro avg       0.58      0.60      0.58     19061
weighted avg       0.75      0.69      0.71     19061



In [31]:
cm_RFC = confusion_matrix(y_test, y_pred_RFC, labels = [1,0])
df_RFC = pd.DataFrame(data = cm_RFC, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_RFC

Unnamed: 0,Pred 1,Pred 0
Act 1,1642,1992
Act 0,3877,11550


### Hyperparameter Tuning

In [32]:
RFC_tuning = RandomForestClassifier(random_state=42)
para_RFC = {'n_estimators': [100, 50, 80, 120, 200],
           'max_depth': [None, 5, 10, 15],
           'min_samples_split': [2, 5, 10, 20],
           'min_samples_leaf': [1, 7, 15]}
RFC_tuning = GridSearchCV(estimator=RFC_tuning, param_grid=para_RFC, cv=3, n_jobs=-1, verbose=1, scoring='recall')

In [33]:
RFC_tuning.fit(X_sm, y_sm)
RFC_tuned = RFC_tuning.best_estimator_
y_pred_RFC_tuned = RFC_tuned.predict(X_test)

Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 21.1min finished


In [34]:
RFC_tuning.best_params_

{'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}

In [35]:
acc_RFC_tuned = accuracy_score(y_test, y_pred_RFC_tuned)
pre_RFC_tuned = precision_score(y_test, y_pred_RFC_tuned)
rec_RFC_tuned = recall_score(y_test, y_pred_RFC_tuned)
f1_RFC_tuned = f1_score(y_test, y_pred_RFC_tuned)

In [36]:
print(classification_report(y_test, y_pred_RFC_tuned))

              precision    recall  f1-score   support

           0       0.95      0.47      0.62     15427
           1       0.28      0.89      0.43      3634

    accuracy                           0.55     19061
   macro avg       0.61      0.68      0.53     19061
weighted avg       0.82      0.55      0.59     19061



In [37]:
cm_RFC_tuned = confusion_matrix(y_test, y_pred_RFC_tuned, labels = [1,0])
df_RFC_tuned = pd.DataFrame(data = cm_RFC_tuned , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_RFC_tuned

Unnamed: 0,Pred 1,Pred 0
Act 1,3227,407
Act 0,8237,7190


# Machine Learning Summary

In [38]:
evaluation={'accuracy': [acc_LR, acc_LR_tuned, acc_KNN, acc_KNN_tuned, acc_RFC, acc_RFC_tuned],
            'precision': [pre_LR, pre_LR_tuned, pre_KNN, pre_KNN_tuned, pre_RFC, pre_RFC_tuned],
            'recall': [rec_LR, rec_LR_tuned, rec_KNN, rec_KNN_tuned, rec_RFC, rec_RFC_tuned],
            'f1': [f1_LR, f1_LR_tuned, f1_KNN, f1_KNN_tuned, f1_RFC, f1_RFC_tuned]}

In [39]:
pd.DataFrame(evaluation, index=['Logistic Regression', 'Logistic Reg. Tuned', 'KNN', 'KNN Tuned', 'Random Forest', 'Random Forest Tuned'])

Unnamed: 0,accuracy,precision,recall,f1
Logistic Regression,0.688894,0.318927,0.556412,0.405454
Logistic Reg. Tuned,0.688894,0.318927,0.556412,0.405454
KNN,0.713551,0.311208,0.414144,0.355372
KNN Tuned,0.681129,0.309776,0.547606,0.395705
Random Forest,0.692094,0.297518,0.451844,0.358789
Random Forest Tuned,0.546509,0.28149,0.888002,0.427474


# Financial Calculation

- Medical check up cost may widely vary depending on what exams the patients take (up to 1300% difference for the same test)
    - https://veja.abril.com.br/saude/precos-de-exames-laboratoriais-podem-variar-1300/
    - The cost for routine check ranges from 100 - 600 Brazilian Real 
- Let us assume the medical check up is a complete check up (including cardial, lever, and urinal) for 340 Brazilian Real
    - https://www.drconsulta.com/servicos/checkups
    - If we missed 1 patient because of no-show, we may lose BRL340
- Let us assume that the recommended solution we give:
    - Reminder call and mailing cost BRL0.77 (https://olhardigital.com.br/noticia/ranking-revela-que-brasil-tem-a-ligacao-mais-cara/62835)
    - Hiring new doctor cost BRL21,000 per month, which each doctor can handle 400 patients per month (BRL53 per patient)
        - https://www.washingtonpost.com/news/to-your-health/wp/2014/05/22/how-many-patients-should-your-doctor-see-each-day/
        - adding doctor is a way to shorten waiting time 
    - For these solutions, let us assume we spend BRL55 at most
- The formula to count the effectiveness of machine learning is:
    - without ML: (340)*((number of show up)+(number of no-show))/total patient
    - with ML: 
        - ML cost: (340*FN)+(55*(TP+FP))
        - ((340*(number of show up)-(ML cost))/total patient
    - Revenue save = with ML - without ML

**USING RANDOM FOREST (HYPERPARAMETER TUNING)**
- without ML:
    - Total Revenue (19061 patients) = BRL4,009,620
    - Single Revenue = BRL210/patient
- with ML:
    - ML cost (19061 patients) = BRL768,900
    - Total Revenue (19061 patients) = BRL5,573,460
    - Single Revenue = BRL292/patient
- Revenue Increase after using ML model
    - Total (for 19061 patient) = BRL1,563,840 = USD287,391
    - Single = BRL82/patient = USD15/patient

# Exporting Model

In [65]:
import joblib

In [66]:
filename = 'finalized_model.sav'
joblib.dump(RFC_tuning, filename)

['finalized_model.sav']

In [71]:
loaded_model = joblib.load(filename)
result = loaded_model.predict([[20, 0, 0, 0, 0, 25]])
print(result)

[1]


In [72]:
loaded_model = joblib.load(filename)
result = loaded_model.predict_proba([[20, 0, 0, 0, 0, 25]])
print(result)

[[0.49121596 0.50878404]]
