In [5]:
#import libraries
import pandas as pd

import imblearn
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve

import warnings
warnings.filterwarnings('ignore')

In [6]:
#import data
df = pd.read_csv('03 - med_appt_ml.csv')
df.sample(5)

Unnamed: 0,waiting,sms_received,scholarship,diabetes,hypertension,age,noshow
17120,0,0,0,0,1,66,0
62841,11,0,0,0,0,46,0
38762,15,1,0,0,0,0,0
19635,0,0,1,0,1,47,0
46574,0,0,0,0,0,19,1


In [7]:
# scale data to make the range become 0-1
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
df['age'] = mm.fit_transform(df[['age']])
df.sample(5)

Unnamed: 0,waiting,sms_received,scholarship,diabetes,hypertension,age,noshow
47771,0,0,0,0,0,0.24,0
42178,27,1,0,0,0,0.426667,0
75340,0,0,0,0,0,0.76,0
79644,14,1,1,0,0,0.373333,0
43320,0,0,0,0,0,0.373333,0


In [8]:
# split data

X = df.drop(columns='noshow')
y = df['noshow']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=0.8, random_state=42)

In [6]:
# imbalance handling
sm = SMOTE()
X_sm, y_sm = sm.fit_sample(X_train, y_train)

In [9]:
X_test.head()

Unnamed: 0,waiting,sms_received,scholarship,diabetes,hypertension,age
20989,28,1,1,0,0,0.573333
79716,0,0,1,0,0,0.506667
9818,0,0,0,0,0,0.826667
79491,2,0,0,0,0,0.8
11225,0,0,0,0,0,0.453333


In [None]:
X.loc[20989]

# Logistic Regression

### Base Model

In [34]:
# building model
LR = LogisticRegression()
LR.fit(X_sm, y_sm)
y_pred_LR = LR.predict(X_test)

In [60]:
# metrics
acc_LR = accuracy_score(y_test, y_pred_LR)
pre_LR = precision_score(y_test, y_pred_LR)
rec_LR = recall_score(y_test, y_pred_LR)
f1_LR = f1_score(y_test, y_pred_LR)
ROC_LR = roc_auc_score(y_test, LR.predict_proba(X_test)[:,1])

In [36]:
# classification report
print(classification_report(y_test, y_pred_LR))

              precision    recall  f1-score   support

           0       0.87      0.72      0.79     15427
           1       0.32      0.56      0.41      3634

    accuracy                           0.69     19061
   macro avg       0.60      0.64      0.60     19061
weighted avg       0.77      0.69      0.71     19061



In [37]:
# actual-prediction table
cm_LR = confusion_matrix(y_test, y_pred_LR, labels = [1,0])
df_LR = pd.DataFrame(data = cm_LR , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_LR

Unnamed: 0,Pred 1,Pred 0
Act 1,2051,1583
Act 0,4391,11036


### Hyperparameter Tuning

In [38]:
LR_tuning = LogisticRegression(random_state=42)
para_LR = {'C': [1, 0.01, 0.015,  0.1, 0.5, 1.2, 2],
           'penalty': ['l2', 'l1', 'elasticnet'], }
LR_tuning = GridSearchCV(estimator=LR_tuning, param_grid=para_LR, cv=3, n_jobs=-1 , verbose=1, scoring = 'recall')

In [39]:
LR_tuning.fit(X_sm, y_sm)
LR_tuned = LR_tuning.best_estimator_
y_pred_LR_tuned = LR_tuned.predict(X_test)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  63 | elapsed:    1.4s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:    1.6s finished


In [40]:
LR_tuning.best_params_

{'C': 1.2, 'penalty': 'l2'}

In [61]:
# metrics
acc_LR_tuned = accuracy_score(y_test, y_pred_LR_tuned)
pre_LR_tuned = precision_score(y_test, y_pred_LR_tuned)
rec_LR_tuned = recall_score(y_test, y_pred_LR_tuned)
f1_LR_tuned = f1_score(y_test, y_pred_LR_tuned)
ROC_LR_tuned = roc_auc_score(y_test, LR_tuned.predict_proba(X_test)[:,1])

In [42]:
# classification report
print(classification_report(y_test, y_pred_LR_tuned))

              precision    recall  f1-score   support

           0       0.87      0.72      0.79     15427
           1       0.32      0.56      0.41      3634

    accuracy                           0.69     19061
   macro avg       0.60      0.64      0.60     19061
weighted avg       0.77      0.69      0.71     19061



In [43]:
# actual-prediction table
cm_LR_tuned = confusion_matrix(y_test, y_pred_LR_tuned, labels = [1,0])
df_LR_tuned = pd.DataFrame(data = cm_LR_tuned, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_LR

Unnamed: 0,Pred 1,Pred 0
Act 1,2051,1583
Act 0,4391,11036


# K-Nearest Neighbors

### Base Model

In [44]:
from sklearn.neighbors import KNeighborsClassifier

In [45]:
KNN = KNeighborsClassifier()
KNN.fit(X_sm, y_sm)
y_pred_KNN = KNN.predict(X_test)

In [62]:
acc_KNN = accuracy_score(y_test, y_pred_KNN)
pre_KNN = precision_score(y_test, y_pred_KNN)
rec_KNN = recall_score(y_test, y_pred_KNN)
f1_KNN = f1_score(y_test, y_pred_KNN)
ROC_KNN = roc_auc_score(y_test, KNN.predict_proba(X_test)[:,1])

In [47]:
print(classification_report(y_test, y_pred_KNN))

              precision    recall  f1-score   support

           0       0.85      0.79      0.82     15427
           1       0.31      0.39      0.34      3634

    accuracy                           0.72     19061
   macro avg       0.58      0.59      0.58     19061
weighted avg       0.74      0.72      0.73     19061



In [48]:
cm_KNN = confusion_matrix(y_test, y_pred_KNN, labels = [1,0])
df_KNN = pd.DataFrame(data = cm_KNN, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_KNN

Unnamed: 0,Pred 1,Pred 0
Act 1,1400,2234
Act 0,3188,12239


### Hyperparameter Tuning

In [49]:
KNN_tuning = KNeighborsClassifier()
param_KNN = {'n_neighbors': [5, 1, 10, 20],
             'weights': ['uniform', 'distance'], 
             'leaf_size': [30, 10, 50, 70],
             'p': [2,1]}
KNN_tuning = GridSearchCV(estimator=KNN_tuning, param_grid=param_KNN, cv=3, n_jobs=-1 , verbose=1, scoring='recall')

In [50]:
KNN_tuning.fit(X_sm, y_sm)
KNN_tuned = KNN_tuning.best_estimator_
y_pred_KNN_tuned = KNN_tuned.predict(X_test)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:  7.3min finished


In [51]:
KNN_tuning.best_params_

{'leaf_size': 10, 'n_neighbors': 20, 'p': 1, 'weights': 'uniform'}

In [63]:
acc_KNN_tuned = accuracy_score(y_test, y_pred_KNN_tuned)
rec_KNN_tuned = recall_score(y_test, y_pred_KNN_tuned)
pre_KNN_tuned = precision_score(y_test, y_pred_KNN_tuned)
f1_KNN_tuned = f1_score(y_test, y_pred_KNN_tuned)
ROC_KNN_tuned = roc_auc_score(y_test, KNN_tuned.predict_proba(X_test)[:,1])

In [53]:
print(classification_report(y_test, y_pred_KNN_tuned))

              precision    recall  f1-score   support

           0       0.87      0.72      0.79     15427
           1       0.31      0.53      0.39      3634

    accuracy                           0.69     19061
   macro avg       0.59      0.63      0.59     19061
weighted avg       0.76      0.69      0.71     19061



In [54]:
cm_KNN_tuned = confusion_matrix(y_test, y_pred_KNN_tuned, labels = [1,0])
df_KNN_tuned = pd.DataFrame(data = cm_KNN_tuned , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_KNN_tuned

Unnamed: 0,Pred 1,Pred 0
Act 1,1942,1692
Act 0,4297,11130


# Random Forest

### Base Model

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
RFC = RandomForestClassifier(random_state=99)
RFC.fit(X_sm, y_sm)
y_pred_RFC = RFC.predict(X_test)

In [64]:
acc_RFC = accuracy_score(y_test, y_pred_RFC)
pre_RFC = precision_score(y_test, y_pred_RFC)
rec_RFC = recall_score(y_test, y_pred_RFC)
f1_RFC = f1_score(y_test, y_pred_RFC)
ROC_RFC = roc_auc_score(y_test, RFC.predict_proba(X_test)[:,1])

In [10]:
print(classification_report(y_test, y_pred_RFC))

              precision    recall  f1-score   support

           0       0.85      0.75      0.80     15427
           1       0.30      0.44      0.35      3634

    accuracy                           0.69     19061
   macro avg       0.57      0.60      0.58     19061
weighted avg       0.75      0.69      0.71     19061



In [11]:
cm_RFC = confusion_matrix(y_test, y_pred_RFC, labels = [1,0])
df_RFC = pd.DataFrame(data = cm_RFC, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_RFC

Unnamed: 0,Pred 1,Pred 0
Act 1,1613,2021
Act 0,3850,11577


### Hyperparameter Tuning

In [124]:
RFC_tuning = RandomForestClassifier(random_state=42)
para_RFC = {'n_estimators': [100, 200, 250, 300],
           'max_depth': [None, 5, 20,50],
           'min_samples_split': [2, 5, 10, 20],
           'min_samples_leaf': [1, 7, 15]}
RFC_tuning = GridSearchCV(estimator=RFC_tuning, param_grid=para_RFC, cv=3, n_jobs=-1, verbose=1, scoring='recall')

In [125]:
RFC_tuning.fit(X_sm, y_sm)
RFC_tuned = RFC_tuning.best_estimator_
y_pred_RFC_tuned = RFC_tuned.predict(X_test)

Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done 576 out of 576 | elapsed: 31.8min finished


In [128]:
RFC_tuning.best_params_

{'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 20,
 'n_estimators': 200}

In [129]:
acc_RFC_tuned = accuracy_score(y_test, y_pred_RFC_tuned)
pre_RFC_tuned = precision_score(y_test, y_pred_RFC_tuned)
rec_RFC_tuned = recall_score(y_test, y_pred_RFC_tuned)
f1_RFC_tuned = f1_score(y_test, y_pred_RFC_tuned)
ROC_RFC_tuned = roc_auc_score(y_test, RFC_tuned.predict_proba(X_test)[:,1])

In [130]:
print(classification_report(y_test, y_pred_RFC_tuned))

              precision    recall  f1-score   support

           0       0.95      0.46      0.62     15427
           1       0.28      0.89      0.43      3634

    accuracy                           0.55     19061
   macro avg       0.61      0.68      0.52     19061
weighted avg       0.82      0.55      0.59     19061



In [131]:
cm_RFC_tuned = confusion_matrix(y_test, y_pred_RFC_tuned, labels = [1,0])
df_RFC_tuned = pd.DataFrame(data = cm_RFC_tuned , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_RFC_tuned

Unnamed: 0,Pred 1,Pred 0
Act 1,3231,403
Act 0,8268,7159


# XGBoost Classifier

### Base Model (Without Tuning)

In [20]:
from xgboost import XGBClassifier

In [21]:
XGB = XGBClassifier()
XGB.fit(X_sm, y_sm)
y_pred_XGB = XGB.predict(X_test)

In [66]:
acc_XGB = accuracy_score(y_test, y_pred_XGB)
pre_XGB = precision_score(y_test, y_pred_XGB)
rec_XGB = recall_score(y_test, y_pred_XGB)
f1_XGB = f1_score(y_test, y_pred_XGB)
ROC_XGB = roc_auc_score(y_test, XGB.predict_proba(X_test)[:,1])

In [23]:
print(classification_report(y_test, y_pred_XGB))

              precision    recall  f1-score   support

           0       0.87      0.72      0.79     15427
           1       0.31      0.54      0.39      3634

    accuracy                           0.69     19061
   macro avg       0.59      0.63      0.59     19061
weighted avg       0.76      0.69      0.71     19061



In [25]:
cm_XGB = confusion_matrix(y_test, y_pred_XGB, labels = [1,0])
df_XGB = pd.DataFrame(data = cm_XGB, index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_XGB

Unnamed: 0,Pred 1,Pred 0
Act 1,1956,1678
Act 0,4315,11112


### Hyperparameter Tuning

In [28]:
XGB_tuning = XGBClassifier()
para_XGB = {'n_estimators': [100,500,1000],
           'max_depth': [None, 5, 25],
           'min_samples_split': [2, 5, 10, 20],
            'learning_rate': [0.1, 0.05, 0.3]}
XGB_tuning = GridSearchCV(estimator=XGB_tuning, param_grid=para_XGB, cv=3, n_jobs=-1, verbose=1, scoring='recall')

In [29]:
XGB_tuning.fit(X_sm, y_sm)
XGB_tuned = XGB_tuning.best_estimator_
y_pred_XGB_tuned = XGB_tuned.predict(X_test)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 38.7min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 87.0min finished


Parameters: { min_samples_split } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [30]:
XGB_tuning.best_params_

{'learning_rate': 0.05,
 'max_depth': 5,
 'min_samples_split': 2,
 'n_estimators': 100}

In [67]:
acc_XGB_tuned = accuracy_score(y_test, y_pred_XGB_tuned)
pre_XGB_tuned = precision_score(y_test, y_pred_XGB_tuned)
rec_XGB_tuned = recall_score(y_test, y_pred_XGB_tuned)
f1_XGB_tuned = f1_score(y_test, y_pred_XGB_tuned)
ROC_XGB_tuned = roc_auc_score(y_test, XGB_tuned.predict_proba(X_test)[:,1])

In [32]:
print(classification_report(y_test, y_pred_XGB_tuned))

              precision    recall  f1-score   support

           0       0.93      0.54      0.68     15427
           1       0.30      0.81      0.43      3634

    accuracy                           0.60     19061
   macro avg       0.61      0.68      0.56     19061
weighted avg       0.81      0.60      0.64     19061



In [33]:
cm_XGB_tuned = confusion_matrix(y_test, y_pred_XGB_tuned, labels = [1,0])
df_XGB_tuned = pd.DataFrame(data = cm_XGB_tuned , index = ["Act 1","Act 0"], columns = ["Pred 1", "Pred 0"])
df_XGB_tuned

Unnamed: 0,Pred 1,Pred 0
Act 1,2956,678
Act 0,7041,8386


# Machine Learning Summary

In [68]:
evaluation={'accuracy': [acc_LR, acc_LR_tuned, acc_KNN, acc_KNN_tuned, acc_RFC, acc_RFC_tuned, acc_XGB, acc_XGB_tuned],
            'precision': [pre_LR, pre_LR_tuned, pre_KNN, pre_KNN_tuned, pre_RFC, pre_RFC_tuned, pre_XGB, pre_XGB_tuned],
            'recall': [rec_LR, rec_LR_tuned, rec_KNN, rec_KNN_tuned, rec_RFC, rec_RFC_tuned, rec_XGB, rec_XGB_tuned],
            'f1': [f1_LR, f1_LR_tuned, f1_KNN, f1_KNN_tuned, f1_RFC, f1_RFC_tuned, f1_XGB, f1_XGB_tuned],
            'ROC':[ROC_LR, ROC_LR_tuned, ROC_KNN, ROC_KNN_tuned, ROC_RFC, ROC_RFC_tuned, ROC_XGB, ROC_XGB_tuned]}

In [69]:
pd.DataFrame(evaluation, index=['Logistic Regression', 'Logistic Reg. Tuned', 'KNN', 'KNN Tuned', 'Random Forest', 'Random Forest Tuned', 'XGB', 'XGB Tuned'])

Unnamed: 0,accuracy,precision,recall,f1,ROC
Logistic Regression,0.686585,0.318379,0.564392,0.407106,0.691502
Logistic Reg. Tuned,0.686585,0.318379,0.564392,0.407106,0.691502
KNN,0.715545,0.305144,0.38525,0.34055,0.661841
KNN Tuned,0.685798,0.311268,0.534397,0.393396,0.707826
Random Forest,0.691989,0.295259,0.443864,0.354622,0.646244
Random Forest Tuned,0.545092,0.280981,0.889103,0.427014,0.726649
XGB,0.685588,0.311912,0.53825,0.394952,0.709255
XGB Tuned,0.595037,0.295689,0.813429,0.433717,0.730141


**Machine Learning ROC Rank**
1. XGBoost Classifier, Tuned (.730)
2. Random Forest Classifier, Tuned (.727)
3. XGB Classifier, Base (.709)

**Machine Learning Recall Score Rank**
1. Random Forest Classifier, Tuned (.889)
2. XGB Classifier, Tuned (.813)
3. XGB Classifier, Base (.538)

**CONSIDERATION**
- Both ROC score and recall score are essential to decide the best model
- There are 2 candidates, XGB Tuned and RFC Tuned
- We cannot see which one is more beneficial, therefore we need to run financial projection using those model

# Financial Calculation

### Assumption
- Medical check up cost may widely vary depending on what exams the patients take (up to 1300% difference for the same test)
    - https://veja.abril.com.br/saude/precos-de-exames-laboratoriais-podem-variar-1300/
    - The cost for routine check ranges from 100 - 600 Brazilian Real 
- Let us assume the medical check up is a complete check up (including cardial, lever, and urinal) for 340 Brazilian Real (average in Espirito Santo)
    - https://www.drconsulta.com/servicos/checkups
    - If we missed 1 patient because of no-show, we may lose BRL340
- Let us assume that the recommended solution we give:
    - Reminder call and mailing cost BRL0.77 (https://olhardigital.com.br/noticia/ranking-revela-que-brasil-tem-a-ligacao-mais-cara/62835)
    - Hiring new doctor cost BRL21,000 per month, which each doctor can handle 400 patients per month (BRL53 per patient)
        - https://www.washingtonpost.com/news/to-your-health/wp/2014/05/22/how-many-patients-should-your-doctor-see-each-day/
        - adding doctor is a way to shorten waiting time 
    - For these solutions, let us assume we spend BRL55 at most

### Calculation for Revenue Saving
- The formula to count the effectiveness of machine learning is:
    - without ML: (340)*(number of show up)/total patient
    - with ML: 
        - ML cost: (340*FN)+(55*(TP+FP))
        - ((340*(number of show up)-(ML cost))/total patient
    - Revenue save = with ML - without ML

**USING RANDOM FOREST CLASSIFIER (HYPERPARAMETER TUNING)**
- without ML:
    - Total Revenue (19061 patients) = BRL5,245,180
    - Single Revenue = BRL275.12/patient registered
- with ML:
    - ML cost (19061 patients) = BRL769,465
    - Total Revenue (19061 patients) = BRL6,480,740 - BRL769,465 = BRL5,711,275
    - Single Revenue = BRL299.63/patient
- Revenue Increase after using ML model
    - Total (for 19061 patient) = BRL466,095 = **USD87,210.24**
    - Single = BRL24.5/patient = USD5/patient

**USING XGB CLASSIFIER (HYPERPARAMETER TUNING)**
- without ML:
    - Total Revenue (19061 patients) = BRL5,245,180
    - Single Revenue = BRL275.12/patient registered
- with ML:
    - ML cost (19061 patients) = BRL780,355
    - Total Revenue (19061 patients) = BRL6,480,740 - BRL780,355 = BRL5,700,385
    - Single Revenue = BRL299.06/patient
- Revenue Increase after using ML model
    - Total (for 19061 patient) = BRL455,205 = **USD85,172.63**
    - Single = BRL23.88/patient = USD5/patient

### Summary
- Random Forest save about USD 87,210.84
- XGB save about USD 85,172.63
- Random Forest is more beneficial

# Exporting Model

In [119]:
import joblib

In [132]:
filename = 'finalized_model.sav'
joblib.dump(RFC_tuning, filename)

['finalized_model.sav']

In [134]:
loaded_model = joblib.load('finalized_model.sav')
result = loaded_model.predict([[20, 0, 0, 0, 0, 0.3]])
print(result)

[1]


In [135]:
loaded_model = joblib.load(filename)
result = loaded_model.predict_proba([[20, 0, 0, 0, 0, 0.3]])
print(result)

[[0.33378682 0.66621318]]
