In [125]:
import warnings 
warnings.filterwarnings('ignore')

In [126]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

## 1. Loading Data and Inspecting

In [127]:
data = pd.read_csv('KaggleV2-May-2016.csv')

In [128]:
data.describe()

Unnamed: 0,PatientId,AppointmentID,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received
count,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0
mean,147496300000000.0,5675305.0,37.088874,0.098266,0.197246,0.071865,0.0304,0.022248,0.321026
std,256094900000000.0,71295.75,23.110205,0.297675,0.397921,0.258265,0.171686,0.161543,0.466873
min,39217.84,5030230.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4172614000000.0,5640286.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31731840000000.0,5680573.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,94391720000000.0,5725524.0,55.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999981600000000.0,5790484.0,115.0,1.0,1.0,1.0,1.0,4.0,1.0


In [129]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
PatientId         110527 non-null float64
AppointmentID     110527 non-null int64
Gender            110527 non-null object
ScheduledDay      110527 non-null object
AppointmentDay    110527 non-null object
Age               110527 non-null int64
Neighbourhood     110527 non-null object
Scholarship       110527 non-null int64
Hipertension      110527 non-null int64
Diabetes          110527 non-null int64
Alcoholism        110527 non-null int64
Handcap           110527 non-null int64
SMS_received      110527 non-null int64
No-show           110527 non-null object
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


The dataset is imbalanced (approx. 80/20 ratio of No/Yes).  Measuring classifier accuracy is probably not useful unless the accuracy is well over 80%, or the dataset is resampled.  This can be done after preparing data for modelling?

In [130]:
no = len(data[data['No-show']=='No'])
yes = len(data[data['No-show']=='Yes'])
total = len(data)

print('Number of Rows with No-Show = Yes: ', yes,'(',np.round(yes/total*100,2),'%), Number of rows with No-show = No: ', no,'(',np.round(no/total*100,2),'%)')

Number of Rows with No-Show = Yes:  22319 ( 20.19 %), Number of rows with No-show = No:  88208 ( 79.81 %)


## 2. Editing/Adding Features and Labels

**New Features Created:**

"days_to_appt" - days between scheduling date and appt date (numerical)

"day_of_appt" - day of the week of the appt (categorical)

"month_of_appt" - month of the year of the appt (categorical)

"day_of_scheduling" - day of week appt was scheduled on (categorical)

"month_of_scheduling" - month of year appt was scheduled on (categorical)

**Final Features/Labels of Interest:**

Numerical: Age, days_to_appt, Handcap

Categorical: Gender, Hipertension, Diabetes, Alcoholism, SMS_received, day_of_appt, month_of_appt, day_of_scheduling, month_of_scheduling 

Labels: No-show (Yes=1, No=0)



In [131]:
# convert date objects to datetime format (without time part)
data['ScheduledDayDT'] = pd.to_datetime(data['ScheduledDay']).dt.date
data['ScheduledDayDT'] = pd.to_datetime(data['ScheduledDayDT'])
data['AppointmentDayDT'] = pd.to_datetime(data['AppointmentDay'])

In [132]:
# create new feature - "days_to_appt" - no. of days between scheduled date and appt date
data['time_between_appts'] = data['AppointmentDayDT']-data['ScheduledDayDT']
data['days_to_appt'] = data['time_between_appts']/np.timedelta64(1,'D')

In [133]:
# create new features - day of the week and month of the year of the scheduling and appointment dates
data['day_of_appt'] = data['AppointmentDayDT'].dt.dayofweek
data['day_of_scheduling'] = data['ScheduledDayDT'].dt.dayofweek
data['month_of_appt'] = data['AppointmentDayDT'].dt.month
data['month_of_scheduling'] = data['ScheduledDayDT'].dt.month

In [134]:
# map labels to 1/0
data['Labels'] = data['No-show'].map({'Yes': 1, 'No': 0})

In [135]:
# keep only desired features and labels
data_clean = data[['Gender','Age','Hipertension','Diabetes','Alcoholism','Handcap','SMS_received',
                  'Labels','day_of_appt','day_of_scheduling','days_to_appt','month_of_appt','month_of_scheduling']]

In [136]:
data_clean.head()

Unnamed: 0,Gender,Age,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,Labels,day_of_appt,day_of_scheduling,days_to_appt,month_of_appt,month_of_scheduling
0,F,62,1,0,0,0,0,0,4,4,0.0,4,4
1,M,56,0,0,0,0,0,0,4,4,0.0,4,4
2,F,62,0,0,0,0,0,0,4,4,0.0,4,4
3,F,8,0,0,0,0,0,0,4,4,0.0,4,4
4,F,56,1,1,0,0,0,0,4,4,0.0,4,4


## 3. Prepare for Modelling

The cleaned data is divided into train/test sets (70/30) ratio. OneHotEncoder() is used for all categorical features and StandardScaler() for numerical features.

In [137]:
# This function prepares the datafame, dividing into train/test, seperating features from labels, and scaling/one hot encoding
# Input is dataframe and test ratio.  Ouput is train/test sets divided into features and labels.
def data_prepare(data,test_ratio):
    # split into train/test
    train_set,test_set = train_test_split(data,test_size=test_ratio)
    print(len(train_set), "train +", len(test_set), "test")
    
    # divide into features/labels
    train_features = train_set.drop('Labels',axis=1)
    train_labels = train_set[['Labels']]
    test_features = test_set.drop('Labels',axis=1)
    test_labels = test_set[['Labels']]
    
    # transform numerical and categorical features using StandardScaler() and OneHotEncoder()
    num_attribs = ['Age','days_to_appt','Handcap']
    cat_attribs = ['Gender','Hipertension','Diabetes','Alcoholism','SMS_received','day_of_appt',
              'month_of_appt','day_of_scheduling','month_of_scheduling']
    pipeline_features = ColumnTransformer([("num",StandardScaler(),num_attribs),("cat",OneHotEncoder(handle_unknown='ignore'),cat_attribs)])
    
    train_features_prepared = pipeline_features.fit_transform(train_features)
    test_features_prepared = pipeline_features.transform(test_features)
    train_labels_prepared = train_labels.values
    test_labels_prepared = test_labels.values
    
    return (train_features_prepared,test_features_prepared,train_labels_prepared,test_labels_prepared)

In [138]:
train_features,test_features,train_labels,test_labels = data_prepare(data_clean,0.3)

77368 train + 33159 test


## 4. Testing Some Models

A few classifiers are trained below, with 3 cross-validation folds. 

For KNN, sqrt(n_samples) was used as n_neighbors, as usually suggested in literature.


In [141]:
# This Function Runs Several Classifiers on Training Data with 3 cv-folds, Prints Out "metric" score of each Model
 
def testClassifiers(train_features,train_labels,metric):
    LOG=LogisticRegression()
    DTC=DecisionTreeClassifier()
    RFC=RandomForestClassifier()
    KNN=KNeighborsClassifier(n_neighbors=int(np.sqrt(train_labels.size/3).round()))
    SGD=SGDClassifier()
    #KNN=KNeighborsClassifier(n_neighbors=5)
    
    classifier_list = [LOG,DTC,RFC,KNN,SGD]
    
    for c in classifier_list:
        cv_results = cross_validate(c,train_features,train_labels,cv=3,scoring=metric)
        print(type(c).__name__)
        print('-------------------')
        print('Scoring Metric: ', metric)
        print('Scores: ',cv_results['test_score'])
        print('Mean Score: ',np.round(np.mean(cv_results['test_score']),2))
        print('')
    

In [142]:
testClassifiers(train_features,train_labels,'roc_auc')

LogisticRegression
-------------------
Scoring Metric:  roc_auc
Scores:  [0.67481993 0.66651012 0.66848624]
Mean Score:  0.67

DecisionTreeClassifier
-------------------
Scoring Metric:  roc_auc
Scores:  [0.55436244 0.5559     0.55604507]
Mean Score:  0.56

RandomForestClassifier
-------------------
Scoring Metric:  roc_auc
Scores:  [0.64513755 0.64566076 0.64448238]
Mean Score:  0.65

KNeighborsClassifier
-------------------
Scoring Metric:  roc_auc
Scores:  [0.71843994 0.7135111  0.70972577]
Mean Score:  0.71

SGDClassifier
-------------------
Scoring Metric:  roc_auc
Scores:  [0.63682229 0.58291512 0.55214536]
Mean Score:  0.59



K-Neighbors gave the best score so far. 

## 5. Resampling

I tried to resample (under-resampling) the dataset to make use of accuracy as a performance metric.  Excess data in the No-show = "No" class was eliminated until the two classes are split 50/50.

In [143]:
data_clean_yes = data_clean[data_clean['Labels']==1]
data_clean_no = data_clean[data_clean['Labels']==0].sample(yes) # yes/no amounts were calculated in section 1
data_clean_5050 = pd.concat([data_clean_yes,data_clean_no])

Prepare data and test several classifiers on training data as was done before, but looking at accuracy...

In [144]:
train_features5050,test_features5050,train_labels5050,test_labels5050 = data_prepare(data_clean_5050,0.3)

31246 train + 13392 test


In [145]:
testClassifiers(train_features5050,train_labels5050,'accuracy')

LogisticRegression
-------------------
Scoring Metric:  accuracy
Scores:  [0.63786482 0.63139702 0.63264522]
Mean Score:  0.63

DecisionTreeClassifier
-------------------
Scoring Metric:  accuracy
Scores:  [0.60243856 0.60758521 0.60480077]
Mean Score:  0.6

RandomForestClassifier
-------------------
Scoring Metric:  accuracy
Scores:  [0.61443932 0.6131541  0.61641863]
Mean Score:  0.61

KNeighborsClassifier
-------------------
Scoring Metric:  accuracy
Scores:  [0.66042627 0.64896783 0.65472876]
Mean Score:  0.65

SGDClassifier
-------------------
Scoring Metric:  accuracy
Scores:  [0.62461598 0.61325012 0.61757081]
Mean Score:  0.62



Once again, KNN performed the best.  

Next, tuning hyperpameters of KNN?  Or going back and looking more closely at importance of different features.