In [124]:
# usual imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline
import datetime as dt
from  scipy.stats import chisquare
from scipy.stats import chi2_contingency


# stats tests 
from scipy.stats import ttest_ind

# ml imports
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
# read in the data
data = pd.read_csv("KaggleV2-May-2016.csv", parse_dates=["AppointmentDay", "ScheduledDay"])

### Initial data cleaning 

In [3]:
# we see we have no missing values Nan's
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
PatientId         110527 non-null float64
AppointmentID     110527 non-null int64
Gender            110527 non-null object
ScheduledDay      110527 non-null datetime64[ns]
AppointmentDay    110527 non-null datetime64[ns]
Age               110527 non-null int64
Neighbourhood     110527 non-null object
Scholarship       110527 non-null int64
Hipertension      110527 non-null int64
Diabetes          110527 non-null int64
Alcoholism        110527 non-null int64
Handcap           110527 non-null int64
SMS_received      110527 non-null int64
No-show           110527 non-null object
dtypes: datetime64[ns](2), float64(1), int64(8), object(3)
memory usage: 11.8+ MB


In [4]:
# we see that in age we have a negative one. We will filter that out first. 
data.describe()

Unnamed: 0,PatientId,AppointmentID,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received
count,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0
mean,147496300000000.0,5675305.0,37.088874,0.098266,0.197246,0.071865,0.0304,0.022248,0.321026
std,256094900000000.0,71295.75,23.110205,0.297675,0.397921,0.258265,0.171686,0.161543,0.466873
min,39217.84,5030230.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4172614000000.0,5640286.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31731840000000.0,5680573.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,94391720000000.0,5725524.0,55.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999981600000000.0,5790484.0,115.0,1.0,1.0,1.0,1.0,4.0,1.0


In [5]:
# filter out negative age
data = data[data.Age >=0]

In [6]:
# This turns the No-show variable into a binary of true or false where true means a no-show.  
data['no_show_bi'] = data["No-show"]== 'Yes'

In [7]:
# Create a column that is the day of the week of the appointment 
data["appointment_day"] = data["AppointmentDay"].dt.weekday_name

In [8]:
# Wednesday is the most common day of the week for an appointment, and Saturday is the least common. 
data["appointment_day"].value_counts()

Wednesday    25867
Tuesday      25640
Monday       22714
Friday       19019
Thursday     17247
Saturday        39
Name: appointment_day, dtype: int64

In [9]:
# Remove the Saturday appointments as they are clearly outliers. 
data= data[data["appointment_day"] != "Saturday"]

In [10]:
# create a column for the time between when a patient scheduled the appointment to the appointment.
data["wait"] = (data["AppointmentDay"].dt.date - data["ScheduledDay"].dt.date)

In [11]:
# we see there are 5 appointments that seem to have a negative wait time. we will drop these next. 
data[data['wait'] < "0 days"]

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,no_show_bi,appointment_day,wait
27033,7839273000000.0,5679978,M,2016-05-10 10:51:53,2016-05-09,38,RESISTÊNCIA,0,0,0,0,1,0,Yes,True,Monday,-1 days
55226,7896294000000.0,5715660,F,2016-05-18 14:50:41,2016-05-17,19,SANTO ANTÔNIO,0,0,0,0,1,0,Yes,True,Tuesday,-1 days
64175,24252260000000.0,5664962,F,2016-05-05 13:43:58,2016-05-04,22,CONSOLAÇÃO,0,0,0,0,0,0,Yes,True,Wednesday,-1 days
71533,998231600000000.0,5686628,F,2016-05-11 13:49:20,2016-05-05,81,SANTO ANTÔNIO,0,0,0,0,0,0,Yes,True,Thursday,-6 days
72362,3787482000000.0,5655637,M,2016-05-04 06:50:57,2016-05-03,7,TABUAZEIRO,0,0,0,0,0,0,Yes,True,Tuesday,-1 days


In [12]:
# drop negative wait time rows. 
data = data[data['wait'] >= "0 days"]

In [13]:
# We see that a few neighbourhoods have very few appointments. 
# Neighboourhoods with less than 50 appointmnets are to be viewed as outliers. 
data.Neighbourhood.value_counts().tail(10)

UNIVERSITÁRIO                  152
SEGURANÇA DO LAR               145
NAZARETH                       135
MORADA DE CAMBURI               96
PONTAL DE CAMBURI               69
ILHA DO BOI                     35
ILHA DO FRADE                   10
AEROPORTO                        8
ILHAS OCEÂNICAS DE TRINDADE      2
PARQUE INDUSTRIAL                1
Name: Neighbourhood, dtype: int64

In [14]:
data.Neighbourhood.nunique()

81

In [15]:
# remove outlier neighbourhoods
remove_list = ["ILHA DO BOI", "ILHA DO FRADE", "AEROPORTO", "ILHAS OCEÂNICAS DE TRINDADE","PARQUE INDUSTRIAL"]
data= data[~data.Neighbourhood.isin(remove_list)]

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110426 entries, 0 to 110526
Data columns (total 17 columns):
PatientId          110426 non-null float64
AppointmentID      110426 non-null int64
Gender             110426 non-null object
ScheduledDay       110426 non-null datetime64[ns]
AppointmentDay     110426 non-null datetime64[ns]
Age                110426 non-null int64
Neighbourhood      110426 non-null object
Scholarship        110426 non-null int64
Hipertension       110426 non-null int64
Diabetes           110426 non-null int64
Alcoholism         110426 non-null int64
Handcap            110426 non-null int64
SMS_received       110426 non-null int64
No-show            110426 non-null object
no_show_bi         110426 non-null bool
appointment_day    110426 non-null object
wait               110426 non-null timedelta64[ns]
dtypes: bool(1), datetime64[ns](2), float64(1), int64(8), object(4), timedelta64[ns](1)
memory usage: 14.4+ MB


## Implement Binning System 
### Age Binning

In [17]:
def bin_age(df):
    '''Creates a function to turn age into 6 groups of binned ages.'''
                        # bin ranges 0-3, 4-7, 8-27, 28-40, 41-60, 61 < 
    if df["Age"] <4:
        return 2
    elif df["Age"] <8:
        return 5
    elif df["Age"] <28:
        return 16
    elif df["Age"] <41:
        return 34
    elif df["Age"] <61:
        return 51
    else:
        return 70
    

In [18]:
# apply binning function
data["Age_Binned"]= data.apply(bin_age, axis=1)

In [19]:
data["Age_Binned"].value_counts()

51    30052
16    26566
70    19732
34    19398
2      8943
5      5735
Name: Age_Binned, dtype: int64

#### Totals of the age bins.  
+ 8943 patients were 0-3 years of age. 
+ 5735 patients were 4-7 years of age. 
+ 26566 patients were 8-27 years of age. 
+ 19398 patients were 28-40 years of age. 
+ 19732 patients were 41-60 years of age. 
+ 30052 patients were 61+ years of age. 

### Wait time binning

In [20]:
# turn wait time from time delta to an int
data["days_waiting"] = data["wait"].dt.days

In [21]:
def bin_wait_time(df):
    '''Creates a function to turn wait_time into 5 groups of binned wait times .'''
                        # bin ranges 0, 1, 2-4, 5-9, 10 < 
    col="days_waiting"
    if df[col]   <1:
        return 0
    elif df[col] <2:
        return 1
    elif df[col] <5:
        return 3
    elif df[col] <10:
        return 7
    else:
        return 15

In [22]:
# Apply the binning function. 
data["Wait_Binned"]= data.apply(bin_wait_time, axis=1)

In [23]:
data.Wait_Binned.value_counts()

0     38536
15    35816
7     16145
3     14723
1      5206
Name: Wait_Binned, dtype: int64

### Totals of the waiting bins. 
+ 38536 patients had a wait of 0   days. 
+ 5206  patients had a wait of  1  day.
+ 14723 patients had a wait of 2-4 days.
+ 16145 patients had a wait of 5-9 days.
+ 35816 patients had a wait of 10+ days. 


# MACHINE LEARNING SECTION

Mentor notes: 
+ need to get data into correct format 
+ split / cross validation change –  logistic regression cv. 
+ create a machine learning pipeline -  MLP 
+ logistic regression. – over sampled / undersampled  ROV curve area under curve Grid search. 
+ knn
+ Random forest 
+ SVM - support vector machine 
+ Get some results full machine learning from begining to end. 


In [26]:
data.columns

Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
       'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show',
       'no_show_bi', 'appointment_day', 'wait', 'Age_Binned', 'days_waiting',
       'Wait_Binned'],
      dtype='object')

In [24]:
data.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,no_show_bi,appointment_day,wait,Age_Binned,days_waiting,Wait_Binned
0,29872500000000.0,5642903,F,2016-04-29 18:38:08,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No,False,Friday,0 days,70,0,0
1,558997800000000.0,5642503,M,2016-04-29 16:08:27,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No,False,Friday,0 days,51,0,0
2,4262962000000.0,5642549,F,2016-04-29 16:19:04,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No,False,Friday,0 days,70,0,0
3,867951200000.0,5642828,F,2016-04-29 17:29:31,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,False,Friday,0 days,16,0,0
4,8841186000000.0,5642494,F,2016-04-29 16:07:23,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No,False,Friday,0 days,51,0,0


## Wrangle the data into a form where ml can be applied

#### Run only one of the next two cells. 

In [127]:
# Run cell to run ML on all statistically significant predictor variables. 
X_create= data.set_index("AppointmentID")[['Neighbourhood', 'Scholarship', 'Hipertension',
       'Diabetes','SMS_received','appointment_day', 'Age_Binned', 'Wait_Binned']]
X_create["Age_Binned"]=X_create["Age_Binned"].astype(str)
X_create["Wait_Binned"]=X_create["Wait_Binned"].astype(str)

y_create = data.set_index("AppointmentID")[["No-show"]]
y_create["No-show"][y_create["No-show"]=="Yes"] = 1
y_create["No-show"][y_create["No-show"]=="No"]  = 0

X_create.head()

Unnamed: 0_level_0,Neighbourhood,Scholarship,Hipertension,Diabetes,SMS_received,appointment_day,Age_Binned,Wait_Binned
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5642903,JARDIM DA PENHA,0,1,0,0,Friday,70,0
5642503,JARDIM DA PENHA,0,0,0,0,Friday,51,0
5642549,MATA DA PRAIA,0,0,0,0,Friday,70,0
5642828,PONTAL DE CAMBURI,0,0,0,0,Friday,16,0
5642494,JARDIM DA PENHA,0,1,1,0,Friday,51,0


In [135]:
# Run cell to run ML on only the best predictor variables.
X_create= data.set_index("AppointmentID")[["SMS_received","Age_Binned", "Wait_Binned","Neighbourhood"]]
X_create["Age_Binned"]=X_create["Age_Binned"].astype(str)
X_create["Wait_Binned"]=X_create["Wait_Binned"].astype(str)
y_create = data.set_index("AppointmentID")[["No-show"]]
y_create["No-show"][y_create["No-show"]=="Yes"] = 1
y_create["No-show"][y_create["No-show"]=="No"]  = 0
X_create.head()

Unnamed: 0_level_0,SMS_received,Age_Binned,Wait_Binned,Neighbourhood
AppointmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5642903,0,70,0,JARDIM DA PENHA
5642503,0,51,0,JARDIM DA PENHA
5642549,0,70,0,MATA DA PRAIA
5642828,0,16,0,PONTAL DE CAMBURI
5642494,0,51,0,JARDIM DA PENHA


In [136]:
X = pd.get_dummies(X_create, drop_first=True).values
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [137]:
# This puts the y values which are our targets into the correct format for ml 
y = np.array(list(y_create["No-show"]))
y

array([0, 0, 0, ..., 0, 0, 0])

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)

### Logistic Regression 

In [131]:
# run with all variables.  -- slightly slightly better AUC_ROC max than with the top variables. 
searchCV = LogisticRegressionCV(cv=10)
searchCV.fit(X_train, y_train)
print ('Max auc_roc:', searchCV.scores_[1].max())

Max auc_roc: 0.798744416274


In [132]:
# run with all variables. 
searchCV.score(X_test,y_test)

0.79860180389031765

In [139]:
# run with top variables. 
searchCV = LogisticRegressionCV(cv=10)
searchCV.fit(X_train, y_train)
print ('Max auc_roc:', searchCV.scores_[1].max())

Max auc_roc: 0.798695809685


In [140]:
# run with top variables. 
searchCV.score(X_test,y_test)

0.79860180389031765

#### Trying to Implement ml pipeline 

#### from DATA CAMP FOR AN EXAMPLE. #### 
######  Setup the pipeline steps: steps
steps = [('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier())]
        
######  Create the pipeline: pipeline
pipeline = Pipeline(steps)

######  Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

###### Fit the pipeline to the training set: knn_scaled
knn_scaled = pipeline.fit(X_train,y_train)

###### Instantiate and fit a k-NN classifier to the unscaled data
knn_unscaled = KNeighborsClassifier().fit(X_train, y_train)

2nd EXAMPLE from DATA CAMP 
#####  Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('SVM', SVC())]

pipeline = Pipeline(steps)

##### Specify the hyperparameter space

parameters = {'SVM__C':[1, 10, 100], 'SVM__gamma':[0.1, 0.01]}

#####  Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2 , random_state=21)

#####  Instantiate the GridSearchCV object: cv
cv = GridSearchCV(pipeline, parameters)

##### Fit to the training set
cv.fit(X_train,y_train)

#####  Predict the labels of the test set: y_pred
y_pred = cv.predict(X_test)

###### Compute and print metrics
print("Accuracy: {}".format(cv.score(X_test, y_test)))

print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))

## Impliment K-nearest neighbors Classifier

In [120]:
# run with the top variables 
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.score(X_test,y_test)

0.7661462672510595

In [133]:
# Run with all reasonable variables : this takes much longer with more variables. - slightly better score with all links
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.score(X_test,y_test)

0.76875430144528567

## Support Vector Machine 

In [123]:
# this took a very long time to run... over 9000!!!!!!  ( Ben- what is wrong with this....? ) or does it just take minutes... 
# this is way slower than all the other ones. 
steps = [("SVM", SVC())]
pipeline= Pipeline(steps)
pipeline.fit(X_train, y_train)
# Predict the labels of the test set
y_pred = pipeline.predict(X_test)

# Compute metrics
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.80      1.00      0.89     22047
          1       0.00      0.00      0.00      5560

avg / total       0.64      0.80      0.71     27607



  'precision', 'predicted', average, warn_for)


## Random Forest Classifier

In [125]:
# run with top predictors 
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.79037925163907707

In [134]:
# run with all predictors -- This is worse with all predictors.
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.76259644293114071