In [1]:
import pandas as pd
import numpy as np
import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the raw appointment records into the working dataframe
df = pd.read_csv(filepath_or_buffer='...KaggleV2-May-2016.csv')

In [3]:
# Preview the first five raw rows
df.head(n=5)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [4]:
# Fix misspelled column names and give the target column a clearer name
column_renames = {
    'Hipertension': 'Hypertension',
    'Handcap': 'Handicap',
    'SMS_received': 'SMSReceived',
    'No-show': 'Showup',
}
df = df.rename(columns=column_renames)

In [5]:
# Encode the target: 'No' (no no-show, i.e. the patient came) -> 1, anything else -> 0
df['Showup'] = (df['Showup'] == 'No').astype('int64')

In [6]:
# Remove the identifier columns; they are not used as model features
id_columns = ['PatientId', 'AppointmentID']
df = df.drop(columns=id_columns)

In [7]:
# Show the first three rows after renaming and dropping columns
df.head(3)

Unnamed: 0,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,Showup
0,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,1
1,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,1
2,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,1


In [8]:
# Parse both timestamp columns and keep only the calendar date,
# stored as 'datetime64[ns]' (the time of day is discarded)
for day_col in ('ScheduledDay', 'AppointmentDay'):
    parsed_days = pd.to_datetime(df[day_col])
    df[day_col] = parsed_days.dt.date.astype('datetime64[ns]')

In [9]:
# Get the day-of-week name (e.g. 'Friday') for ScheduledDay and AppointmentDay.
# `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` is the
# supported accessor and returns the same English day names.
df['ScheduledDOW'] = df['ScheduledDay'].dt.day_name()
df['AppointmentDOW'] = df['AppointmentDay'].dt.day_name()

In [10]:
# Guard against negative waiting times: whenever the appointment date lies
# before the scheduling date, replace it with the scheduling date.
# NOTE(review): AppointmentDOW was derived before this adjustment, so for the
# adjusted rows it still reflects the original appointment date - confirm intent.
booked_before_scheduled = df['AppointmentDay'] < df['ScheduledDay']
df['AppointmentDay'] = df['AppointmentDay'].mask(booked_before_scheduled, df['ScheduledDay'])

# Waiting time between scheduling and appointment, in whole days
df['WaitingTimeDays'] = (df['AppointmentDay'] - df['ScheduledDay']).dt.days

In [11]:
# Derive Year and Month feature columns from ScheduledDay and AppointmentDay,
# then drop the two datetime columns themselves.
# The previous version also overwrote each datetime column with its
# day-of-month and dropped it on the very next line; that dead assignment is
# removed here. The resulting columns and their order are unchanged.
df['ScheduledYear'] = df['ScheduledDay'].dt.year
df['ScheduledMonth'] = df['ScheduledDay'].dt.month

df['AppointmentYear'] = df['AppointmentDay'].dt.year
df['AppointmentMonth'] = df['AppointmentDay'].dt.month

# Rebind instead of using inplace=True so the cell reads as a plain transform
df = df.drop(['ScheduledDay', 'AppointmentDay'], axis=1)

In [12]:
# Show the first three rows with the derived date features
df.head(3)

Unnamed: 0,Gender,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,Showup,ScheduledDOW,AppointmentDOW,WaitingTimeDays,ScheduledYear,ScheduledMonth,AppointmentYear,AppointmentMonth
0,F,62,JARDIM DA PENHA,0,1,0,0,0,0,1,Friday,Friday,0,2016,4,2016,4
1,M,56,JARDIM DA PENHA,0,0,0,0,0,0,1,Friday,Friday,0,2016,4,2016,4
2,F,62,MATA DA PRAIA,0,0,0,0,0,0,1,Friday,Friday,0,2016,4,2016,4


In [13]:
# Print the distinct values found in each categorical / flag column
columns_to_inspect = [
    'Gender',
    'Scholarship',
    'Hypertension',
    'Diabetes',
    'Alcoholism',
    'Handicap',
    'SMSReceived',
]
for flag_col in columns_to_inspect:
    print(flag_col, df[flag_col].unique())

Gender ['F' 'M']
Scholarship [0 1]
Hypertension [1 0]
Diabetes [0 1]
Alcoholism [0 1]
Handicap [0 1 2 3 4]
SMSReceived [0 1]


In [14]:
# Summary statistics of the Age column (look at min/max for implausible values)
df['Age'].describe()

count    110527.000000
mean         37.088874
std          23.110205
min          -1.000000
25%          18.000000
50%          37.000000
75%          55.000000
max         115.000000
Name: Age, dtype: float64

In [15]:
# Locate the row holding the smallest (negative) Age value.
# `idxmin()` returns the index *label*, which is what `DataFrame.drop` needs;
# `argmin()` returns an integer position in current pandas and only coincides
# with the label while the index is an untouched RangeIndex.
df['Age'].idxmin()

99832

In [16]:
# Delete every row with a negative Age value.
# Selecting by condition instead of the hard-coded label 99832 keeps the cell
# correct if the input file changes and makes it safe to re-run (dropping a
# label that is already gone raises KeyError).
negative_age_rows = df.index[df['Age'] < 0]
df = df.drop(negative_age_rows, axis=0)

In [17]:
# Re-check the Age summary after removing the invalid row
df['Age'].describe()

count    110526.000000
mean         37.089219
std          23.110026
min           0.000000
25%          18.000000
50%          37.000000
75%          55.000000
max         115.000000
Name: Age, dtype: float64

In [18]:
# Replace string labels with integer codes 0..n-1, one column at a time.
# The single encoder is refitted for every column, so after this cell it only
# remembers the classes of the last column processed.
encode = LabelEncoder()
for label_col in ['Gender', 'Neighbourhood', 'ScheduledDOW', 'AppointmentDOW']:
    df[label_col] = encode.fit_transform(df[label_col])

In [19]:
# Show the first three rows after label encoding
df.head(3)

Unnamed: 0,Gender,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,Showup,ScheduledDOW,AppointmentDOW,WaitingTimeDays,ScheduledYear,ScheduledMonth,AppointmentYear,AppointmentMonth
0,0,62,39,0,1,0,0,0,0,1,0,0,0,2016,4,2016,4
1,1,56,39,0,0,0,0,0,0,1,0,0,0,2016,4,2016,4
2,0,62,45,0,0,0,0,0,0,1,0,0,0,2016,4,2016,4


In [20]:
# Column-wise minimum: any value below zero would show up here
df.min(axis=0)

Gender                 0
Age                    0
Neighbourhood          0
Scholarship            0
Hypertension           0
Diabetes               0
Alcoholism             0
Handicap               0
SMSReceived            0
Showup                 0
ScheduledDOW           0
AppointmentDOW         0
WaitingTimeDays        0
ScheduledYear       2015
ScheduledMonth         1
AppointmentYear     2016
AppointmentMonth       4
dtype: int64

In [21]:
# Correlate every feature column with Showup and remember the ones whose
# correlation magnitude is at least 0.001 (Showup itself is skipped).
# Columns whose correlation is NaN fail the comparison and are left out.
important_data = []
target = df['Showup']
for column in df.columns:
    if column == 'Showup':
        continue
    strength = df[column].corr(target)
    if abs(strength) >= 0.001:
        important_data.append(column)
        print(column,' ~ ',strength)

Gender  ~  0.004121987742705606
Age  ~  0.06032682272291318
Neighbourhood  ~  0.008957833409124967
Scholarship  ~  -0.029133576642242844
Hypertension  ~  0.035703513860921986
Diabetes  ~  0.01518123291701338
Handicap  ~  0.00607684658243918
SMSReceived  ~  -0.12642794329912407
AppointmentDOW  ~  0.01307457303347002
WaitingTimeDays  ~  -0.18624104494596053
ScheduledYear  ~  0.006168064399485666
ScheduledMonth  ~  0.16085632039567063
AppointmentMonth  ~  0.020879106206829368


In [22]:
# Build the model input from the columns that passed the correlation filter,
# keeping them in their original dataframe order
kept_columns = [column for column in df.columns if column in important_data]
input_data = df[kept_columns]
        

In [23]:
# Show the first two rows of the selected feature columns
input_data.head(2)

Unnamed: 0,Gender,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Handicap,SMSReceived,AppointmentDOW,WaitingTimeDays,ScheduledYear,ScheduledMonth,AppointmentMonth
0,0,62,39,0,1,0,0,0,0,0,2016,4,4
1,1,56,39,0,0,0,0,0,0,0,2016,4,4


In [24]:
# Target vector: 1 when the patient showed up, 0 otherwise
output_data = df.loc[:, 'Showup']

In [25]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    input_data,
    output_data,
    test_size=0.20,
    random_state=0,
)

In [26]:
# Run MultinomialNB classification and show its accuracy on the held-out test set.
# The stray `MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)` line
# (apparently a pasted repr) built and discarded an unused estimator; it has
# been removed. The fitted model and its score are unaffected.
mnb_clf = MultinomialNB()
mnb_clf.fit(X_train, y_train)
mnb_clf.score(X_test, y_test)

0.7118429385687144

In [27]:
# Fit a decision tree (fixed seed) and report its accuracy on the test set
dt_clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
dt_test_accuracy = dt_clf.score(X_test, y_test)
print('prediction ~', dt_test_accuracy)

prediction ~ 0.7214330950873066


In [28]:
# Fit a random forest (fixed seed) and report its accuracy on the test set
rf_clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
rf_test_accuracy = rf_clf.score(X_test, y_test)
print('prediction ~', rf_test_accuracy)

prediction ~ 0.7567628698091016


In [29]:
# Run random forest classification combined with grid search.
# NOTE: every parameter below lists a single value, so the grid contains one
# candidate and nothing is actually searched; add more values per key to tune.
params = {'n_estimators': [10], 'max_depth': [5], 'min_samples_split': [5]}
rf_clf = RandomForestClassifier(random_state=0)
clf_grid = GridSearchCV(rf_clf, params, cv=5, n_jobs=-1)
clf_grid.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy over folds of the TRAINING
# data, so it is not comparable with the test-set accuracies printed for the
# other models.
print('prediction ~', clf_grid.best_score_)

# Accuracy of the refitted best estimator on the held-out test set, which is
# the figure to compare against the other classifiers.
print('test accuracy ~', clf_grid.score(X_test, y_test))

prediction ~ 0.7973083012893011
