<a href="https://colab.research.google.com/github/cwcala/sharing-notebooks/blob/master/Copie_de_noshow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
import seaborn as sb
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
import cufflinks
import cufflinks as cf
init_notebook_mode(connected=True)
cufflinks.go_offline(connected=True)
import gc
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
rcParams['figure.figsize']= 5,4
sb.set_style('whitegrid')


In [0]:
noshow = pd.read_csv('/content/KaggleV2-May-2016.csv')
noshow.head()

In [0]:
noshow.columns = ['PatientId','AppointmentId','Gender','ScheduledDate','AppointmentDate','Age','Neighborhood','Scholarship','Hypertension','Diabetes','Alcoholism','Handicap','SMS_received','No-show']

In [0]:
age = noshow.Age
plt.plot(age)

In [0]:
noshow[['Age']].describe()

In [0]:
sb.pairplot(noshow,hue='No-show',palette='hls')

In [0]:
ns = noshow.dropna()

In [0]:
X = ns.iloc[:,2:12].values
y = ns.iloc[:,13].values
ns.head()

In [0]:
# Box plots of the frame's numeric columns, used to eyeball outliers.
noshow.boxplot(return_type='dict')
# Render the figure; a bare plt.plot() draws nothing and only echoes an empty list.
plt.show()

In [0]:
# Build the mask from the same frame it is applied to: `X` was extracted from
# `ns`, so a mask derived from it is only aligned with `ns`, not with `noshow`
# (the two differ as soon as dropna() removes a row). Selecting the column by
# name also avoids depending on its position inside `X`.
Age = ns['Age']
Age_outliers = (Age > 100)
ns[Age_outliers]

In [0]:
sms = X[:10]

In [0]:
ns.isna().sum()

In [0]:
print(f"Unique Patients: {ns.PatientId.nunique()} Appointments: {ns.AppointmentId.nunique()} Avg Appointments per Patient: {int(ns.AppointmentId.nunique())/int(ns.PatientId.nunique())}")

In [0]:
def plotly_plots(df, column, plot_type='bar', title=None, xTitle=None, yTitle=None):
    """Plot how often each distinct value occurs in ``df[column]``.

    The value counts of the column are drawn through the ``iplot`` method of
    the resulting object, forwarding ``plot_type`` as ``kind`` together with
    the title and axis titles.
    """
    counts = df[column].value_counts()
    plot_options = dict(kind=plot_type, title=title, xTitle=xTitle, yTitle=yTitle)
    counts.iplot(**plot_options)

In [0]:
def enable_plotly_in_cell():
  """Inject require.js into the current cell output and initialise plotly offline mode.

  Called at the top of every cell that renders a plotly/cufflinks figure.
  """
  # Import from the public IPython.display API instead of relying on the
  # `display` name being injected into the notebook namespace.
  from IPython.display import HTML, display
  from plotly.offline import init_notebook_mode
  display(HTML('''<script src="/static/components/requirejs/require.js"></script>'''))
  init_notebook_mode(connected=False)

In [0]:
enable_plotly_in_cell()
plotly_plots(ns, 'No-show', title='distribution show vs noshow',
             xTitle='No-Show', 
             yTitle='Count')

In [0]:
enable_plotly_in_cell()
# Pass the category labels as x so each bar is labelled with its No-show value
# instead of an anonymous 0/1 position.
no_show_counts = ns['No-show'].value_counts()
fig = go.Figure(
    data=[go.Bar(x=no_show_counts.index, y=no_show_counts.values)],
    layout_title_text="No-show data"
)
fig.show()

In [0]:
enable_plotly_in_cell()
fig = px.histogram(ns, x="Age")
fig.show()

In [0]:
ns.Age.describe()

In [0]:
ns.Age.value_counts()

In [0]:
# Keep ages in [0, 102]. The .copy() makes `ns` an independent frame so the
# column assignments in later cells do not write into a slice of the old one.
ns = ns[ns['Age'].between(0, 102)].copy()

In [0]:
bins= [0,14,24,64,115]
labels = ['Child','Youth','Adult','Senior']
ns['AgeGroup'] = pd.cut(ns['Age'], bins=bins, labels=labels, right=False)

In [0]:
ages = pd.crosstab(ns['AgeGroup'], ns['No-show'], normalize='index')
stacked = ages.unstack().reset_index().rename(columns={0:'value'})
plt.figure(figsize=(16,12))
plt.subplot(211)
ax1 = sb.countplot(x="AgeGroup", data=ns)
ax1.set_title("Count by Age Category", fontsize=22)
ax1.set_xlabel("Age Categories", fontsize=18)
ax1.set_ylabel("Count", fontsize=18)

In [0]:
# Share of shows / no-shows within each age group (`stacked` holds row-normalised values).
plt.subplot(212)
ax2 = sb.barplot(x=stacked.AgeGroup, y=stacked.value, hue=stacked['No-show'])
ax2.set_title("No Show by Age Categories", fontsize=22)
ax2.set_xlabel("Age Categories", fontsize=18)
# The crosstab is normalised per row, so the bars are proportions, not counts.
ax2.set_ylabel("Proportion", fontsize=18)
# 'out' is not a valid legend location; 'best' lets matplotlib choose the placement.
ax2.legend(loc='best')

In [0]:
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20,15))
fig.subplots_adjust(hspace=0.3)
fig.suptitle('No-show for binary features', fontsize=22)
bf = ['Scholarship', 'Hypertension', 'Diabetes','Alcoholism', 'SMS_received', 'Gender']
for ax, catplot in zip(axes.flatten(), ns[bf].columns):
    sb.countplot(x=catplot, data=ns, hue='No-show', ax=ax)
    ax.set_title(catplot.upper(), fontsize=18)
    ax.set_ylabel('Count', fontsize=16)
    ax.set_xlabel(f'{catplot.upper()} binary features', fontsize=15)
    ax.legend(title='No-show', fontsize=12, )

In [0]:
#turn dates into datetime objects
ns['ScheduledDate']= pd.to_datetime(ns['ScheduledDate']) 
ns['AppointmentDate']= pd.to_datetime(ns['AppointmentDate']) 

In [0]:
sms_1 = ns.groupby([ns['AppointmentDate'].dt.date,"SMS_received", "No-show"])['PatientId'].count().reset_index().rename(columns={'PatientId': "Total"})

In [0]:
# Three stacked panels: daily totals without SMS, daily totals with SMS,
# and the age distribution split by SMS / No-show.
plt.figure(figsize=(18,16))
plt.subplot(3,1,1)
g = sb.barplot(x='AppointmentDate', y= 'Total', hue='No-show', data=sms_1[sms_1['SMS_received'] == 0])
g.set_xticklabels(g.get_xticklabels(),rotation=45)
g.set_title("No-shows for patients not getting SMS", fontsize=22)
g.set_xlabel("Dates", fontsize=18)
g.set_ylabel("Count", fontsize=18)
plt.subplot(3,1,2)
g1 = sb.barplot(x='AppointmentDate', y= 'Total', hue='No-show', data=sms_1[sms_1['SMS_received'] == 1])
# Rotate this panel's own tick labels; reusing the first panel's labels would
# mislabel the axis whenever the two subsets cover different dates.
g1.set_xticklabels(g1.get_xticklabels(),rotation=45)
g1.set_title("No-shows for patients getting SMS", fontsize=22)
g1.set_xlabel("Dates", fontsize=18)
g1.set_ylabel("Count", fontsize=18)
plt.subplot(3,1,3)
g2 = sb.boxplot(x='SMS_received', y= 'Age', hue='No-show', data=ns)
g2.set_xticklabels(g2.get_xticklabels(),rotation=0)
g2.set_title("No-shows by SMS and Age", fontsize=22)
g2.set_xlabel("Got SMS or not", fontsize=18)
g2.set_ylabel("Age Distribution", fontsize=18)
plt.subplots_adjust(hspace = 0.6)
plt.show()

In [0]:
# Number of appointments per (patient, SMS flag, outcome). The count lands in
# the 'AppointmentId' column, which the following cells sort by. The grouping
# key 'PatientId' keeps its name; it was previously relabelled "Total", which
# mislabelled the identifier column as if it were the count.
appts = ns.groupby(['PatientId', 'SMS_received', 'No-show'])['AppointmentId'].count().reset_index()

In [0]:
sms = [1]
nosho = ['Yes']
graf = appts[appts.SMS_received.isin(sms) & 
           appts['No-show'].isin(nosho)].sort_values(['AppointmentId'], ascending=[False])

In [0]:
graf

In [0]:
test = appts.sort_values(['AppointmentId'], ascending=[False])

In [0]:
test

In [0]:
ns.Neighborhood.value_counts()

In [0]:
neigh = pd.crosstab(ns['Neighborhood'], ns['No-show'], normalize='index')
stacked = neigh.unstack().reset_index().rename(columns={0:'value'})
plt.figure(figsize=(16,12))
plt.subplot(211)
ax1 = sb.countplot(x="Neighborhood", data=ns)
ax1.set_title("Count by Neighborhood", fontsize=22)
ax1.set_xlabel("Neghborhood", fontsize=18)
ax1.set_ylabel("Count", fontsize=18)

In [0]:
# Share of shows / no-shows within each neighborhood (`stacked` holds row-normalised values).
plt.figure(figsize=(16,12))
ax2 = sb.barplot(x=stacked.Neighborhood, y=stacked.value, hue=stacked['No-show'])
ax2.set_title("No Show by Neighborhood", fontsize=22)
ax2.set_xlabel("Neighborhood", fontsize=18)
# The crosstab is normalised per row, so the bars are proportions, not counts.
ax2.set_ylabel("Proportion", fontsize=18)
# 'out' is not a valid legend location; 'best' lets matplotlib choose the placement.
ax2.legend(loc='best')

In [0]:
test = neigh.sort_values('Yes', ascending=False)
test

In [0]:
plt.hist(test.Yes)

In [0]:
# Appointment totals per (neighborhood, outcome).
neigh1 = ns.groupby(['Neighborhood', 'No-show'])['PatientId'].count().reset_index().rename(columns={'PatientId': "Total"})
plt.figure(figsize=(18,16))
plt.subplot(3,1,1)
# 'No-show' still holds the strings 'Yes'/'No' at this point (it is only mapped
# to 0/1 further down), so comparing against 1 selected nothing and produced an
# empty plot.
g = sb.barplot(x='Neighborhood', y= 'Total', hue='No-show', data=neigh1[neigh1['No-show'] == 'Yes'])
g.set_xticklabels(g.get_xticklabels(),rotation=45)
g.set_title("No-shows by Neighborhood", fontsize=22)
g.set_xlabel("Neighborhood", fontsize=18)
g.set_ylabel("Count", fontsize=18)

In [0]:
neigh1

In [0]:
plt.bar(neigh1.Neighborhood,neigh1.Total)

In [0]:
neigh2 = neigh1.loc[neigh1["No-show"] == 'Yes']
neigh2

In [0]:
neigh2.sort_values("Total",ascending=False)

In [0]:
plt.plot(neigh2.Neighborhood,neigh2.Total)


In [0]:
neigh3= neigh1.loc[neigh1["No-show"] == 'No']
neigh3.sort_values("Total",ascending=False)

In [0]:
plt.plot(neigh3.Neighborhood,neigh3.Total)


In [0]:
unineigh = ns['Neighborhood'].unique().tolist()

In [0]:
unineigh

In [0]:
len(unineigh)

Listing of neighborhoods and their corresponding administrative regions, used to classify the 81 neighborhoods into 10 administrative regions:
https://pt.wikipedia.org/wiki/Lista_de_bairros_de_Vit%C3%B3ria

In [0]:
neighnames = {"CENTRO": 1, "DO MOSCOSO": 1, "FONTE GRANDE":1, "ILHA DO PRÍNCIPE":1, "PARQUE MOSCOSO":1, "PIEDADE":1,"SANTA CLARA":1, "VILA RUBIM":1,"ARIOVALDO FAVALESSA":2, "BELA VISTA":2, "CARATOÍRA":2, "DO CABRAL":2, "DO QUADRO":2, "ESTRELINHA":2,"GRANDE VITÓRIA": 2, "INHANGUETÁ":2, "MÁRIO CYPRESTE":2, "SANTO ANTÔNIO":2, "SANTA TEREZA":2, "UNIVERSITÁRIO":2,"BENTO FERREIRA":3, "CONSOLAÇÃO":3,"CRUZAMENTO":3, "DE LOURDES":3, "FORTE SÃO JOÃO":3, "FRADINHOS":3, "GURIGICA":3, "HORTO":3, "ILHA DE SANTA MARIA":3, "JESUS DE NAZARETH":3, "JUCUTUQUARA":3, "MONTE BELO":3, "NAZARETH":3, "ROMÃO":3,"ANDORINHAS":4, "BONFIM":4, "DA PENHA":4,"ITARARÉ":4, "JOANA D´ARC":4, "MARUÍPE":4, "SANTA CECÍLIA":4, "SANTA MARTHA":4, "SANTOS DUMONT":4,"SÃO BENEDITO":4, "SÃO CRISTÓVÃO":4, "TABUAZEIRO":4,"BARRO VERMELHO":5, "ENSEADA DO SUÁ":5, "ILHA DO BOI":5, "ILHA DO FRADE":5, "PRAIA DO CANTO":5, "PRAIA DO SUÁ":5, "SANTA HELENA":5, "SANTA LÚCIA":5, "SANTA LUÍZA":5,"COMDUSA":7,"CONQUISTA":7, "ILHA DAS CAIEIRAS":7,"NOVA PALESTINA":7, "REDENÇÃO":7,  "RESISTÊNCIA":7, "SANTO ANDRÉ":7, "SANTOS REIS":7, "SÃO JOSÉ":7, "SÃO PEDRO":7, "AEROPORTO":6, "ANTÔNIO HONÓRIO":6, "GOIABEIRAS":6, "JABOUR":6, "MARIA ORTIZ":6, "SEGURANÇA DO LAR":6, "SOLON BORGES":6,"JARDIM CAMBURI": 8, "PARQUE INDUSTRIAL":8, "BOA VISTA": 9, "JARDIM DA PENHA":9, "MATA DA PRAIA":9, "MORADA DE CAMBURI":9, "PONTAL DE CAMBURI":9,"REPÚBLICA":9,"ILHAS OCEÂNICAS DE TRINDADE":10}

In [0]:
len(neighnames)

In [0]:
# Neighborhoods present in the data that have no administrative region assigned yet.
missing = [term for term in unineigh if term not in neighnames]
missing

In [0]:
ns['AdminRegion'] = ns['Neighborhood'].map(neighnames)

In [0]:
#group to plot no-shows by Admin Region
ar = pd.crosstab(ns['AdminRegion'], ns['No-show'], normalize='index')
stacked = ar.unstack().reset_index().rename(columns={0:'value'})

In [0]:
# Share of shows / no-shows within each administrative region.
plt.subplot(212)
ax2 = sb.barplot(x=stacked.AdminRegion, y=stacked.value, hue=stacked['No-show'])
ax2.set_title("No Show by Administrative Region", fontsize=22)
ax2.set_xlabel("Admistrative Regions", fontsize=18)
# The crosstab is normalised per row, so the bars are proportions, not counts.
ax2.set_ylabel("Proportion", fontsize=18)
# 'out' is not a valid legend location; 'best' lets matplotlib choose the placement.
ax2.legend(loc='best')

In [0]:
# TODO: grouping no-shows by month is still to be done.
# NOTE(review): as written this cell only recomputes the row-normalised
# AdminRegion x No-show crosstab and its long-format version.
ar = pd.crosstab(ns['AdminRegion'], ns['No-show'], normalize='index')
stacked = ar.unstack().reset_index().rename(columns={0:'value'})

In [0]:
# TODO: grouping no-shows by month is still to be done; for now this cell
# re-plots the AdminRegion breakdown.
ar = pd.crosstab(ns['AdminRegion'], ns['No-show'], normalize='index')
stacked = ar.unstack().reset_index().rename(columns={0:'value'})
plt.subplot(212)
ax2 = sb.barplot(x=stacked.AdminRegion, y=stacked.value, hue=stacked['No-show'])
ax2.set_title("No Show by Administrative Region", fontsize=22)
ax2.set_xlabel("Admistrative Regions", fontsize=18)
# The crosstab is normalised per row, so the bars are proportions, not counts.
ax2.set_ylabel("Proportion", fontsize=18)
# 'out' is not a valid legend location; 'best' lets matplotlib choose the placement.
ax2.legend(loc='best')

In [0]:
# TODO: grouping no-shows by hour is still to be done; for now this cell
# re-plots the AdminRegion breakdown.
ar = pd.crosstab(ns['AdminRegion'], ns['No-show'], normalize='index')
stacked = ar.unstack().reset_index().rename(columns={0:'value'})
plt.subplot(212)
ax2 = sb.barplot(x=stacked.AdminRegion, y=stacked.value, hue=stacked['No-show'])
ax2.set_title("No Show by Administrative Region", fontsize=22)
ax2.set_xlabel("Admistrative Regions", fontsize=18)
# The crosstab is normalised per row, so the bars are proportions, not counts.
ax2.set_ylabel("Proportion", fontsize=18)
# 'out' is not a valid legend location; 'best' lets matplotlib choose the placement.
ax2.legend(loc='best')

In [0]:
#group to plot no-shows by day of week
# The '_weekday_AppointmentDate' column is only added to `ns` in a later cell,
# so derive the weekday here to keep the notebook runnable top to bottom. The
# series is named like that column so `stacked` keeps the same layout.
appointment_weekday = pd.to_datetime(ns['AppointmentDate']).dt.weekday.rename('_weekday_AppointmentDate')
wda = pd.crosstab(appointment_weekday, ns['No-show'], normalize='index')
stacked = wda.unstack().reset_index().rename(columns={0:'value'})
plt.subplot(212)
ax2 = sb.barplot(x=stacked._weekday_AppointmentDate, y=stacked.value, hue=stacked['No-show'])
ax2.set_title("No Show by weekday appointment day ", fontsize=22)
ax2.set_xlabel("Weekday of appointment", fontsize=18)
# The crosstab is normalised per row, so the bars are proportions, not counts.
ax2.set_ylabel("Proportion", fontsize=18)
# 'out' is not a valid legend location; 'best' lets matplotlib choose the placement.
ax2.legend(loc='best')

In [0]:
#group to plot no-shows by month appt date
# The '_month_AppointmentDate' column is only added to `ns` in a later cell,
# so derive the month here to keep the notebook runnable top to bottom. The
# series is named like that column so `stacked` keeps the same layout.
appointment_month = pd.to_datetime(ns['AppointmentDate']).dt.month.rename('_month_AppointmentDate')
mad = pd.crosstab(appointment_month, ns['No-show'], normalize='index')
stacked = mad.unstack().reset_index().rename(columns={0:'value'})
plt.subplot(212)
ax2 = sb.barplot(x=stacked._month_AppointmentDate, y=stacked.value, hue=stacked['No-show'])
ax2.set_title("No Show by month of appointment", fontsize=22)
ax2.set_xlabel("Month of appointment", fontsize=18)
# The crosstab is normalised per row, so the bars are proportions, not counts.
ax2.set_ylabel("Proportion", fontsize=18)
# 'out' is not a valid legend location; 'best' lets matplotlib choose the placement.
ax2.legend(loc='best')


In [0]:
#group to plot no-shows by wait
# Whole days between scheduling and the appointment.
ns['Wait']= (ns['AppointmentDate'] - ns['ScheduledDate']).dt.days
wt = pd.crosstab(ns['Wait'], ns['No-show'], normalize='index')
stacked = wt.unstack().reset_index().rename(columns={0:'value'})
plt.subplot(212)
ax2 = sb.barplot(x=stacked.Wait, y=stacked.value, hue=stacked['No-show'])
ax2.set_title("No Show by wait time to appointment (days)", fontsize=22)
ax2.set_xlabel("Wait time to appointment (days)", fontsize=18)
# The crosstab is normalised per row, so the bars are proportions, not counts.
ax2.set_ylabel("Proportion", fontsize=18)
# 'out' is not a valid legend location; 'best' lets matplotlib choose the placement.
ax2.legend(loc='best')

In [0]:
# `nscopy` is only created further down, so preview the working frame `ns`
# here; referencing `nscopy` raised a NameError on a fresh kernel.
ns.head()

In [0]:
#calculating wait
ns1=ns.copy()
ns1['Wait'] = (ns['AppointmentDate'] - ns['ScheduledDate']).dt.days

In [0]:
ns1.head()

In [0]:
ns1.Wait.describe()

In [0]:
ns['Wait'] = (ns['AppointmentDate'] - ns['ScheduledDate']).dt.days

In [0]:
Scheduled = (ns['ScheduledDate'].dt.date.max() - ns['ScheduledDate'].dt.date.min()).days
Appointment = (ns['AppointmentDate'].dt.date.max() - ns['AppointmentDate'].dt.date.min()).days
Wait_range = ns['Wait'].max() - ns['Wait'].min() 
print(f"Schedule range: {Scheduled} days. \n\
        First: {ns['ScheduledDate'].dt.date.min()} \n\
        Last:  {ns['ScheduledDate'].dt.date.max()} \n")
print("*"*50, "\n")
print(f"Appointment range {Appointment} days. \n\
        First: {ns['AppointmentDate'].dt.date.min()} \n\
        Last:  {ns['AppointmentDate'].dt.date.max()}")
print("*"*50, "\n")
print(f"Wait time range: {Wait_range} days. \n\
        Min: {ns['Wait'].min()} \n\
        Max: {ns['Wait'].max()}")

In [0]:
enable_plotly_in_cell()
group_temp = ns[(ns['Wait'] < 70) & (ns['Wait'] >= -1)].groupby(['Wait', 'No-show'])['PatientId'].count() \
                        / ns[(ns['Wait'] < 70) & (ns['Wait'] >= -1)].groupby(['Wait'])['PatientId'].count() 
fig = group_temp.unstack().iplot(kind='bar', barmode='stack', asFigure=True,
                           title='No-shows according to wait (days) for appointment',
                           xTitle='Wait for Appointment (days)', yTitle='No-show %')
fig.layout.xaxis.type = 'category'
iplot(fig)

In [0]:
plt.figure(figsize=(14,6))
sb.countplot(x='Wait', hue='No-show', data=ns[(ns['Wait'] < 10) & (ns['Wait'] >= -1)]) 

A wait time of -1 most likely corresponds to same-day walk-in treatment, which would explain why there are almost no no-shows for it.

In [0]:
# Keep waits in [-1, 100] days. The .copy() makes `ns` an independent frame so
# the column assignments in later cells do not write into a slice of the old one.
ns = ns[ns['Wait'].between(-1, 100)].copy()

In [0]:
enable_plotly_in_cell()
# Stacked bar chart of show / no-show counts per calendar day of ScheduledDate.
# NOTE(review): the title says "appointment dates" but the grouping key is
# ScheduledDate — confirm which of the two is intended.
ns.groupby([ns.ScheduledDate.dt.date,'No-show'])['PatientId'].count().unstack().fillna(0).iplot(kind='bar',barmode='stack',title='No-shows by appointment dates', xTitle='Dates', yTitle='Count')


In [0]:
frequentFlyers = ns.groupby(['PatientId'])['AppointmentId'].count().sort_values(ascending=False).head(10) 
ns[ns.PatientId.isin(frequentFlyers.index)]['No-show'].value_counts(normalize=True).plot(kind='bar')

In [0]:
ns.Gender = ns['Gender'].map({"F":0, "M":1})
ns['No-show'] = ns['No-show'].map({"No":0, "Yes":1})

In [0]:
df_dummies = pd.get_dummies(ns['Handicap'], drop_first=True,prefix=(str('Handicap')))
ns = pd.concat([ns, df_dummies], axis=1)
ns.drop('Handicap', axis=1, inplace=True)

In [0]:
ns.head()

In [0]:
# Calendar features derived from the scheduling timestamp.
ns['ScheduledDate'] = pd.to_datetime(ns['ScheduledDate'])
scheduled = ns['ScheduledDate'].dt
# .dt.weekday_name was removed from pandas; .dt.day_name() is its replacement.
ns['_weekdayName_ScheduledDate'] = scheduled.day_name()
ns['_weekday_ScheduledDate'] = scheduled.weekday
ns['_day_ScheduledDate'] = scheduled.day
ns['_month_ScheduledDate'] = scheduled.month
ns['_hour_ScheduledDate'] = scheduled.hour
ns['_minute_ScheduledDate'] = scheduled.minute

# df[col] = df[col].dt.date.astype('datetime64[ns]')

In [0]:
# Calendar features derived from the appointment timestamp.
ns['AppointmentDate'] = pd.to_datetime(ns['AppointmentDate'])
appointment = ns['AppointmentDate'].dt
# .dt.weekday_name was removed from pandas; .dt.day_name() is its replacement.
ns['_weekdayName_AppointmentDate'] = appointment.day_name()
ns['_weekday_AppointmentDate'] = appointment.weekday
ns['_day_AppointmentDate'] = appointment.day
ns['_month_AppointmentDate'] = appointment.month

# df[col] = df[col].dt.date.astype('datetime64[ns]')

In [0]:
ns.head()

In [0]:
nscopy = ns.copy()

In [0]:
# Remove the identifier, age-group, raw timestamp and weekday-name columns
# from the modelling copy.
non_feature_columns = ['_weekdayName_AppointmentDate', 'AppointmentId', 'PatientId',
                       'AgeGroup', 'ScheduledDate', 'AppointmentDate',
                       '_weekdayName_ScheduledDate']
nscopy.drop(non_feature_columns, axis=1, inplace=True)

In [0]:
nscopy.drop(['Neighborhood'], axis=1, inplace=True)

In [0]:
nscopy.dtypes

In [0]:
nscopy.astype(float).corr()['No-show'].sort_values(ascending=False).head(10)

In [0]:
plt.figure(figsize=(20,15))
plt.title('Feature correlation')
sb.heatmap(nscopy.astype(float).corr(), vmax=1.0 )
plt.show()

In [0]:
y_train = nscopy['No-show']
X_train = nscopy.drop('No-show', axis=1)

In [0]:
from sklearn.model_selection import train_test_split
# Split straight from `nscopy` so re-running this cell does not split the
# already-reduced training set again, and fix the seed (same value as the
# `seed` used for cross-validation below) so the hold-out set is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    nscopy.drop('No-show', axis=1), nscopy['No-show'],
    test_size=.25, random_state=3)

In [0]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, VotingClassifier, RandomTreesEmbedding

In [0]:
# Cross-validated accuracy comparison of several classifiers, each wrapped in
# a StandardScaler pipeline.
seed = 3
scoring = 'accuracy'
n_folds = 10

# (display name, pipeline step name, estimator)
candidates = [
    ("LogReg", "LogReg", LogisticRegression()),
    ("XGBClassifier", "XGB", XGBClassifier()),
    ("KNN", "KNN", KNeighborsClassifier()),
    ("DecisionTreeClassifier", "DecisionTrees", DecisionTreeClassifier()),
    ("RandomForestClassifier", "RandomForest", RandomForestClassifier()),
    ("GradientBoostingClassifier", "GradientBoosting",
     GradientBoostingClassifier(max_features=15, n_estimators=600)),
    ("RidgeClassifier", "RidgeClassifier", RidgeClassifier()),
    ("BaggingRidgeClassifier", "BaggingClassifier", BaggingClassifier()),
    ("ExtraTreesClassifier", "ExtraTrees", ExtraTreesClassifier()),
]
clfs = [(name, Pipeline([("Scaler", StandardScaler()), (step_name, estimator)]))
        for name, step_name, estimator in candidates]

# random_state only has an effect when shuffle=True; recent scikit-learn
# versions raise a ValueError if it is set without shuffling.
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

results, names = [], []
for name, model in clfs:
    cv_results = cross_val_score(model, X_train, y_train,
                                 cv=kfold, scoring=scoring, n_jobs=-1)
    names.append(name)
    results.append(cv_results)
    msg = "%s: %f (+/- %f)" % (name, cv_results.mean(),
                               cv_results.std())
    print(msg)

fig = plt.figure(figsize=(15,6))
fig.suptitle('Classifier Algorithm Comparison', fontsize=22)
ax = fig.add_subplot(111)
sb.boxplot(x=names, y=results)
ax.set_xticklabels(names)
ax.set_xlabel("Algorithmn", fontsize=20)
ax.set_ylabel("Accuracy of Models", fontsize=18)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()

In [0]:
import scipy as sp 
from sklearn.model_selection import RandomizedSearchCV
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial
from sklearn.metrics import confusion_matrix

In [0]:
from sklearn.model_selection import StratifiedKFold

def objective(params):
    """Hyperopt objective: negated mean cross-validated accuracy of an XGBClassifier.

    Parameters
    ----------
    params : dict
        One sample from the search space. Reads 'max_depth', 'gamma',
        'reg_alpha', 'learning_rate', 'colsample_bytree' and, when present,
        'reg_lambda'.

    Returns
    -------
    float
        ``-accuracy``; hyperopt minimises, so accuracy is negated.
    """
    # Keep the hyperparameters numeric (rounded to 3 decimals) instead of
    # formatting them into strings.
    float_keys = ['gamma', 'reg_alpha', 'learning_rate', 'colsample_bytree']
    # reg_lambda is part of the search space and ends up in `best`, so it must
    # be evaluated here too rather than silently dropped.
    if 'reg_lambda' in params:
        float_keys.append('reg_lambda')
    tuned = {key: round(float(params[key]), 3) for key in float_keys}
    tuned['max_depth'] = int(params['max_depth'])

    clf = XGBClassifier(
        n_estimators=600,
        n_jobs=-1,
        **tuned
    )

    score = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=StratifiedKFold()).mean()
    # Report the accuracy itself; only the returned value is negated.
    print("Accuracy {:.8f} params {}".format(score, tuned))
    return -score

space = {
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.7, 1.0),
    'learning_rate': hp.uniform('learning_rate', 0.05, 0.2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 0.5),
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50)

In [0]:
best['max_depth'] = int(best['max_depth'])
print("BEST PARAMS: ", best)

In [0]:
clf = XGBClassifier(n_estimators=5000,n_jobs=-1,**best)

In [0]:
clf.fit(X_train, y_train)

In [0]:
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import accuracy_score
pred = clf.predict(X_val)

In [0]:
print(f'Accuracy of Classifier with best Hyper Parameeters: {round(accuracy_score(y_val, pred, normalize=True),4)}')

In [0]:
# unique() returns values in order of first appearance; sort them so that
# position i holds the name of label i, which is how the plotting helper
# indexes this array.
class_names = np.sort(nscopy['No-show'].unique())
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels. The labels are used as positions into
        ``classes``, so they must be valid integer indices for it.
    classes : array-like
        Display names, indexed by label value. Lists are accepted as well as
        arrays.
    normalize : bool, default False
        If True, divide every row by its sum so cells show per-true-class rates.
    title : str, optional
        Plot title; a default is chosen from ``normalize`` when omitted.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data. np.asarray lets callers pass
    # a plain list, which cannot be indexed with an array of labels.
    classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix: ")
    else:
        print('Confusion matrix, without normalization: ')

    print(cm)
    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # Show all ticks and label them with the respective class names.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')
    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    # Annotate every cell; use white text on cells above half the maximum value.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plot_confusion_matrix(y_val, pred, classes=class_names,
                      title='Confusion matrix, without normalization')
# Plot normalized confusion matrix
plot_confusion_matrix(y_val, pred, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')
plt.show()

In [0]:
sb.countplot(x='No-show', data=ns,palette='hls')

In [0]:
noshow.head()

In [0]:
# The columns of `noshow` were renamed near the top of the notebook, so the raw
# scheduling timestamp is 'ScheduledDate' ('ScheduledDay' no longer exists).
scheduled_ts = pd.to_datetime(noshow['ScheduledDate'])
noshow['scheduledDate'] = scheduled_ts.dt.date
noshow['scheduledTime'] = scheduled_ts.dt.time

In [0]:
noshow.head()

In [0]:
# The columns of `noshow` were renamed near the top of the notebook, so the raw
# appointment timestamp is 'AppointmentDate' ('AppointmentDay' no longer exists).
appointment_ts = pd.to_datetime(noshow['AppointmentDate'])
noshow['appointmentDate'] = appointment_ts.dt.date
noshow['appointmentTime'] = appointment_ts.dt.time

In [0]:
noshow.head()

In [0]:
noshow['scheduledDate'] = pd.to_datetime(noshow['scheduledDate'])
noshow['appointmentDate'] = pd.to_datetime(noshow['appointmentDate'])
noshow.dtypes

In [0]:
#noshow['scheduledTime'] = pd.to_datetime(noshow.scheduledTime, format='%H:%M:%S').dt.time[0]
#noshow['appointmentTime'] = pd.to_datetime(noshow.appointmentTime, format='%H:%M:%S').dt.time[0]
#noshow.dtypes

In [0]:
noshow['scheduledHour'] = pd.to_datetime(noshow['scheduledTime'], format='%H:%M:%S').dt.hour
noshow.head()

In [0]:
 noshow['appt_to_schedule_time'] = noshow['appointmentDate'] - noshow['scheduledDate']

In [0]:
# Month number of the appointment. The previous format string '%Y:%M:%D' was
# not a valid date format (%M is minutes, %D is not a strptime directive), so
# no explicit format is passed.
noshow['appointmentMonth'] = pd.to_datetime(noshow['appointmentDate']).dt.month
noshow.head()

In [0]:
bins= [0,14,24,64,115]
labels = ['Child','Youth','Adult','Senior']
noshow['AgeGroup'] = pd.cut(noshow['Age'], bins=bins, labels=labels, right=False)
noshow.head()

In [0]:
bins_timeofday= [0,6,12,18,24]
labels_timeofday = ['night','morning','afternoon','evening']
noshow['timeofday'] = pd.cut(noshow['scheduledHour'], bins=bins_timeofday, labels=labels_timeofday, right=False)
noshow.head()

In [0]:
# Left-closed bins [1,4), [4,7), [7,10), [10,13) map months 1-12 onto calendar
# quarters. The previous edges [0,3,5,9,12] put March in the second bin and
# left December outside every bin (NaN).
bins_timeofyear = [1, 4, 7, 10, 13]
labels_timeofyear = ['1Q','2Q','3Q','4Q']
noshow['appointmentQuarter'] = pd.cut(noshow['appointmentMonth'], bins=bins_timeofyear, labels=labels_timeofyear, right=False)
noshow.head()

In [0]:
obj_df = noshow.select_dtypes(include=['object']).copy()
obj_df.head()

In [0]:
obj_df[obj_df.isnull().any(axis=1)]

In [0]:
# The column is called 'Neighborhood' after the rename near the top of the notebook.
obj_df['Neighborhood'].unique()

In [0]:
# Drop the identifier columns. The appointment id is 'AppointmentId' after the
# rename near the top of the notebook ('AppointmentID' no longer exists).
noshow1 = noshow.drop(['PatientId','AppointmentId'], axis=1)
noshow1.head()

In [0]:
noshow1.isnull().sum()

In [0]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
gender_cat = noshow1['Gender']
gender_encoded = label_encoder.fit_transform(gender_cat)
gender_encoded[0:5]

In [0]:
gender_DF = pd.DataFrame(gender_encoded,columns=['male_gender'])
gender_DF.head()

In [0]:
noshow1['No-show'] = noshow1['No-show'].map({'Yes': 1, 'No': 0})

In [0]:
# Integer-encode the neighborhood names. The column is called 'Neighborhood'
# after the rename near the top of the notebook.
Neighbourhood_cat = noshow1['Neighborhood']
Neighbourhood_encoded = label_encoder.fit_transform(Neighbourhood_cat)
Neighbourhood_encoded[0:100]

In [0]:
colnames = ['JARDIM DA PENHA', 'MATA DA PRAIA', 'PONTAL DE CAMBURI',
       'REPÚBLICA', 'GOIABEIRAS', 'ANDORINHAS', 'CONQUISTA',
       'NOVA PALESTINA', 'DA PENHA', 'TABUAZEIRO', 'BENTO FERREIRA',
       'SÃO PEDRO', 'SANTA MARTHA', 'SÃO CRISTÓVÃO', 'MARUÍPE',
       'GRANDE VITÓRIA', 'SÃO BENEDITO', 'ILHA DAS CAIEIRAS',
       'SANTO ANDRÉ', 'SOLON BORGES', 'BONFIM', 'JARDIM CAMBURI',
       'MARIA ORTIZ', 'JABOUR', 'ANTÔNIO HONÓRIO', 'RESISTÊNCIA',
       'ILHA DE SANTA MARIA', 'JUCUTUQUARA', 'MONTE BELO',
       'MÁRIO CYPRESTE', 'SANTO ANTÔNIO', 'BELA VISTA', 'PRAIA DO SUÁ',
       'SANTA HELENA', 'ITARARÉ', 'INHANGUETÁ', 'UNIVERSITÁRIO',
       'SÃO JOSÉ', 'REDENÇÃO', 'SANTA CLARA', 'CENTRO', 'PARQUE MOSCOSO',
       'DO MOSCOSO', 'SANTOS DUMONT', 'CARATOÍRA', 'ARIOVALDO FAVALESSA',
       'ILHA DO FRADE', 'GURIGICA', 'JOANA D´ARC', 'CONSOLAÇÃO',
       'PRAIA DO CANTO', 'BOA VISTA', 'MORADA DE CAMBURI', 'SANTA LUÍZA',
       'SANTA LÚCIA', 'BARRO VERMELHO', 'ESTRELINHA', 'FORTE SÃO JOÃO',
       'FONTE GRANDE', 'ENSEADA DO SUÁ', 'SANTOS REIS', 'PIEDADE',
       'JESUS DE NAZARETH', 'SANTA TEREZA', 'CRUZAMENTO',
       'ILHA DO PRÍNCIPE', 'ROMÃO', 'COMDUSA', 'SANTA CECÍLIA',
       'VILA RUBIM', 'DE LOURDES', 'DO QUADRO', 'DO CABRAL', 'HORTO',
       'SEGURANÇA DO LAR', 'ILHA DO BOI', 'FRADINHOS', 'NAZARETH',
       'AEROPORTO', 'ILHAS OCEÂNICAS DE TRINDADE', 'PARQUE INDUSTRIAL']

In [0]:
from sklearn.preprocessing import OneHotEncoder
binary_encoder= OneHotEncoder(categories='auto')
Neighbourhood_1hot = binary_encoder.fit_transform(Neighbourhood_encoded.reshape(-1,1))
Neighbourhood_1hot_mat = Neighbourhood_1hot.toarray()
# LabelEncoder assigns codes in sorted label order, so one-hot column k belongs
# to label_encoder.classes_[k]. The hand-written `colnames` list is in a
# different order and therefore attached the wrong name to most columns.
# NOTE(review): this relies on `label_encoder` having been fitted last on the
# neighborhood column, as in the cell that builds Neighbourhood_encoded.
Neighbourhood_DF = pd.DataFrame(Neighbourhood_1hot_mat, columns=label_encoder.classes_)
Neighbourhood_DF.head()

In [0]:
# Drop the raw categorical columns that were encoded above. The neighborhood
# column is called 'Neighborhood' after the rename near the top of the notebook.
noshow1.drop(['Gender','Neighborhood'], axis=1, inplace=True)
noshow1.head()

In [0]:
noshow1_dmy = pd.concat([noshow1,gender_DF, Neighbourhood_DF], axis=1, verify_integrity=True)
noshow1_dmy[0:5]

In [0]:
noshow1_dmy.dtypes

In [0]:
sb.heatmap(noshow1_dmy.corr())

In [0]:
# Preview without the raw scheduling timestamp ('ScheduledDate' after the
# column rename); .head() avoids dumping the whole frame.
noshow1_dmy.drop('ScheduledDate',axis=1).head()

In [0]:
# Preview without the raw appointment timestamp ('AppointmentDate' after the
# column rename); .head() avoids dumping the whole frame.
noshow1_dmy.drop('AppointmentDate',axis=1).head()

In [0]:
# Remove the raw timestamp columns (named 'AppointmentDate' / 'ScheduledDate'
# after the column rename) and split into train and test sets.
# NOTE(review): any remaining non-numeric columns in `curta` will still need
# encoding or dropping before a model can be fitted — confirm against the
# dtypes shown above.
curt = noshow1_dmy.drop('AppointmentDate',axis=1)
curta = curt.drop('ScheduledDate',axis=1)
X_train, X_test, y_train, y_test = train_test_split(curta.drop('No-show',axis=1),noshow1_dmy['No-show'],test_size=0.2,random_state=200)

In [0]:
print(X_train.shape)
print(y_train.shape)

In [0]:
X_train[0:5]

In [0]:
LogReg = LogisticRegression(solver = 'liblinear')
LogReg.fit(X_train, y_train)

In [0]:
y_pred = LogReg.predict(X_test)

In [0]:
print(classification_report(y_test,y_pred))

In [0]:
y_train_pred = cross_val_predict(LogReg,X_train, y_train,cv=5)
confusion_matrix(y_train,y_train_pred)

In [0]:
precision_score(y_train, y_train_pred)

In [0]:
metrics.f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

In [0]:
noshow1_dmy[863:864]

In [0]:
# np.array() requires an object argument; start from an explicit empty array.
test_noshow = np.array([])

In [0]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [0]:
categorical_feature_mask = obj_df.dtypes==object
categorical_cols = obj_df.columns[categorical_feature_mask].tolist()

In [0]:
obj_df[categorical_cols] = obj_df[categorical_cols].apply(lambda col: le.fit_transform(col))
obj_df[categorical_cols].head(10)

In [0]:
from sklearn.preprocessing import OneHotEncoder 
onehotencoder = OneHotEncoder(handle_unknown='ignore') 
# Store the dense one-hot matrix under its own name: rebinding `obj_df` to a
# NumPy array broke the later cells that still use it as a DataFrame
# (.replace / .head).
obj_df_onehot = onehotencoder.fit_transform(obj_df).toarray()

In [0]:
X_dict = noshow.to_dict(orient='records') 
X_dict

In [0]:
from sklearn.feature_extraction import DictVectorizer
dv_X = DictVectorizer(sparse=False) 

In [0]:
# NOTE(review): `dv_X` is only fitted (fit_transform) in the next cell, so
# `vocabulary_` is presumably not available yet at this point on a fresh
# kernel — confirm and consider moving this cell after the fit.
vocab = dv_X.vocabulary_
vocab

In [0]:
X_encoded = dv_X.fit_transform(X_dict)
X_encoded

In [0]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False) 

In [0]:
X_ohe = ohe.fit_transform(obj_df) 

In [0]:
print(X_ohe)

In [0]:
# Per-column value mappings. The neighborhood -> administrative-region mapping
# reuses `neighnames` so there is a single source of truth; the hand-copied
# dict here had drifted from it (misspelled "IHLA DE SANTA MARIA" and
# "ADORINHAS", one region missing). The outcome key is spelled 'No-show' to
# match the actual column name.
categories = {
    "Gender": {"F": 0, "M": 1},
    "No-show": {"Yes": 1, "No": 0},
    "Neighborhood": dict(neighnames),
}
# Rebuild from `noshow` rather than mutating in place, so the cell works no
# matter what earlier cells did to `obj_df` and can be re-run safely.
obj_df = noshow.select_dtypes(include=['object']).replace(categories)
obj_df.head()

In [0]:
# Binary target: 1 for a no-show, 0 otherwise. Fixes the `noshow.[...]` syntax
# error. The column holds 'Yes'/'No', so the previous comparison against
# 'Male' could never match.
# NOTE(review): assumes the intent was to encode No-show rather than Gender — confirm.
noshow['No-show'] = (noshow['No-show'] == 'Yes').astype(int)