In [None]:
import plotly.express as px
import pandas as pd
import seaborn as sns
import numpy as np
import plotly
# Notebook-wide defaults for plotly express figures: theme and figure size.
px.defaults.template = 'seaborn'
px.defaults.width = 700
px.defaults.height = 500

In [None]:
# Load the appointment records; the path is relative to the working directory.
df = pd.read_csv('hospital_no_show.csv')

In [None]:
# Number of rows for each value of the 'No-show' column (class balance).
df['No-show'].value_counts()

In [None]:
# Ten randomly drawn rows (unseeded, so the sample changes on every run).
df.sample(10)

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# Donut chart of the share of rows per 'No-show' value.
px.pie(df, names='No-show', color='No-show', hole=0.3, title='Data Distribution')

In [None]:
# Same class balance as a bar-style histogram of row counts.
px.histogram(df, x='No-show', color='No-show', title='Number of data observations')

In [None]:
# Horizontal box plots of 'Age', one per 'No-show' value.
px.box(df, y='No-show', x='Age', color='No-show', title='Relationship with Age')

In [None]:
# Age histogram split by 'No-show'; barnorm='fraction' asks plotly to normalise
# the bars at each x position so they show shares rather than raw counts.
fig = px.histogram(df, color='No-show', x='Age', title='Relationship with Age', barnorm='fraction')
fig.show()

In [None]:
def percent_calc(x):
    """Return the percentage of rows in ``x`` falling under each 'No-show' value.

    ``x`` is a DataFrame (typically one group of a ``groupby``) that has a
    'No-show' column. The result is a Series indexed by the distinct 'No-show'
    values whose entries sum to 100.
    """
    outcome_counts = x['No-show'].value_counts()
    n_rows = outcome_counts.sum()
    return outcome_counts.mul(100).div(n_rows)

# Columns whose levels are compared against the 'No-show' outcome.
cols_categorical = ['Gender', 'Scholarship','Hipertension','Diabetes',
                    'Alcoholism','Handcap','SMS_received']
for col in cols_categorical:
    # percent_calc gives the 'No-show' percentages within each level of `col`.
    percent_by_level = df.groupby(col).apply(percent_calc)
    # Title each chart with the column actually plotted (it was previously
    # hard-coded to 'Gender' for every column).
    fig = px.bar(percent_by_level, title=f'Relationship with {col}')
    fig.update_yaxes(title_text='Percent')
    fig.show()

In [None]:
# Cross-tabulate neighbourhood against outcome, then turn each row into
# percentages of that neighbourhood's appointments.
df_loc = pd.crosstab(index=df['Neighbourhood'], columns=df['No-show'])
total = df_loc['No'].add(df_loc['Yes'])
for outcome_label in ('No', 'Yes'):
    df_loc[outcome_label] = df_loc[outcome_label] * 100 / total
# The last expression is the figure, so the notebook renders it.
px.bar(df_loc).update_yaxes(title_text='Percent')

In [None]:
# Raw appointment counts per neighbourhood and outcome.
df_loc = pd.crosstab(index=df['Neighbourhood'], columns=df['No-show'])
# Row totals are kept for parity with the percent cell; the chart does not use them.
total = df_loc.loc[:, 'No'] + df_loc.loc[:, 'Yes']
# The last expression is the figure, so the notebook renders it.
px.bar(df_loc).update_yaxes(title_text='Count')

In [None]:
# Re-read the same CSV that was loaded at the top of the notebook, rebinding
# `df` to a fresh frame before the derived date columns are added below.
df = pd.read_csv('hospital_no_show.csv')

In [None]:
# Parse both timestamp columns once and reuse them for every derived feature.
appointment_ts = pd.to_datetime(df['AppointmentDay'])
scheduled_ts = pd.to_datetime(df['ScheduledDay'])

# Whole days between scheduling and the appointment; negative gaps become 0.
date_delta = (appointment_ts - scheduled_ts).dt.days
filt = date_delta < 0
df['date_delta'] = date_delta.mask(filt, 0)

# Weekday and month names of the appointment.
df['day'] = appointment_ts.dt.day_name()
df['month'] = appointment_ts.dt.month_name()

def plot_bar_corr(x, percent=True):
    """Show a bar chart of 'No-show' outcomes for each value of column ``x``.

    Reads the module-level ``df``. With ``percent=True`` every row of the
    cross-tabulation is rescaled so 'No' and 'Yes' are percentages of that
    row's total; otherwise raw counts are plotted. Displays the figure and
    returns nothing.
    """
    outcome_table = pd.crosstab(df[x], df['No-show'])
    y_label = 'Count'
    if percent == True:
        row_total = outcome_table['No'] + outcome_table['Yes']
        for outcome in ('No', 'Yes'):
            outcome_table[outcome] = outcome_table[outcome] * 100 / row_total
        y_label = 'Percent'
    px.bar(outcome_table).update_yaxes(title_text=y_label).show()

# No-show split by days between scheduling and appointment: percent, then counts.
plot_bar_corr('date_delta')
plot_bar_corr('date_delta', percent=False)

In [None]:
# No-show split by appointment weekday: percent, then counts.
plot_bar_corr('day')
plot_bar_corr('day', percent=False)

In [None]:
# No-show split by appointment month: percent, then counts.
plot_bar_corr('month')
plot_bar_corr('month', percent=False)

In [None]:
def occurance_duplicates(df, idx=None, order=None):
    """Add per-group visit counters and the prior no-show rate to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'No-show' column holding 'No'/'Yes' (or already 0/1)
        plus the columns named by ``idx`` and ``order``. It is not modified.
    idx : str or list of str
        Column(s) identifying a group (e.g. a patient id).
    order : str or list of str
        Column(s) the rows are sorted by before counting.

    Returns
    -------
    pandas.DataFrame
        A sorted copy with a fresh 0..n-1 index where
        * 'No-show' is numeric (No -> 0, Yes -> 1),
        * 'occurance' is the 0-based position of the row within its group,
        * 'cumsum' is the share of the group's *earlier* rows that were
          no-shows (NaN for the first row of a group, which has no history).
    """
    # A stable sort keeps rows that tie on `order` in their original relative
    # order, so repeated runs give the same result.
    df_copy = df.sort_values(by=order, kind='stable').reset_index(drop=True)

    # Assign the result back instead of a chained in-place replace, which is
    # not guaranteed to update the frame under pandas copy-on-write.
    df_copy['No-show'] = pd.to_numeric(
        df_copy['No-show'].replace({'No': 0, 'Yes': 1})
    )

    # Vectorised group-wise counters; unlike groupby.apply with a mutating
    # function this keeps the index and the grouping column(s) untouched.
    no_show_by_group = df_copy.groupby(idx, sort=False)['No-show']
    visit_number = no_show_by_group.cumcount()
    prior_no_shows = no_show_by_group.cumsum() - df_copy['No-show']

    df_copy['occurance'] = visit_number
    # Earlier no-shows divided by earlier visits; undefined when there are none.
    df_copy['cumsum'] = (prior_no_shows / visit_number).where(visit_number > 0)
    return df_copy
# Parsed appointment timestamp, used as the sort key for each patient's history.
df['appt_date'] = pd.to_datetime(df.AppointmentDay)
# One row per appointment with per-patient 'occurance' and 'cumsum' columns added.
df_occurance = occurance_duplicates(df, idx='PatientId', order='appt_date')

In [None]:
# Map the numeric outcome back to 'No'/'Yes' so the plots below use the same
# labels as the earlier charts. This rebinds a column of df_occurance in place.
df_occurance['No-show'] = df_occurance['No-show'].replace({0: 'No', 1:'Yes'})
def plot_bar_corr(df, x, percent=True):
    """Show a bar chart of 'No-show' outcomes for each value of ``df[x]``.

    NOTE: this redefinition shadows the earlier ``plot_bar_corr(x, percent)``
    in this notebook, which read the module-level ``df``. Once this cell has
    run, calls using the old two-argument form no longer work.

    With ``percent=True`` the 'No' and 'Yes' columns of the cross-tabulation
    are rescaled to percentages of each row's total; otherwise raw counts are
    plotted. Displays the figure and returns nothing.
    """
    counts = pd.crosstab(df[x], df['No-show'])
    if percent != True:
        # Count mode: plot the table as it is.
        px.bar(counts).update_yaxes(title_text='Count').show()
        return
    row_total = counts['No'] + counts['Yes']
    counts['No'] = counts['No'] * 100 / row_total
    counts['Yes'] = counts['Yes'] * 100 / row_total
    px.bar(counts).update_yaxes(title_text='Percent').show()

# No-show split by how many earlier appointments the patient had: percent, then counts.
plot_bar_corr(df_occurance,'occurance')
plot_bar_corr(df_occurance,'occurance', percent=False)

In [None]:
# Keep rows whose prior no-show rate is strictly between 0 and 1; rows with
# NaN (no earlier appointments) fail both comparisons and are dropped too.
filt = (df_occurance['cumsum'] < 1) & (df_occurance['cumsum'] > 0)
px.box(df_occurance.loc[filt,:], x='No-show', y='cumsum', color='No-show', title='Relationship with past probability')

In [None]:
# Start from the per-appointment frame without identifiers and the helper date.
df_model = df_occurance.drop(
    columns=['PatientId', 'AppointmentID', 'Neighbourhood', 'appt_date']
)

# Parse the two timestamp columns once for all derived features.
appointment_ts = pd.to_datetime(df_model['AppointmentDay'])
scheduled_ts = pd.to_datetime(df_model['ScheduledDay'])

# Day of the month of the appointment.
df_model['date'] = appointment_ts.dt.day

# Whole days between scheduling and the appointment; negative gaps become 0.
raw_delta = (appointment_ts - scheduled_ts).dt.days
filt = raw_delta < 0
df_model['date_delta'] = raw_delta.mask(filt, 0)

# Weekday name of the appointment.
df_model['day'] = appointment_ts.dt.day_name()

# The raw timestamp strings are no longer needed once features are derived.
df_model = df_model.drop(columns=['ScheduledDay', 'AppointmentDay'])

# Target as 0/1; everything left in df_model is a feature.
y = df_model.pop('No-show').replace({'No': 0, 'Yes': 1})
X = df_model.copy()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
#%% create dataflow and model pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report

# Column groups and the preprocessing each one receives.
oh_columns = ['Gender', 'day', 'month']       # one-hot encoded
imp_columns = ['cumsum']                      # mean-imputed, then scaled
num_columns = ['Age', 'occurance', 'date', 'date_delta', 'Handcap']  # scaled

# 'cumsum' is NaN for a patient's first appointment, so impute before scaling.
imp_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('impute_scale', StandardScaler())
])

# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was later removed; support both.
try:
    one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Columns not listed above are passed through unchanged.
ct = ColumnTransformer([('imputer', imp_transformer, imp_columns),
                        ('ohe', one_hot, oh_columns),
                        ('scale', StandardScaler(), num_columns)], remainder='passthrough')

# Seed the forest so that scores and importances are reproducible across runs.
pipeline = Pipeline([('column_transform', ct),
                     ('model', ExtraTreesClassifier(random_state=42))])

pipeline.fit(X_train, y_train)

# predict() already returns class labels, so no extra thresholding is applied.
print(pipeline.score(X_train, y_train))
print(classification_report(y_train, pipeline.predict(X_train)))

print(pipeline.score(X_test, y_test))
print(classification_report(y_test, pipeline.predict(X_test)))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
# Confusion matrix on the held-out test split. predict() already returns class
# labels, so they are used directly instead of being compared with 0.5.
cm = confusion_matrix(y_test, pipeline.predict(X_test))
sns.heatmap(cm, annot=True , fmt='d', cmap='Blues')
plt.title('Confusion matrix (test)')
plt.ylabel('actual')
plt.xlabel('prediction')

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
# Confusion matrix on the training split, for comparison with the test one.
# predict() already returns class labels, so they are used directly.
cm = confusion_matrix(y_train, pipeline.predict(X_train))
sns.heatmap(cm, annot=True , fmt='d', cmap='Blues')
plt.title('Confusion matrix (train)')
plt.ylabel('actual')
plt.xlabel('prediction')

In [None]:
# NOTE(review): this grid is not read by any visible code; plot_roc below binds
# its own local `thresholds` from roc_curve. Candidate for removal.
thresholds = np.linspace(0.1, 0.9, 20)
from sklearn.metrics import roc_curve
from plotly import graph_objects as go
from sklearn.metrics import classification_report

def plot_roc(labels, predictions):
    """Plot the ROC curve together with the thresholds that produce it.

    Parameters
    ----------
    labels : array-like
        True binary labels, forwarded to ``sklearn.metrics.roc_curve``.
    predictions : array-like
        Scores for the positive class, forwarded to ``roc_curve``. Continuous
        scores (e.g. probabilities) give a full curve; hard 0/1 labels give
        only a few points.

    Two lines are drawn against the false-positive rate (in percent): the
    true-positive rate in red and the decision threshold in yellow. The figure
    is displayed and nothing is returned.
    """
    fp, tp, thresholds = roc_curve(labels, predictions)

    # roc_curve puts a sentinel threshold first that can be infinite; leave
    # non-finite values out of the threshold trace so the y-axis stays readable.
    finite = np.isfinite(thresholds)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=100*fp, y=100*tp, mode='lines',
                             name='True positive rate', line=dict(color='red')))
    fig.add_trace(go.Scatter(x=100*fp[finite], y=100*thresholds[finite], mode='lines',
                             name='Threshold', line=dict(color='yellow')))
    fig.update_xaxes(title_text='False Positive')
    fig.update_yaxes(title_text='True Positive')
    fig.show()
# A ROC curve needs continuous scores: use the predicted probability of the
# positive class (column 1) rather than hard 0/1 labels from predict().
plot_roc(y_test, pipeline.predict_proba(X_test)[:, 1])

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of every input column of the fitted pipeline,
# measured on the test split with 30 shuffles per column (seeded).
feature_importance_test = permutation_importance(pipeline, X_test, y_test,
                            n_repeats=30,
                            random_state=0)

In [None]:
# 'importances' is (n_features, n_repeats); transpose so every column of `fi`
# is one input feature and every row is one permutation repeat.
fi = pd.DataFrame(feature_importance_test['importances'].T, columns= list(X_train.columns))

# One box per feature. Passing fi.T here would instead draw one box per
# repeat, because boxplot treats each column of a 2-D input as a dataset.
fig, ax = plt.subplots()
ax.boxplot(fi.to_numpy())
ax.set_xticks(range(1, fi.shape[1] + 1))
ax.set_xticklabels(fi.columns, rotation=90)
ax.set_ylabel('Permutation importance')
plt.show()

In [None]:
# Interactive version of the importance box plot: one box per column of `fi`.
px.box(fi)

In [None]:
# Run only the fitted preprocessing step on the training features to inspect
# what the model actually receives. The result is wrapped in a DataFrame with
# default integer column labels (the transformed feature names are not attached).
transfored_X_train = pipeline['column_transform'].transform(X_train)
transfored_X_train = pd.DataFrame(transfored_X_train)
transfored_X_train.head()

In [None]:
# First few permutation repeats (rows) of the per-feature importance table.
fi.head()