# Episode table

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pprint
import missingno as msno
from helper_functions import open_table_list_columns, groupby_percent, groupby_plotsize, derive_episode_vars, km_byvar, derive_discharge_vars
import os

from lifelines import KaplanMeierFitter
from lifelines.utils import datetimes_to_durations

# Root directory of the data files, taken from the DATADIR environment variable.
# os.getenv returns None when the variable is not set.
DATADIR = os.getenv('DATADIR')

### Read in data and derive variables

In [None]:
# Load the 'Episode' table from DATADIR with the project helper
# (presumably returns a pandas DataFrame, since it is used as one below).
episode = open_table_list_columns(DATADIR, 'Episode')

Tier2 = structured treatment - regular 1-to-1 sessions  
Tier3 = unstructured - drop-in to group work  
Tier4 = signposting

In [None]:
# Preview the first rows of the episode table.
episode.head()

In [None]:
# Rebind `episode` to a copy of the output of the project helper derive_episode_vars.
# Presumably this is where the 'episode_duration' and 'start_date' columns used
# further down are created - TODO confirm against helper_functions.
episode = derive_episode_vars(episode).copy()

### Counts

In [None]:
# (rows, columns) of the episode table.
episode.shape

In [None]:
# Number of distinct Serial values (compare with the row count above).
episode.Serial.nunique()

In [None]:
# Number of rows whose EndDate is missing.
episode['EndDate'].isna().sum()

In [None]:
# Missing end dates as a fraction of the number of distinct Serial values
# (note: the denominator is distinct Serials, not the row count).
episode['EndDate'].isna().sum() / episode['Serial'].nunique()

Fewer than 10% of episodes have a missing end date (presumably these are still in ongoing treatment?)

### Tiers
- Tier2 = structured treatment - regular 1-to-1 sessions
- Tier3 = unstructured - drop-in to group work
- Tier4 = signposting

In [None]:
# Breakdown of the episode table by the Tier2 flag via the project helper
# (presumably counts/percentages of Serial per Tier2 value - confirm in helper_functions).
groupby_percent(episode, 'Tier2', 'Serial')

In [None]:
# Breakdown of the episode table by the Tier3 flag via the project helper
# (presumably counts/percentages of Serial per Tier3 value - confirm in helper_functions).
groupby_percent(episode, 'Tier3', 'Serial')

In [None]:
# Breakdown of the episode table by the Tier4 flag via the project helper
# (presumably counts/percentages of Serial per Tier4 value - confirm in helper_functions).
groupby_percent(episode, 'Tier4', 'Serial')

In [None]:
# Check how the tier flags are stored; the comparisons below treat them as
# the strings '0' and '1'.
episode.Tier2.dtype

What does it mean if all Tiers are 0?

In [None]:
# Episodes whose three tier flags are all '0', i.e. not assigned to any tier.
untiered = episode[
    (episode['Tier2'] == '0') & (episode['Tier3'] == '0') & (episode['Tier4'] == '0')
]
n_untiered = len(untiered)
print("{} ({}%) episodes are not classified into Tier system".format(
    n_untiered, n_untiered / episode.shape[0] * 100))

In [None]:
# Share of episodes flagged as both Tier2 and Tier3.
in_both_tiers = episode[(episode['Tier2'] == '1') & (episode['Tier3'] == '1')]
pct_both_tiers = len(in_both_tiers) / episode.shape[0] * 100
print("{}% of episodes are both Tier2 and Tier 3".format(pct_both_tiers))

So the Tier thing doesn't make much sense to me now...

### Episode duration

In [None]:
# Inspect the Python type of the duration stored under index label 0
# (the comparisons below assume a Timedelta-like value).
type(episode['episode_duration'][0])

In [None]:
# Episodes with a negative derived duration, i.e. the start date is after the end date.
negative_duration = episode[episode['episode_duration'] < pd.Timedelta(0)]
n_negative = len(negative_duration)
print("{} episodes have a start date after end date. This represents {}% of episodes".format(
    n_negative, n_negative / episode.shape[0] * 100))

537 episodes have a start date after the end date. These could be:  
1) removed from the data  
2) reversed, assuming a data-entry error  
3) manually checked and the errors determined  
4) checked with the source for data-entry errors


For now remove these episodes as they only represent 0.19% of data

In [None]:
# Remove only the episodes counted above: those whose start date is after the
# end date (negative duration). The filter is written as the complement of the
# `< 0` condition rather than `> 0`, because `> pd.Timedelta(0)` would also
# silently drop zero-length episodes and any row whose duration is missing
# (comparisons against NaT evaluate to False).
episode = episode[~(episode['episode_duration'] < pd.Timedelta(0))]

In [None]:
# Summary statistics of the remaining episode durations.
episode['episode_duration'].describe()

In [None]:
# Histogram of episode duration expressed in days
# (dividing by a one-day Timedelta converts the durations to numbers of days).
(episode['episode_duration'] / pd.Timedelta(days=1)).hist(bins =100)

In [None]:
# Number of episodes longer than 520 weeks (roughly ten years).
len(episode[episode['episode_duration'] > pd.Timedelta(weeks=520)])

In [None]:
# Same duration histogram as above; `episode` has not been modified in between.
# NOTE(review): possibly meant to be re-plotted without the very long episodes - confirm.
(episode['episode_duration'] / pd.Timedelta(days=1)).hist(bins =100)

In [None]:
# Episodes whose start_date is strictly after the hard-coded cutoff 2015-01-01.
last3years = episode[episode['start_date'] > '2015-01-01']

In [None]:
# (rows, columns) of the post-2015 subset.
last3years.shape

In [None]:
# Percentage of post-2015 episodes flagged as both Tier2 and Tier3.
recent_in_both_tiers = last3years[(last3years['Tier2'] == '1') & (last3years['Tier3'] == '1')]
len(recent_in_both_tiers) / last3years.shape[0] * 100

### read in data with referral, discharge, client and primary substance info

In [None]:
# Load the gzip-compressed "rich_episode_level" table from DATADIR with the project helper.
rich_episode_level = open_table_list_columns(DATADIR, "rich_episode_level", compression='gzip')

In [None]:
# Expose 'Serial_x' under the plain name 'Serial'.
# NOTE(review): the '_x' suffix looks like a leftover from a pandas merge - confirm.
rich_episode_level['Serial'] = rich_episode_level['Serial_x']

In [None]:
# Apply the same project helper that was used on the plain episode table and keep a copy.
# Presumably adds 'episode_duration', 'start_date' and 'end_date', which the
# survival cells below rely on - TODO confirm.
rich_episode_level = derive_episode_vars(rich_episode_level).copy()

In [None]:
# Add discharge-related variables via the project helper.
# Presumably this creates 'collapsed_dreason' and 'df_reason' used below - TODO confirm.
rich_episode_level = derive_discharge_vars(rich_episode_level, DATADIR)

### Survival analyses

In [None]:
# Keep only episodes with a strictly positive duration.
# NOTE(review): this also removes zero-length episodes and any row whose
# duration is missing; if ongoing episodes have a missing duration they are
# dropped here instead of being treated as censored - confirm this is intended.
has_positive_duration = rich_episode_level['episode_duration'] > pd.Timedelta(0)
rich_episode_level = rich_episode_level[has_positive_duration].copy()

# Durations (T) and event-observed flags (E) derived from start and end dates.
T, E = datetimes_to_durations(
    rich_episode_level.start_date,
    rich_episode_level.end_date,
)

# Kaplan-Meier estimate over all episodes.
kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)

print("The median days in an episode, which defines the day when on average 1/2 of episodes have ended={}".format(kmf.median_))
kmf.plot()

In [None]:
# Recompute durations (T) and event-observed flags (E) from the start/end dates.
T, E = datetimes_to_durations(rich_episode_level.start_date, rich_episode_level.end_date)

# Fresh fitter fitted on all episodes; later cells re-fit this same `kmf` object on subsets.
kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)

# Kaplan-Meier curves split by the IsYP column (project helper), x-axis limited to 0-150.
km_byvar(rich_episode_level, "IsYP", T, E, xlim=(0, 150))

In [None]:
# Kaplan-Meier curves split by whether the main drug is injected (project helper).
km_byvar(rich_episode_level, "IsMainDrugInjected", T, E, xlim=(0, 400))

# Boolean mask of episodes flagged '1'; T and E are in the same row order as the frame.
indicator = (rich_episode_level["IsMainDrugInjected"] == "1")

# Print the survival estimates around day 400 for each group.
# The survival function is indexed by the observed timeline, so the window is
# selected by label (days 398-404 inclusive) rather than by row position:
# positional slicing only lines up with days when every single day up to 404
# occurs in the subgroup, and could differ between the two groups.
kmf.fit(T[indicator], event_observed=E[indicator])
print(kmf.survival_function_.loc[398:404])

kmf.fit(T[~indicator], event_observed=E[~indicator])
print(kmf.survival_function_.loc[398:404])

In [None]:
# Kaplan-Meier curves split by the IsHR column (project helper), x-axis limited to 0-400.
km_byvar(rich_episode_level, "IsHR", T, E, xlim=(0, 400))

In [None]:
# Kaplan-Meier curves split by the IsCO column (project helper), x-axis limited to 0-400.
km_byvar(rich_episode_level, "IsCO", T, E, xlim=(0, 400))

In [None]:
# One Kaplan-Meier panel per collapsed discharge reason.
reasons = rich_episode_level['collapsed_dreason'].unique()

# Missing reasons come back from .unique() as non-string values; drop them up
# front so they neither leave an empty panel nor suppress the y-axis label
# (which is attached to the first panel that is actually drawn).
labelled_reasons = [reason for reason in reasons if isinstance(reason, str)]

# Four panels per row. At least four rows keeps the original 4x4 layout; more
# rows are added when there are more than 16 reasons, instead of failing on an
# out-of-range subplot index.
n_cols = 4
n_rows = max(4, -(-len(labelled_reasons) // n_cols))  # ceiling division

fig = plt.figure(figsize=(20, 5 * n_rows))
for panel, reason in enumerate(labelled_reasons):
    ax = fig.add_subplot(n_rows, n_cols, panel + 1)
    ix = rich_episode_level['collapsed_dreason'] == reason
    # `kmf` is the shared fitter; it is re-fitted on this reason's episodes.
    kmf.fit(T[ix], E[ix], label=reason)
    print('Median number of days in service for {}: {}'.format(reason, kmf.median_))
    kmf.plot(ax=ax, legend=False)
    ax.set_title(reason)
    ax.set_xlim(0, 1000)
    ax.annotate("median (days):{}".format(kmf.median_), xy=(400, 0.7))
    if panel == 0:
        ax.set_ylabel('Frac. in service after $n$ days')
fig.tight_layout()

In [None]:
# Median episode length (Kaplan-Meier) and episode count per collapsed discharge reason.
reasons = rich_episode_level['collapsed_dreason'].unique()

# Collect one record per reason and build the table once at the end.
# Growing a DataFrame with DataFrame.append inside the loop is quadratic and
# the method no longer exists in pandas 2.x.
records = []
for reason in reasons:
    if not isinstance(reason, str):
        continue  # skip missing reasons (non-string values)
    ix = rich_episode_level['collapsed_dreason'] == reason
    kmf.fit(T[ix], E[ix], label=reason)
    records.append({
        'reason': reason,
        'median_days': kmf.median_,
        'number_episodes': int(ix.sum()),
    })

# Explicit columns keep the table well-formed (and sortable) even if no reason matched.
# The rows now get a 0..n-1 index instead of the repeated 0 produced by appending.
median_duration_by_reason = pd.DataFrame(
    records, columns=['reason', 'median_days', 'number_episodes'])

median_duration_by_reason.sort_values('median_days')

In [None]:
# Horizontal bar chart of the median days per reason, sorted by median.
median_duration_by_reason.sort_values('median_days').plot(x='reason', y='median_days', kind='barh', figsize=(10, 5), color='#2B8CC4')

In [None]:
# Median episode length (Kaplan-Meier) and episode count per 'df_reason' value.
reasons = rich_episode_level['df_reason'].unique()

# Collect one record per reason and build the table once at the end.
# Growing a DataFrame with DataFrame.append inside the loop is quadratic and
# the method no longer exists in pandas 2.x.
records = []
for reason in reasons:
    if not isinstance(reason, str):
        continue  # skip missing reasons (non-string values)
    ix = rich_episode_level['df_reason'] == reason
    kmf.fit(T[ix], E[ix], label=reason)
    records.append({
        'reason': reason,
        'median_days': kmf.median_,
        'number_episodes': int(ix.sum()),
    })

# Explicit columns keep the table well-formed (and sortable) even if no reason matched.
# The rows now get a 0..n-1 index instead of the repeated 0 produced by appending.
median_duration_by_reason = pd.DataFrame(
    records, columns=['reason', 'median_days', 'number_episodes'])

median_duration_by_reason.sort_values('median_days')

In [None]:
# Number of episodes per Status value.
rich_episode_level['Status'].value_counts()


In [None]:
# Median episode length (Kaplan-Meier) for each 'df_reason' that occurs among
# episodes with Status == 'Discharged'.
reasons = rich_episode_level[rich_episode_level['Status'] == 'Discharged']['df_reason'].unique()

# Collect one record per reason and build the table once at the end.
# Growing a DataFrame with DataFrame.append inside the loop is quadratic and
# the method no longer exists in pandas 2.x.
records = []
for reason in reasons:
    if not isinstance(reason, str):
        continue  # skip missing reasons (non-string values)
    # NOTE(review): only the list of reasons is restricted to discharged
    # episodes; this mask matches every episode with the reason, whatever its
    # Status - confirm that is intended.
    ix = rich_episode_level['df_reason'] == reason
    kmf.fit(T[ix], E[ix], label=reason)
    records.append({'reason': reason, 'median_days': kmf.median_})

# Explicit columns keep the table well-formed (and sortable) even if no reason matched.
# The rows now get a 0..n-1 index instead of the repeated 0 produced by appending.
median_duration_by_reason = pd.DataFrame(records, columns=['reason', 'median_days'])

median_duration_by_reason.sort_values('median_days')

In [None]:
# Median episode length (Kaplan-Meier) per Service.
services = rich_episode_level['Service'].unique()

# Collect one record per service and build the table once at the end.
# Growing a DataFrame with DataFrame.append inside the loop is quadratic and
# the method no longer exists in pandas 2.x.
records = []
for service in services:
    if not isinstance(service, str):
        continue  # skip missing services (non-string values)
    ix = rich_episode_level['Service'] == service
    kmf.fit(T[ix], E[ix], label=service)
    records.append({'service': service, 'median_days': kmf.median_})

# The variable name is kept because later cells refer to it, although the rows
# here are services rather than reasons.
# Explicit columns keep the table well-formed (and sortable) even if no service matched.
median_duration_by_reason = pd.DataFrame(records, columns=['service', 'median_days'])

median_duration_by_reason.sort_values('median_days')

In [None]:
# Median episode length per Service, restricted to episodes whose collapsed
# discharge reason is exactly 'treatment completed - drug free'.
drug_free = rich_episode_level[
    rich_episode_level['collapsed_dreason'] == 'treatment completed - drug free'
].copy()
services = drug_free['Service'].unique()

# Durations and event flags for this subset only, in the subset's row order.
T_df, E_df = datetimes_to_durations(drug_free.start_date, drug_free.end_date)

# Collect one record per service and build the table once at the end.
# Growing a DataFrame with DataFrame.append inside the loop is quadratic and
# the method no longer exists in pandas 2.x.
records = []
for service in services:
    if not isinstance(service, str):
        continue  # skip missing services (non-string values)
    ix = drug_free['Service'] == service
    kmf.fit(T_df[ix], E_df[ix], label=service)
    records.append({
        'service': service,
        'median_days': kmf.median_,
        'number_episodes': int(ix.sum()),
    })

# Explicit columns keep the table well-formed (and sortable) even if no service matched.
# The rows now get a 0..n-1 index instead of the repeated 0 produced by appending.
median_duration_by_reason = pd.DataFrame(
    records, columns=['service', 'median_days', 'number_episodes'])

median_duration_by_reason.sort_values('median_days', ascending=False)



In [None]:
# Shape of the subset of services whose median is above 123 days.
# NOTE(review): 123 is hard-coded; presumably the overall median for this discharge reason - confirm.
median_duration_by_reason[median_duration_by_reason['median_days']>123].shape

In [None]:
# The ten services with the largest median days.
median_duration_by_reason.nlargest(10, 'median_days')

In [None]:
# Number of distinct services among the drug-free completions.
drug_free['Service'].nunique()

In [None]:
# Median episode length per Service, restricted to episodes whose collapsed
# discharge reason is exactly 'Treatment completed – alcohol-free'
# (the label is matched literally, including the capital T and the en dash).
alcohol_free = rich_episode_level[
    rich_episode_level['collapsed_dreason'] == 'Treatment completed – alcohol-free'
].copy()
services = alcohol_free['Service'].unique()

# Durations and event flags for this subset only, in the subset's row order.
T_af, E_af = datetimes_to_durations(alcohol_free.start_date, alcohol_free.end_date)

# Collect one record per service and build the table once at the end.
# Growing a DataFrame with DataFrame.append inside the loop is quadratic and
# the method no longer exists in pandas 2.x.
records = []
for service in services:
    if not isinstance(service, str):
        continue  # skip missing services (non-string values)
    ix = alcohol_free['Service'] == service
    kmf.fit(T_af[ix], E_af[ix], label=service)
    records.append({
        'service': service,
        'median_days': kmf.median_,
        'number_episodes': int(ix.sum()),
    })

# Explicit columns keep the table well-formed (and sortable) even if no service matched.
# The rows now get a 0..n-1 index instead of the repeated 0 produced by appending.
median_duration_by_reason = pd.DataFrame(
    records, columns=['service', 'median_days', 'number_episodes'])

median_duration_by_reason.sort_values('median_days', ascending=False)



In [None]:
# Shape of the subset of services whose median is above 134 days.
# NOTE(review): 134 is hard-coded; presumably the overall median for this discharge reason - confirm.
median_duration_by_reason[median_duration_by_reason['median_days']>134].shape

In [None]:
# The twenty services with the largest median days.
median_duration_by_reason.nlargest(20, 'median_days')

In [None]:
# Referral-source code -> text lookup, read from the pipe-delimited codes file.
referral_codes_table = pd.read_csv(
    os.path.join(DATADIR, 'Referral_codes.csv'),
    dtype=object, delimiter='|', encoding="ISO-8859-1")
rich_episode_level_codes = dict(zip(referral_codes_table.Code, referral_codes_table.Text))
rich_episode_level['referral_source_label'] = rich_episode_level['ReferralSource'].map(rich_episode_level_codes)

# Referral-source labels that keep their own category; every other label
# (including a missing one) is collapsed into 'other'.
# The arrest entry previously read 'Arrest rich_episode_level', which looks like
# a find-and-replace artefact of renaming a `referral` variable and could never
# match a label; it is restored to 'Arrest referral'.
# NOTE(review): confirm the exact label text against Referral_codes.csv.
collapsed_source_by_label = {
    'Self': 'self',
    'GP': 'GP',
    'Arrest referral': 'arrest referral',
    'Drug Service Statutory': 'statutory drug service',
    'CARAT / Prison': 'CARAT/prison',
    'Drug service non- statutory': 'non-statutory drug service',
    'Hospital': 'hospital',
    'Probation': 'probation',
    'Community Alcohol Team': 'community alcohol team',
}
rich_episode_level['collapsed_source'] = (
    rich_episode_level['referral_source_label']
    .map(collapsed_source_by_label)
    .fillna('other')
)

In [None]:
# One Kaplan-Meier panel per collapsed referral source, laid out on a 4x4 grid.
reasons = rich_episode_level['collapsed_source'].unique()

# Durations (T) and event-observed flags (E) for the whole table.
T, E = datetimes_to_durations(rich_episode_level.start_date, rich_episode_level.end_date)

fig = plt.figure(figsize=(20,20))
for i, reason in enumerate(reasons):
    # Anything that is not a plain string (e.g. a missing value) is skipped,
    # leaving its grid slot empty.
    if type(reason) != str:
        continue

    ax = fig.add_subplot(4, 4, i+1)
    ix = rich_episode_level['collapsed_source'] == reason
    # `kmf` is the shared fitter; it is re-fitted on this source's episodes.
    kmf.fit(T[ix], E[ix], label=reason)
    print('Median number of days in service for {}: {}'.format(reason, kmf.median_))
    kmf.plot(ax=ax, legend=False)
    ax.set_title(reason)
    ax.set_xlim(0, 200)
    ax.annotate("median (days):{}".format(kmf.median_), xy=(100, 0.7))
    if i == 0:
        ax.set_ylabel('Frac. in service after $n$ days')
fig.tight_layout()

In [None]:
# Number of distinct services among the alcohol-free completions.
alcohol_free['Service'].nunique()