In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pprint
import missingno as msno
import os
from helper_functions import open_table_list_columns

In [None]:
# Root directory of the source tables, taken from the DATADIR environment variable.
# NOTE(review): this is None when DATADIR is unset, in which case the later
# os.path.join(DATADIR, ...) call raises TypeError — consider failing fast here.
DATADIR = os.environ.get('DATADIR')

### client table

In [None]:
# Load the 'Client' table from DATADIR through the project helper.
# The result is used as a pandas DataFrame with a `Serial` column in the cells below.
# NOTE(review): judging by its name the helper also lists the table's columns — confirm.
client = open_table_list_columns(DATADIR, 'Client')

In [None]:
# Compare the number of rows in `client` with the number of distinct Serial values
# to see whether every row corresponds to exactly one client.
n_client_rows = client.shape[0]
n_client_serials = client['Serial'].nunique()
client_summary = "There are {} rows in client and {} unique clients (Serial).\nTherefore is {} that rows uniquely identify clients"
print(client_summary.format(n_client_rows, n_client_serials, n_client_rows == n_client_serials))

## Discharge

In [None]:
# Load the 'Discharge' table from DATADIR through the project helper.
# The result is used as a pandas DataFrame with `Serial` (and, for the joins, `Episode`) below.
discharge = open_table_list_columns(DATADIR,'Discharge')

In [None]:
# Row count vs. distinct clients in `discharge`, plus the mean number of
# discharge rows per client.
n_discharge_rows = discharge.shape[0]
n_discharge_serials = discharge['Serial'].nunique()
discharge_summary = "There are {} rows in discharge and {} unique clients (Serial).\nTherefore is {} that rows uniquely identify clients"
print(discharge_summary.format(n_discharge_rows,
                               n_discharge_serials,
                               n_discharge_rows == n_discharge_serials))
print("On average there are {} discharges per client".format(n_discharge_rows / n_discharge_serials))

### episodes

In [None]:
# Load the 'Episode' table from DATADIR through the project helper.
# The result is used as a pandas DataFrame keyed by `Serial` and `Episode` in the joins below.
episode = open_table_list_columns(DATADIR,'Episode')

In [None]:
# (rows, columns) of the episode table
episode.shape

In [None]:
# Number of distinct clients (Serial) that appear in the episode table
episode['Serial'].nunique()

### referral

In [None]:
# Load the 'Referral' table from DATADIR through the project helper.
# It is joined to `episode` on `Serial` and `Episode` further down.
referral = open_table_list_columns(DATADIR,'Referral')

### Construct episode-level dataset
referral --> episode --> discharge

### Join referral to episode
referral --> episode 

All episodes are expected to have a referral  
Are all referrals expected to have an episode?

In [None]:
# Outer-join referral to episode on (Serial, Episode).
# validate='1:1' makes pandas raise if the key pair is duplicated on either side;
# indicator=True adds a `_merge` column recording which side each row came from.
ref_epi = referral.merge(
    episode,
    how='outer',
    on=['Serial', 'Episode'],
    validate='1:1',
    indicator=True,
)

In [None]:
# Row counts per merge outcome (left_only / right_only / both)
ref_epi.groupby(by='_merge').size()

Hooray, all episodes have a referral and all referrals have an episode!

In [None]:
# The merge indicator has been inspected above; remove it so the next
# merge can add its own `_merge` column.
ref_epi = ref_epi.drop(columns='_merge').copy()

### Join referral_episode to discharge
referral --> episode --> discharge

All discharges are expected to have an episode  
Not all episodes are expected to have a discharge

In [None]:
# Outer-join the referral/episode frame to discharge on (Serial, Episode).
# validate='1:1' enforces at most one discharge row per episode key;
# `_merge` records episodes without a discharge (left_only) and
# discharges without an episode (right_only).
ref_epi_disch = ref_epi.merge(
    discharge,
    how='outer',
    on=['Serial', 'Episode'],
    validate='1:1',
    indicator=True,
)

In [None]:
# Row counts per merge outcome for the episode -> discharge join
ref_epi_disch.groupby(by='_merge').size()

There's 1 discharge without an episode...

In [None]:
# Show the rows that exist only in discharge (no matching referral/episode row)
is_discharge_only = ref_epi_disch['_merge'] == 'right_only'
ref_epi_disch.loc[is_discharge_only]

In [None]:
# Keep the episode/discharge merge indicator under a permanent name so it survives
# later merges (which add their own `_merge` column).
# A single rename replaces the original copy-then-drop: the resulting frame is the same
# (the indicator stays the last column), but re-running this cell is now a no-op instead
# of raising KeyError because `_merge` has already been removed.
ref_epi_disch = ref_epi_disch.rename(columns={'_merge': 'merge_epi_disch'}).copy()

### construct episode-level data joined to client

### Join referral_episode_discharge to client
ref_epi_disch --> client

All clients expected to have a referral or episode or discharge  
All episodes/referrals/discharges are expected to have a client

In [None]:
# Outer-join client (left) to the episode-level frame (right) on Serial.
# validate='1:m' makes pandas raise if Serial is duplicated in `client`;
# each client may match many episode rows.
client_ref_epi_disch = client.merge(
    ref_epi_disch,
    how='outer',
    on=['Serial'],
    validate='1:m',
    indicator=True,
)

In [None]:
# Row counts per merge outcome for the client -> episode join
client_ref_epi_disch.groupby(by='_merge').size()

In [None]:
# Peek at episode-level rows that have no matching row in `client`
has_no_client = client_ref_epi_disch['_merge'] == 'right_only'
client_ref_epi_disch.loc[has_no_client].head()

In [None]:
# Remove the client-join indicator so later merges can add their own `_merge` column
client_ref_epi_disch = client_ref_epi_disch.drop(columns='_merge').copy()

In [None]:
# Build a single string key "<Serial>_<Episode>" identifying a client episode.
# NOTE(review): rows without an episode get the text 'nan' for the Episode part, and a
# numeric Episode column would render as e.g. '1.0' after the outer joins — confirm the
# keys are strings so they line up with the other tables' serial_epi.
serial_text = client_ref_epi_disch["Serial"].map(str)
episode_text = client_ref_epi_disch["Episode"].map(str)
client_ref_epi_disch['serial_epi'] = serial_text + "_" + episode_text

### Substances
client_ref_epi_disch --> substances

All episodes expected to have a substance profile from initial assessment  
All substance profiles are expected to be linked to an episode

In [None]:
# Load the 'ClientSubstanceType' table from DATADIR through the project helper.
# The cells below use its `Serial`, `Episode` and `SubstanceStatus` columns.
substances = open_table_list_columns(DATADIR,'ClientSubstanceType')

In [None]:
# Preview the first rows of the substances table
substances.head()

In [None]:
# Build the same "<Serial>_<Episode>" string key used on the episode-level frame
serial_text = substances["Serial"].map(str)
episode_text = substances["Episode"].map(str)
substances['serial_epi'] = serial_text + "_" + episode_text

In [None]:
# Row count vs. distinct client episodes in `substances`, plus the mean number
# of substance rows per client episode.
n_substance_rows = substances.shape[0]
n_substance_episodes = substances['serial_epi'].nunique()
substances_summary = (
    "There are {} rows in substances and {} unique client episodes (serial_epi).\n "
    "Therefore is {} that rows uniquely identify client episodes"
)
print(substances_summary.format(n_substance_rows,
                                n_substance_episodes,
                                n_substance_rows == n_substance_episodes))
print("On average there are {} substances per client episode".format(
    n_substance_rows / n_substance_episodes))

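At this stage, don't want to go to a substancetype-level dataframe, so don't merge the full substances table; instead keep only the primary substance for each client episode and merge that.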

In [None]:
# Keep only the rows flagged as the primary substance.
# The comparison is against the string '1', so SubstanceStatus is assumed to be
# stored as text — TODO confirm against the loader.
is_primary = substances['SubstanceStatus'] == '1'
primary_sub = substances.loc[is_primary].copy()

In [None]:
# Row count vs. distinct client episodes in `primary_sub`; any difference means
# some episodes have more than one primary substance row.
n_primary_rows = primary_sub.shape[0]
n_primary_episodes = primary_sub['serial_epi'].nunique()
primary_summary = (
    "There are {} rows in primary_sub and {} unique client episodes (serial_epi).\n "
    "Therefore is {} that rows uniquely identify client episodes"
)
print(primary_summary.format(n_primary_rows,
                             n_primary_episodes,
                             n_primary_rows == n_primary_episodes))
print("On average there are {} substances per client episode".format(
    n_primary_rows / n_primary_episodes))

There should only be one primary substance type per client episode, so the 132 rows where there is more than one are likely to be data entry errors (entering SubstanceStatus as 1, 2, 3, etc. is likely to generate errors).

Small number of errors, so drop duplicates (keeping the first row found for each client episode)...

In [None]:
# Reduce to one primary-substance row per client episode.
# Where an episode has several rows, the first one in the current row order is kept.
primary_sub = primary_sub.drop_duplicates(subset='serial_epi', keep='first')

In [None]:
# Re-check after de-duplication: row count and distinct client episodes in
# `primary_sub` should now agree.
n_primary_rows = primary_sub.shape[0]
n_primary_episodes = primary_sub['serial_epi'].nunique()
primary_summary = (
    "There are {} rows in primary_sub and {} unique client episodes (serial_epi).\n "
    "Therefore is {} that rows uniquely identify client episodes"
)
print(primary_summary.format(n_primary_rows,
                             n_primary_episodes,
                             n_primary_rows == n_primary_episodes))
print("On average there are {} substances per client episode".format(
    n_primary_rows / n_primary_episodes))

In [None]:
# Outer-join the episode-level frame to the primary substance rows on serial_epi.
# validate='1:1' makes pandas raise if serial_epi is duplicated on either side.
# NOTE(review): both frames also carry `Serial` and `Episode`, which are not join keys
# here, so pandas will emit them as Serial_x/Serial_y and Episode_x/Episode_y — confirm
# that is intended.
client_ref_epi_disch_substance = client_ref_epi_disch.merge(
    primary_sub,
    how='outer',
    on=['serial_epi'],
    validate='1:1',
    indicator=True,
)

In [None]:
# Persist the episode-level dataset as a gzip-compressed CSV under DATADIR.
# NOTE(review): the `_merge` indicator from the substance join is still present at this
# point and the row index is written as well (to_csv default) — confirm both are wanted.
rich_episode_path = os.path.join(DATADIR, 'rich_episode_level.csv.gz')
client_ref_epi_disch_substance.to_csv(rich_episode_path, compression='gzip')

### TOP
client_ref_epi_disch --> top

All episodes expected to have at least one top assessment  
All top assessments expected to be linked to an episode

In [None]:
# Load the 'TOP' table from DATADIR through the project helper.
# The cells below use its `Serial` and `Episode` columns.
top = open_table_list_columns(DATADIR,'TOP')

In [None]:
# (rows, columns) of the TOP table
top.shape

In [None]:
# Build the "<Serial>_<Episode>" string key for each TOP row
serial_text = top["Serial"].map(str)
episode_text = top["Episode"].map(str)
top['serial_epi'] = serial_text + "_" + episode_text

In [None]:
# Row count vs. distinct client episodes in `top`, plus the mean number of
# TOP rows per client episode.
n_top_rows = top.shape[0]
n_top_episodes = top['serial_epi'].nunique()
top_summary = (
    "There are {} rows in top and {} unique client episodes (serial_epi).\n "
    "Therefore is {} that rows uniquely identify client episodes"
)
print(top_summary.format(n_top_rows, n_top_episodes, n_top_rows == n_top_episodes))
print("On average there are {} tops per client episode".format(
    n_top_rows / n_top_episodes))

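At this stage, don't want to go to a TOP-level dataframe, so the merge below goes into a separate dataframe (`epi_top`) to check coverage; `client_ref_epi_disch` itself stays episode-level.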

In [None]:
# Outer-join the episode-level frame to TOP on (Serial, Episode).
# validate='1:m' requires the key pair to be unique on the episode side while
# allowing several TOP rows per episode, so `epi_top` has one row per TOP row
# (plus unmatched rows from either side).
epi_top = client_ref_epi_disch.merge(
    top,
    how='outer',
    on=['Serial', 'Episode'],
    validate='1:m',
    indicator=True,
)

In [None]:
# Row counts per merge outcome for the episode -> TOP join
epi_top.groupby(by='_merge').size()

A lot of episodes without a TOP assessment.  
Check again once the population is restricted to the last 2 years

In [None]:
# Keep the episode/TOP merge indicator under a permanent name.
# A single rename replaces the original copy-then-drop: the resulting frame is the same
# (the indicator stays the last column), but re-running this cell is now a no-op instead
# of raising KeyError because `_merge` has already been removed.
epi_top = epi_top.rename(columns={'_merge': 'merge_epi_top'}).copy()

In [None]:
# Cross-tabulate the episode/discharge merge outcome against Status.
# The original cell read `client_epi_disch['disch_not_epi']`, but neither that frame nor
# that column is created anywhere in this notebook, so it raised NameError on a fresh
# kernel. `client_ref_epi_disch` is the client-joined episode frame built above, and its
# `merge_epi_disch` column carries the same information ('right_only' = discharge
# without an episode).
# NOTE(review): 'Status' is assumed to arrive unsuffixed from one of the source tables — confirm.
pd.crosstab(client_ref_epi_disch['merge_epi_disch'], client_ref_epi_disch['Status'])