In [1]:
from sqlalchemy import create_engine, event
import urllib.parse

import pandas as pd
from pandas.tseries.offsets import DateOffset
import datetime as dt
import numpy as np
import time
import re

# Inputs & Outputs

In [2]:
# SQL database
# Raw string: the instance name contains a literal backslash, and '\R' in a
# normal string literal is an invalid escape sequence (it only works by
# accident and emits a DeprecationWarning/SyntaxWarning).
server = r'CSKMA0400\RDB_Data'
db = 'JLDJobPath'
# ODBC connection string using Windows integrated authentication
# (Trusted_Connection=yes), so no credentials are stored here.
odbc_connection_string = 'DRIVER={SQL Server Native Client 11.0};SERVER='+server+';DATABASE='+db+';Trusted_Connection=yes'

# Unique ID variable for sql tables
# Column name used as the lookup/join key by the functions in this file.
uid_var = 'ppsn'

# Input sql tables
# Tables whose names end in `_full_clusters`. They are listed in the same date
# order as analysis_date_start_strings (the yyyymmdd stamp is embedded in each
# table name).
clusters_sql_tables = [
                       'linkedclaims_casuals_2018m04_v2_flat_20140401_with_income_36Vars__7BGM_full_clusters',
                       'linkedclaims_casuals_2018m04_v2_flat_20140701_with_income_36Vars__7BGM_full_clusters',
                       'linkedclaims_casuals_2018m04_v2_flat_20141001_with_income_36Vars__7BGM_full_clusters',
                       'linkedclaims_casuals_2018m04_v2_flat_20150101_with_income_36Vars__7BGM_full_clusters',
                       'linkedclaims_casuals_2018m04_v2_flat_20150401_with_income_36Vars__7BGM_full_clusters',
                       'linkedclaims_casuals_2018m04_v2_flat_20150701_with_income_36Vars__7BGM_full_clusters',
                       'linkedclaims_casuals_2018m04_v2_flat_20151001_with_income_36Vars__7BGM_full_clusters',
                       'linkedclaims_casuals_2018m04_v2_flat_20160101_with_income_36Vars__7BGM_full_clusters',
                       'linkedclaims_casuals_2018m04_v2_flat_20160401_with_income_36Vars__7BGM_full_clusters',
                       'linkedclaims_casuals_2018m04_v2_flat_20160701_with_income_36Vars__7BGM_full_clusters'
                       ]

# Flat tables: same base names as above without the
# `_36Vars__7BGM_full_clusters` suffix, in the same date order.
flat_jld_sql_tables = [
                       'linkedclaims_casuals_2018m04_v2_flat_20140401_with_income',
                       'linkedclaims_casuals_2018m04_v2_flat_20140701_with_income',
                       'linkedclaims_casuals_2018m04_v2_flat_20141001_with_income',
                       'linkedclaims_casuals_2018m04_v2_flat_20150101_with_income',
                       'linkedclaims_casuals_2018m04_v2_flat_20150401_with_income',
                       'linkedclaims_casuals_2018m04_v2_flat_20150701_with_income',
                       'linkedclaims_casuals_2018m04_v2_flat_20151001_with_income',
                       'linkedclaims_casuals_2018m04_v2_flat_20160101_with_income',
                       'linkedclaims_casuals_2018m04_v2_flat_20160401_with_income',
                       'linkedclaims_casuals_2018m04_v2_flat_20160701_with_income'
                       ]
# Column names for the flat tables.
# NOTE(review): presumably these are the columns read from flat_jld_sql_tables;
# the code that uses flat_jld_selected_variables is not in this part of the file.
fjld_variable_age = 'age'
fjld_variable_durationdays0 = 'duration_days_0'
fjld_variable_hist_lr0 = 'hist_lr_0'
fjld_variable_occupation0 = 'occupation_0'
fjld_variable_sex = 'sex'
fjld_variable_tdurationdays = 'total_duration_days'
fjld_variable_sum_pflag = 'total_sum_penaltyflag'
flat_jld_selected_variables = [uid_var, 
                               fjld_variable_age,
                                fjld_variable_durationdays0,
                               fjld_variable_hist_lr0,
                               fjld_variable_occupation0,
                               fjld_variable_sex,
                               fjld_variable_tdurationdays,
                               fjld_variable_sum_pflag]

# Event-level table queried by check_jld (through query_jld) and the column
# names it uses. jld_selected_variables is the SELECT list sent to SQL;
# the commented-out entries are columns that are currently not requested.
jld_sql_table = 'linkedclaims_casuals_2018m04_v2'
jld_variable_event_start = 'StartDate'
jld_variable_event_end = 'EndDate'
jld_variable_event_type = 'hist_lr'
# jld_variable_previus_event_type = 'PrevLR'
# jld_variable_next_event_type = 'NextLR'
# jld_variable_last_claim = 'LastClaim'
# 'LastLR' is the status code column that check_jld passes to assign_status.
jld_variable_last_lr = 'LastLR'
# check_jld fills this column with 0 for uids that have no matching row.
jld_variable_lr_flag = 'lr_flag'
jld_variable_hist_lls = 'hist_lls'
# jld_variable_first_lls = 'FirstLLS'
# jld_variable_last_lls = 'LastLLS'
jld_variable_occupation_rank = 'occupation_rank'
jld_variable_LM_code_rank = 'LM_code_rank'
jld_variable_ada_code_rank = 'ada_code_rank'
jld_variable_family_flag_rank = 'family_flag_rank'
jld_variable_marital_status_rank = 'marital_status_rank'
jld_selected_variables = [uid_var,
                          jld_variable_event_start,
                          jld_variable_event_end,
                          jld_variable_event_type,
#                           jld_variable_previus_event_type,
#                           jld_variable_next_event_type,
#                           jld_variable_last_claim,
                          jld_variable_last_lr,
                          jld_variable_lr_flag,
                          jld_variable_hist_lls,
#                           jld_variable_first_lls,
#                           jld_variable_last_lls,
                          jld_variable_occupation_rank,
                          jld_variable_LM_code_rank,
                          jld_variable_ada_code_rank,
                          jld_variable_family_flag_rank,
                          jld_variable_marital_status_rank]


# Referral data
# Network share holding the referral extracts; both CSV files below live in it.
data_path = '//cskma0294/F/Evaluations/JobPath/Python/Data/ReferralData/'
referral_data_filename_csv_1 = data_path+'Data for Evaluation Run 23072018 V3.csv'
referral_data_filename_csv_2 = data_path+'2017 Data for Evaluation 04092018.csv'
referral_data_csvfilenames = [referral_data_filename_csv_1, referral_data_filename_csv_2]
# Referral columns grouped by type.
# NOTE(review): presumably the date fields are parsed with rd_date_format
# (day/month/year) when the CSVs are loaded; the loading code is not in this
# part of the file, so confirm there.
rd_date_fields = ['Start Date','Date of Interview','PPP Agreed Date','Date of Cancellation','End Date','Date_paused','Dateresumed']
rd_date_format = '%d/%m/%Y'
rd_string_fields = ['Amended Referral Status','Paused_reason','Reason for Cancellation','Referal Status Description','Local Office Name', 'Cancellationsubcategory' ]
# Individual referral column names used by the dedup and summarise_* functions.
# The spellings ('Referal', 'Dateresumed', ...) are the literal CSV headers and
# must not be corrected here.
rd_start_date_var = 'Start Date'
rd_interview_date_var = 'Date of Interview'
rd_end_date_var = 'End Date'
rd_pause_date_var = 'Date_paused'
rd_resume_date_var = 'Dateresumed'
rd_paused_reason_var = 'Paused_reason'
rd_cancellation_date_var = 'Date of Cancellation'
rd_cancellation_reason_var = 'Reason for Cancellation'
rd_referral_status_var = 'Referal Status Description'
# Name of the unique-id column in the referral CSVs (differs from uid_var).
rd_uid_var = 'Pps No'

# Referral data rollup / duplicate-removal flag variables
# remove_dups_in_referral_data sets previous_jp_completed_flag_var to 1 when an
# older referral of the same person has status referral_status_target.
referral_status_target = 'Completed'
previous_jp_completed_flag_var = 'prev_jp_completed'

# Analysis parameters
# Analysis windows: entry i of analysis_date_start_strings pairs with entry i
# of analysis_date_end_strings. Both are formatted per
# analysis_dates_string_format (yyyymmdd).
analysis_date_start_strings = [
                               '20140401',
                               '20140701',
                               '20141001',
                               '20150101',
                               '20150401',
                               '20150701',
                               '20151001',
                               '20160101',
                               '20160401',
                               '20160701'
                               ]
# NOTE(review): every end date is its start date + 24 months EXCEPT the second
# entry ('20140701' -> '20160401'), which repeats the first end date. This
# looks like a copy/paste slip for '20160701' - confirm before relying on the
# results for that window.
analysis_date_end_strings =   [
                               '20160401',
                               '20160401',
                               '20161001',
                               '20170101',
                               '20170401',
                               '20170701',
                               '20171001',
                               '20180101',
                               '20180401',
                               '20180701'
                               ]
analysis_dates_string_format = '%Y%m%d'

# NOTE(review): these two are not used in the visible part of the file;
# the names suggest 16 periods of 3 months each - confirm at the point of use.
# ('lenght' is a typo for 'length', kept because other cells may use the name.)
number_of_periods = 16
periods_lenght_month = 3

# Output analysis: eligible by period
# ('elegible' in the identifier and in the column prefix is a typo for
# 'eligible'; both are kept because they are referenced/emitted by name.)
jp_prefix_elegible_var = 'jp_elegible'
jp_eligible_target_var = fjld_variable_durationdays0
# 10*30 = 300, compared against the duration-in-days column above.
# NOTE(review): presumably "10 months expressed in days" - confirm.
jp_eligible_target_var_min_val = 10*30

# Output analysis: started jp by period
jp_prefx_started_var = 'jp_started'
jp_prefx_started_target_var = rd_interview_date_var

# Output Analysis: imputed end date
# NOTE(review): presumably the imputed phase-1 end date is a start/interview
# date plus this many months; the code that computes it is not in view.
imputed_ph1_end_date_var = 'imputed_ph1_end_date'
imputed_ph1_end_date_offset_months = 12

# Output Analysis fields: summary within the analysis window ("aw")
# Names of the columns written by summarise_data_in_aw.
analysis_start_date_var = 'analysis_start_date'
analysis_end_date_var = 'analysis_end_date'
days_in_treatment_in_aw_var = 'jp_days_in_aw'
days_before_streatment_start_in_aw_var = 'jp_referral_days_in_aw'
days_paused_in_aw_var = 'jp_paused_days_in_aw'
jobpath_flag_in_aw_var = 'jp_flag_in_aw'
cancellation_flag_in_aw_var = 'cancelled_flag_in_aw'
paused_flag_in_aw_var = 'paused_flag_in_aw'
jp_started_before_aw_completed_ph1_in_aw_var = 'jp_started_before_aw_completed_ph1_in_aw'
jp_started_in_aw_completed_ph1_after_aw_var = 'jp_started_in_aw_completed_ph1_after_aw'
# Quarter flags: each *_offset_month_min/max pair is added (in months) to the
# analysis start date in summarise_data_in_aw; the imputed phase-1 end date
# must be > start+min and <= start+max for the flag to be set.
jp_ph1_completed_in_aw_firstQ_year2_var = 'jp_ph1_completed_in_aw_firstQ_year2_var'
firstQ_year2_offset_month_min = 12
firstQ_year2_offset_month_max = 15
jp_ph1_completed_in_aw_secondQ_year2_var = 'jp_ph1_completed_in_aw_secondQ_year2_var'
secondQ_year2_offset_month_min = 15
secondQ_year2_offset_month_max = 18
jp_ph1_completed_in_aw_thirdQ_year2_var = 'jp_ph1_completed_in_aw_thirdQ_year2_var'
thirdQ_year2_offset_month_min = 18
thirdQ_year2_offset_month_max = 21
jp_ph1_completed_in_aw_fourthQ_year2_var = 'jp_ph1_completed_in_aw_fourthQ_year2_var'
fourthQ_year2_offset_month_min = 21
fourthQ_year2_offset_month_max = 24
jp_ph1_not_completed_in_aw_var = 'jp_ph1_not_completed_in_aw'
jp_cancelled_before_start_in_aw_var = 'jp_cancelled_before_start_in_aw'
jp_cancelled_in_aw_start_before_aw_var = 'jp_cancelled_in_aw_start_before_aw'
jp_cancelled_in_aw_start_in_aw_var = 'jp_cancelled_in_aw_start_in_aw_var'
jp_cancelled_afer_aw_start_in_aw_var = 'jp_cancelled_afer_aw_start_in_aw'

# Output Analysis fields: summary before the analysis window
# Names of the columns written by summarise_data_before_aw.
jobpath_flag_before_aw_var = 'jp_flag_before_aw'
cancellation_flag_before_aw_var = 'cancelled_flag_before_aw'
paused_flag_before_aw_var = 'paused_flag_before_aw'
completed_ph1_jp_before_aw_var = 'completed_ph1_jp_before_aw'

# Output Analysis fields: summary after the analysis window
# Names of the columns written by summarise_data_after_aw.
jobpath_flag_after_aw_var = 'jp_flag_after_aw'
started_jp_after_aw_var = 'started_jp_after_aw'

# Output Post Processing Fields:
jobpath_category_in_aw_var = 'jobpath_category_in_aw'

# Output destination switches, table/file suffix and CSV folder
# NOTE(review): the code that reads these switches is not in view; presumably
# upload_to_sql gates upload_df_to_sql and export_to_csv gates a CSV export
# into csv_path - confirm at the point of use.
upload_to_sql = False
export_to_csv = True
out_suffix = '_jp_summary_status'
csv_path = '//cskma0294/F/Evaluations/JobPath/Python/Analysis/JPOutcomes/'

In [3]:
def batch(iterable, n = 1):
    """Yield the items of ``iterable`` as consecutive lists of ``n`` items.

    A list is emitted as soon as it holds exactly ``n`` items; whatever is
    left over when the input is exhausted is emitted as a final, shorter
    list. An empty input yields nothing.

    Note: the size test is an equality check, so an ``n`` that a list
    length can never equal (0, a negative number, a non-integral float)
    results in a single list holding every item.
    """
    pending = []
    for element in iterable:
        pending.append(element)
        if len(pending) != n:
            continue
        # Hand the full chunk over and start a brand-new list, so the
        # caller's chunk is never mutated afterwards.
        full_chunk, pending = pending, []
        yield full_chunk
    if len(pending) > 0:
        yield pending

def read_data_from_sql(sql_table):
    """Read a whole SQL table into a DataFrame.

    Parameters
    ----------
    sql_table : str
        Name of the table to read from the database described by
        ``odbc_connection_string``.

    Returns
    -------
    pandas.DataFrame
        The table contents as returned by ``pd.read_sql_table``.

    Notes
    -----
    The engine is disposed in a ``finally`` block so pooled connections are
    released even when the read fails. The previous version opened an extra
    raw connection that was never used for the read, and registered a
    ``fast_executemany`` listener that only affects bulk inserts; both are
    dropped.
    """
    # Connect to SQL
    params = urllib.parse.quote_plus(odbc_connection_string)
    engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)
    try:
        return pd.read_sql_table(sql_table, engine)
    finally:
        # Close every connection the engine's pool opened for this read.
        engine.dispose()

def upload_df_to_sql(df, sql_table):
    """Replace ``sql_table`` in the database with the contents of ``df``.

    The table is dropped if it already exists and then re-created and filled
    by ``DataFrame.to_sql`` in chunks of 10000 rows (the index is not written).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to upload.
    sql_table : str
        Target table name. It is concatenated into the DROP statement as-is,
        so it must come from trusted configuration, never from user input.

    Notes
    -----
    The connection and the engine are released in a ``finally`` block so a
    failing DROP or upload no longer leaks them.
    """
    # Connect to SQL
    params = urllib.parse.quote_plus(odbc_connection_string)
    engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)

    # Speed-up for bulk inserts: switch pyodbc to fast_executemany whenever
    # SQLAlchemy issues an executemany (which to_sql does for each chunk).
    @event.listens_for(engine, 'before_cursor_execute')
    def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
        if executemany:
            cursor.fast_executemany = True

    conn = engine.connect().connection
    try:
        cursor = conn.cursor()

        # Drop table if exists
        sql_string_drop = "IF OBJECT_ID('"+ sql_table + "', 'U') IS NOT NULL" +'\n'+ "DROP TABLE " + sql_table
        cursor.execute(sql_string_drop)
        conn.commit()

        # Upload data
        print('\nUploading to SQL')
        sql_chunksize = 10000
        df.to_sql(sql_table, engine, if_exists='append', index=False, chunksize=sql_chunksize)
    finally:
        # Close SQL connection and release the engine's pooled connections.
        conn.close()
        engine.dispose()
    
def remove_dups_in_referral_data(df):
    """Collapse the referral data to one row per person.

    People with a single referral row are kept unchanged. For people with
    several rows, only the most recent one is kept, where "most recent" is
    the first row after sorting by start, pause, resume, cancellation and end
    date, all descending (missing dates sort last). The column
    ``previous_jp_completed_flag_var`` is added: 1 when any of the discarded
    (older) rows of that person has referral status
    ``referral_status_target``, otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Referral data containing ``uid_var``, the ``rd_*`` date columns and
        ``rd_referral_status_var``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Single-referral rows (sorted by ``uid_var``) followed by one row per
        multi-referral person, in ``uid_var`` order.

    Notes
    -----
    Rows with a missing uid that occur more than once are dropped, because
    ``groupby`` skips missing keys (this matches the previous behaviour).

    Fixes relative to the previous version:

    * the float batch size ``len(uids)/20`` combined with a uid list that
      contained every uid several times could split one person across two
      batches and emit that person twice; the data is now grouped once;
    * ``len(frame > 1)`` (misplaced parenthesis) compared a whole mixed-type
      frame with 1; the test is now an explicit "any older row completed";
    * rows are collected and concatenated once instead of calling
      ``DataFrame.append`` in a loop (quadratic, and removed in pandas 2.0).
    """
    data = df.copy()

    data[previous_jp_completed_flag_var] = 0
    data = data.sort_values(uid_var)
    has_duplicate_uid = data.duplicated(subset=uid_var, keep=False)

    single_referrals = data[~has_duplicate_uid]
    multiple_referrals = data[has_duplicate_uid]

    recency_columns = [rd_start_date_var,
                       rd_pause_date_var,
                       rd_resume_date_var,
                       rd_cancellation_date_var,
                       rd_end_date_var]

    latest_rows = []
    # groupby iterates in uid order and skips missing uids.
    for _, referrals in multiple_referrals.groupby(uid_var):
        ordered = referrals.sort_values(recency_columns, ascending=False)
        # One-row frame (not a Series) so column dtypes survive the concat.
        latest = ordered.iloc[[0]].copy()
        older = ordered.iloc[1:]
        if (older[rd_referral_status_var] == referral_status_target).any():
            latest[previous_jp_completed_flag_var] = 1
        latest_rows.append(latest)

    return pd.concat([single_referrals] + latest_rows)

def assign_status(df, status_source_col, status_target_col):
    """Translate raw status codes into detailed and simplified status labels.

    Two columns are added to a copy of ``df``:

    * ``status_target_col`` - the detailed label,
    * ``status_target_col + '_simple'`` - the simplified label.

    Rules are evaluated top to bottom and the first match wins. Patterns are
    regular expressions searched anywhere in the code (substring match).

    Codes NOT containing 'CL' that contain 'UA' or 'UB':
        'UBCO' -> "On Live Register - Credits only"
        'UBSEMPCAS|UBSST|UBSTEA|UASEMP|UABTWE|UABTWP|DABTWP|UAPTJI'
            -> "Employment/Self-employment supported by DSP"
        'C-UA|UACAS|UASPRICAS|UAPTEOCAS|UAOFPXCAS'
            -> "On Live Register (casual worker) - JA"
        'C-UB|UBCAS|UBPTEOCAS' -> "On Live Register (casual worker) - JB"
        'UAINTN|UAMOM|UATLO|UAWPG'
            -> "In Education, Training or Employment Placement Course"
        'UAFASS|UAFISH|FISH' -> "Closed off the Live Register for other reasons"
        'UA' -> "On Live Register (excluding casual workers) - JA"
        'UB' -> "On Live Register (excluding casual workers) - JB"
    Codes NOT containing 'CL', 'UA' or 'UB':
        'SST|STEA|SEMP|BTW|BTWFD|PTJI|FIS'
            -> "Employment/Self-employment supported by DSP"
        'DUPS|OFP|OFPJST|OFPLPSSJST|OFPL|OFPS|OFPU|OFPJ|PRTA|FASS|UAFASS'
            -> "Closed off the Live Register for other reasons"
        'CE|FAS|ICTP|INTN|UAINTN|LMAF|LMEF|WPGO|MOM|SPFT|ES|TLO|SLO'
            -> "In Education, Training or Employment Placement Course"
        'EMPL' -> "In employment"
    Anything else that is not a single space (codes containing 'CL',
    unrecognised codes, missing values) keeps its raw value in both columns.
    A single-space code gets "Closed off the Live Register for other reasons".

    In the simplified column every "On Live Register ..." label collapses to
    "On Live Register" and "Employment/Self-employment supported by DSP"
    collapses to "In employment".

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; not modified.
    status_source_col : str
        Column holding the raw status codes (strings).
    status_target_col : str
        Name of the detailed output column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the two status columns added.

    Notes
    -----
    Fixes relative to the previous version:

    * the 'EMPL' rule was negated (``~contains('EMPL')``), which labelled
      every unrecognised code "In employment" and sent real 'EMPL' codes to
      "Closed off the Live Register for other reasons"; it now follows the
      rule table above. The catch-all rule that followed it only repeated
      the patterns of the four rules before it and is therefore removed;
    * missing codes are treated as "matches nothing" (``na=False``) so the
      masks stay boolean instead of failing on null values;
    * the 'CL' and 'UA|UB' masks are computed once instead of 13 times.
    """
    data = df.copy()
    codes = data[status_source_col]

    def has(pattern):
        # na=False keeps the mask strictly boolean when a code is missing.
        return codes.str.contains(pattern, regex=True, na=False)

    contains_cl = has('CL')
    contains_ua_ub = has('UA|UB')
    on_register = ~contains_cl & contains_ua_ub
    off_register = ~contains_cl & ~contains_ua_ub

    closed_off = "Closed off the Live Register for other reasons"
    education = "In Education, Training or Employment Placement Course"

    # (condition, simplified label, detailed label) - order matters.
    rules = [
        (on_register & has('UBCO'),
         "On Live Register", "On Live Register - Credits only"),
        (on_register & has('UBSEMPCAS|UBSST|UBSTEA|UASEMP|UABTWE|UABTWP|DABTWP|UAPTJI'),
         "In employment", "Employment/Self-employment supported by DSP"),
        (on_register & has('C-UA|UACAS|UASPRICAS|UAPTEOCAS|UAOFPXCAS'),
         "On Live Register", "On Live Register (casual worker) - JA"),
        (on_register & has('C-UB|UBCAS|UBPTEOCAS'),
         "On Live Register", "On Live Register (casual worker) - JB"),
        (on_register & has('UAINTN|UAMOM|UATLO|UAWPG'),
         education, education),
        (on_register & has('UAFASS|UAFISH|FISH'),
         closed_off, closed_off),
        (on_register & has('UA'),
         "On Live Register", "On Live Register (excluding casual workers) - JA"),
        (on_register & has('UB'),
         "On Live Register", "On Live Register (excluding casual workers) - JB"),
        (off_register & has('SST|STEA|SEMP|BTW|BTWFD|PTJI|FIS'),
         "In employment", "Employment/Self-employment supported by DSP"),
        (off_register & has('DUPS|OFP|OFPJST|OFPLPSSJST|OFPL|OFPS|OFPU|OFPJ|PRTA|FASS|UAFASS'),
         closed_off, closed_off),
        (off_register & has('CE|FAS|ICTP|INTN|UAINTN|LMAF|LMEF|WPGO|MOM|SPFT|ES|TLO|SLO'),
         education, education),
        (off_register & has('EMPL'),
         "In employment", "In employment"),
        # Fallback: keep the raw code for anything that is not a blank.
        ((codes != ' '), codes, codes),
    ]

    conditions = [rule[0] for rule in rules]
    choices_simple = [rule[1] for rule in rules]
    choices_det = [rule[2] for rule in rules]

    simple_col = status_target_col + '_simple'
    data[simple_col] = np.select(conditions, choices_simple, closed_off)
    # Rows matching no rule inherit the simplified label.
    data[status_target_col] = np.select(conditions, choices_det, data[simple_col])

    return data

def query_jld(variable_uid, uids, jld_sql_table, select_variables, date_var = None, min_date = None, max_date = None):
    """Fetch the rows of ``jld_sql_table`` whose uid is in ``uids``.

    Parameters
    ----------
    variable_uid : str
        Name of the uid column used in the WHERE clause.
    uids : iterable or str
        The uid values to look up. A string is inserted verbatim after
        ``IN`` (it must already be a parenthesised SQL list), which keeps the
        call form accepted by the previous version working.
    jld_sql_table : str
        Table to query.
    select_variables : list of str
        Column names to select.
    date_var, min_date, max_date
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    pandas.DataFrame
        The selected columns for the matching rows. When ``uids`` is empty
        an empty frame with ``select_variables`` as columns is returned
        without touching the database.

    Notes
    -----
    Table and column names are concatenated into the statement, so they must
    come from trusted configuration. The uid values are written as SQL
    literals (not bound parameters) because a batch can exceed SQL Server's
    limit on the number of parameters per statement; string values have
    their single quotes doubled.

    Fixes relative to the previous version: ``str(tuple)`` produced invalid
    SQL for a single uid (``('x',)``) and for no uids (``()``), and any uid
    containing an apostrophe broke the statement. The engine is now disposed
    even when the query fails.
    """
    def _sql_literal(value):
        # Strings are quoted with embedded quotes doubled; other values
        # (e.g. integers) are written as-is.
        if isinstance(value, str):
            return "'" + value.replace("'", "''") + "'"
        return str(value)

    if isinstance(uids, str):
        in_clause = uids
    else:
        uid_values = list(uids)
        if not uid_values:
            # "IN ()" is not valid SQL and there is nothing to look up.
            return pd.DataFrame(columns=list(select_variables))
        in_clause = "(" + ", ".join(_sql_literal(value) for value in uid_values) + ")"

    sql_query_string = "SELECT " + (',').join(select_variables) + " FROM " + jld_sql_table + \
                        " WHERE " + variable_uid + " IN " + in_clause

    # Connect to SQL
    params = urllib.parse.quote_plus(odbc_connection_string)
    engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)
    try:
        return pd.read_sql_query(sql_query_string, engine)
    finally:
        # Close SQL connections held by the engine's pool.
        engine.dispose()

def select_rows_in_date(df, date, variable_min_date = None, variable_max_date = None):
    """Return the rows of ``df`` that bracket ``date``.

    A row is kept when its ``variable_min_date`` column is strictly before
    ``date`` and/or its ``variable_max_date`` column is strictly after
    ``date``, depending on which column names are supplied. With neither
    supplied, a copy of ``df`` is returned unfiltered. ``df`` itself is
    never modified.
    """
    use_min = variable_min_date is not None
    use_max = variable_max_date is not None

    snapshot = df.copy()
    if not (use_min or use_max):
        return snapshot

    if use_min and use_max:
        keep = (df[variable_min_date] < date) & (df[variable_max_date] > date)
    elif use_min:
        keep = (df[variable_min_date] < date)
    else:
        keep = (df[variable_max_date] > date)

    return snapshot.loc[keep]

def check_jld (df, quarter_start, quarter_end, suffix):
    """Attach each person's latest JLD event before ``quarter_end`` to ``df``.

    Steps:

    1. query ``jld_sql_table`` (columns ``jld_selected_variables``) for the
       uids in ``df``, in batches of 10000 uids;
    2. keep the events whose start date is strictly before ``quarter_end``;
    3. keep one event per uid: the first after sorting by event end,
       ``lr_flag``, event type and event start, all descending;
    4. derive the status labels from ``jld_variable_last_lr`` with
       ``assign_status`` (columns ``status`` and ``status_simple``);
    5. append ``suffix`` to every JLD column name except ``uid_var`` and
       left-join the result onto ``df``;
    6. set the suffixed ``lr_flag`` column to 0 for uids with no event.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``uid_var``; not modified.
    quarter_start
        Accepted for interface compatibility; currently unused.
    quarter_end
        Cut-off compared against the event start dates.
    suffix : str
        Suffix appended to the names of the joined JLD columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the suffixed JLD columns added.

    Notes
    -----
    Changes relative to the previous version:

    * ``res[col].fillna(0, inplace=True)`` acted on a temporary selection
      and has no effect under pandas Copy-on-Write; the filled column is now
      assigned back explicitly;
    * each uid is queried once and missing uids are skipped - repeated uids
      could straddle a batch boundary and yield duplicated JLD rows, and a
      missing uid produced an invalid SQL literal;
    * batches are concatenated directly and the "latest event per uid" is
      selected with one sort plus ``drop_duplicates`` instead of a Python
      loop over groups (same row is chosen, since the sort is stable).
    """
    res = df.copy()

    uids = res[uid_var].dropna().unique().tolist()

    # 1. Pull the JLD events batch by batch to bound the size of each query.
    query_batch_size = 10000
    batch_frames = [query_jld(uid_var, tuple(batch_uids), jld_sql_table, jld_selected_variables)
                    for batch_uids in batch(uids, n=query_batch_size)]
    if batch_frames:
        jld_data = pd.concat(batch_frames, ignore_index=True)
    else:
        jld_data = pd.DataFrame(columns=jld_selected_variables)

    # 2. Only events that started before the end of the quarter.
    jld_data = select_rows_in_date(df=jld_data,
                                   date=quarter_end,
                                   variable_min_date=jld_variable_event_start)

    # 3. One row per uid: the top row of the descending sort.
    jld_data_single = (jld_data
                       .sort_values([jld_variable_event_end,
                                     jld_variable_lr_flag,
                                     jld_variable_event_type,
                                     jld_variable_event_start],
                                    ascending=[False, False, False, False])
                       .drop_duplicates(subset=uid_var, keep='first'))

    # 4. Attach status
    jld_data_single = assign_status(jld_data_single, jld_variable_last_lr, 'status')

    # 5. Rename JLD cols before joining info
    renamed_columns = {cname: str(cname) + suffix
                       for cname in jld_data_single.columns
                       if cname != uid_var}
    jld_data_single = jld_data_single.rename(columns=renamed_columns)

    # Join info
    res = pd.merge(res, jld_data_single, on=uid_var, how='left')

    # 6. Set to 0 'lr_flag' for ppsn not in JLD at date
    lr_flag_column = jld_variable_lr_flag + suffix
    res[lr_flag_column] = res[lr_flag_column].fillna(0)

    return res

def summarise_data_after_aw(df, end_analysis_window_date):
    """Flag referrals with activity after the end of the analysis window.

    Two 0/1 columns are added to a copy of ``df``:

    * ``jobpath_flag_after_aw_var`` - 1 when the referral start date or the
      referral end date exists and is after ``end_analysis_window_date``;
    * ``started_jp_after_aw_var`` - 1 when the interview date exists and is
      after ``end_analysis_window_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Referral data with the ``rd_start_date_var``, ``rd_end_date_var``
        and ``rd_interview_date_var`` columns; not modified.
    end_analysis_window_date
        End of the analysis window, comparable with the date columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the two flag columns added.

    Notes
    -----
    Fix: the "start date exists" rule previously tested the END date column,
    so a referral starting after the window with no end date recorded yet
    was not flagged.
    """
    res = df.copy()

    start_dates = res[rd_start_date_var]
    end_dates = res[rd_end_date_var]
    interview_dates = res[rd_interview_date_var]

    ## set jobpath flag after analysis window
    res[jobpath_flag_after_aw_var] = 0
    # if date start > analysis end, and date start exists
    res.loc[start_dates.notna() & (start_dates > end_analysis_window_date),
            jobpath_flag_after_aw_var] = 1
    # if date end > analysis end, and date end exists
    res.loc[end_dates.notna() & (end_dates > end_analysis_window_date),
            jobpath_flag_after_aw_var] = 1

    ## set started_jp_after_aw_var
    res[started_jp_after_aw_var] = 0
    # if interview date exists and is after the analysis window
    res.loc[interview_dates.notna() & (interview_dates > end_analysis_window_date),
            started_jp_after_aw_var] = 1

    return res
        
    
def summarise_data_before_aw(df, start_analysis_window_date):
    """Flag referral activity that happened before the analysis window.

    Four columns are added to a copy of ``df``:

    * ``cancellation_flag_before_aw_var`` - 1 when the cancellation date is
      before the window start, or when there is no cancellation date but a
      cancellation reason and the referral start date is before the window
      start; missing (NaN) when there is a cancellation reason but neither a
      cancellation date nor a start date; otherwise 0;
    * ``paused_flag_before_aw_var`` - 1 when the resume date exists and is
      on or before the window start;
    * ``jobpath_flag_before_aw_var`` - 1 when the end date is before the
      window start, or either of the two flags above is 1;
    * ``completed_ph1_jp_before_aw_var`` - 1 when the imputed phase-1 end
      date exists and is before the window start.

    Parameters
    ----------
    df : pandas.DataFrame
        Referral data with the ``rd_*`` columns and
        ``imputed_ph1_end_date_var``; not modified.
    start_analysis_window_date
        Start of the analysis window, comparable with the date columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the four columns added.

    Notes
    -----
    Fixes relative to the previous version:

    * the cancellation and pause rules read the global
      ``analysis_date_start`` instead of the ``start_analysis_window_date``
      argument, so the result depended on notebook state (and failed with a
      NameError when that global was not defined);
    * the "cancellation timing unknown" marker was written to the IN-window
      cancellation column (``cancellation_flag_in_aw_var``) rather than the
      before-window one this function owns.
    """
    res = df.copy()
    window_start = start_analysis_window_date

    cancellation_dates = res[rd_cancellation_date_var]
    has_cancellation_reason = res[rd_cancellation_reason_var].notna()
    start_dates = res[rd_start_date_var]
    resume_dates = res[rd_resume_date_var]
    end_dates = res[rd_end_date_var]
    imputed_ph1_end_dates = res[imputed_ph1_end_date_var]

    ## set cancelled flag before analysis window
    res[cancellation_flag_before_aw_var] = 0
    # if cancellation date defined and before analysis window
    res.loc[cancellation_dates.notna() & (cancellation_dates < window_start),
            cancellation_flag_before_aw_var] = 1
    # if cancellation date not defined, cancellation reason defined and start date before analysis window
    res.loc[cancellation_dates.isna() & has_cancellation_reason & (start_dates < window_start),
            cancellation_flag_before_aw_var] = 1
    # if cancellation date not defined, cancellation reason defined and start date not defined:
    # the timing cannot be established, so the flag is left missing.
    # Series.mask upcasts the integer column only when something is masked.
    timing_unknown = cancellation_dates.isna() & has_cancellation_reason & start_dates.isna()
    res[cancellation_flag_before_aw_var] = res[cancellation_flag_before_aw_var].mask(timing_unknown)

    ## set pause flag before analysis window
    res[paused_flag_before_aw_var] = 0
    # if resume date present and on/before analysis start
    res.loc[resume_dates.notna() & (resume_dates <= window_start),
            paused_flag_before_aw_var] = 1

    ## set jobpath flag before analysis window
    res[jobpath_flag_before_aw_var] = 0
    # if date end < analysis start, and date end exists
    res.loc[end_dates.notna() & (end_dates < window_start),
            jobpath_flag_before_aw_var] = 1
    # if paused before analysis window
    res.loc[res[paused_flag_before_aw_var] == 1,
            jobpath_flag_before_aw_var] = 1
    # if cancelled before analysis window
    res.loc[res[cancellation_flag_before_aw_var] == 1,
            jobpath_flag_before_aw_var] = 1

    ## set completed_ph1_jp_before_aw_var
    res[completed_ph1_jp_before_aw_var] = 0
    # if imputed phase-1 end date exists and < analysis start
    res.loc[imputed_ph1_end_dates.notna() & (imputed_ph1_end_dates < window_start),
            completed_ph1_jp_before_aw_var] = 1

    return res


def summarise_data_in_aw(df, start_analysis_window_date, end_analysis_window_date):
    res = df.copy()
    
    res [analysis_start_date_var] = start_analysis_window_date
    res [analysis_end_date_var] = end_analysis_window_date
    
    
    ## set jp_started_before_aw_completed_ph1_in_aw_var
    res[jp_started_before_aw_completed_ph1_in_aw_var] = 0
    res.loc[(pd.isna(rd_interview_date_var) == 0) &
            (pd.isna(imputed_ph1_end_date_var) == 0) &
            (res[rd_interview_date_var] < analysis_date_start) &
            (res[imputed_ph1_end_date_var] >= analysis_date_start) &
            (res[imputed_ph1_end_date_var] <= analysis_date_end), 
            jp_started_before_aw_completed_ph1_in_aw_var ] = 1
    
    ## set jp_started_in_aw_completed_ph1_after_aw_var
    res[jp_started_in_aw_completed_ph1_after_aw_var] = 0
    res.loc[(pd.isna(rd_interview_date_var) == 0) &
            (pd.isna(imputed_ph1_end_date_var) == 0) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end) &
            (res[imputed_ph1_end_date_var] > analysis_date_end), 
            jp_started_in_aw_completed_ph1_after_aw_var ] = 1
    
    ## set jp_ph1_completed_in_aw_firstQ_year2_var
    res[jp_ph1_completed_in_aw_firstQ_year2_var] = 0
    res.loc[(pd.isna(imputed_ph1_end_date_var) == 0) &
            (res[imputed_ph1_end_date_var] > analysis_date_start + DateOffset(months=firstQ_year2_offset_month_min)) & 
            (res[imputed_ph1_end_date_var] <= analysis_date_start + DateOffset(months=firstQ_year2_offset_month_max)), 
            jp_ph1_completed_in_aw_firstQ_year2_var] = 1
    
    ## set jp_ph1_completed_in_aw_secondQ_year2_var
    res[jp_ph1_completed_in_aw_secondQ_year2_var] = 0
    res.loc[(pd.isna(imputed_ph1_end_date_var) == 0) &
            (res[imputed_ph1_end_date_var] > analysis_date_start + DateOffset(months=secondQ_year2_offset_month_min)) & 
            (res[imputed_ph1_end_date_var] <= analysis_date_start + DateOffset(months=secondQ_year2_offset_month_max)), 
            jp_ph1_completed_in_aw_secondQ_year2_var] = 1
    
    ## set jp_ph1_completed_in_aw_thirdQ_year2_var
    res[jp_ph1_completed_in_aw_thirdQ_year2_var] = 0
    res.loc[(pd.isna(imputed_ph1_end_date_var) == 0) &
            (res[imputed_ph1_end_date_var] > analysis_date_start + DateOffset(months=thirdQ_year2_offset_month_min)) & 
            (res[imputed_ph1_end_date_var] <= analysis_date_start + DateOffset(months=thirdQ_year2_offset_month_max)), 
            jp_ph1_completed_in_aw_thirdQ_year2_var] = 1
    
    ## set jp_ph1_completed_in_aw_fourthQ_year2_var
    res[jp_ph1_completed_in_aw_fourthQ_year2_var] = 0
    res.loc[(pd.isna(imputed_ph1_end_date_var) == 0) &
            (res[imputed_ph1_end_date_var] > analysis_date_start + DateOffset(months=fourthQ_year2_offset_month_min)) & 
            (res[imputed_ph1_end_date_var] <= analysis_date_start + DateOffset(months=fourthQ_year2_offset_month_max)), 
            jp_ph1_completed_in_aw_fourthQ_year2_var] = 1
    
    ## set jp_ph1_not_completed_in_aw_var
    res[jp_ph1_not_completed_in_aw_var] = 0
    res.loc[(res[jp_ph1_completed_in_aw_firstQ_year2_var] == 0) &
            (res[jp_ph1_completed_in_aw_secondQ_year2_var] == 0) &
            (res[jp_ph1_completed_in_aw_thirdQ_year2_var] == 0) &
            (res[jp_ph1_completed_in_aw_fourthQ_year2_var] == 0),
            jp_ph1_not_completed_in_aw_var] = 1
    
        
    ## set jobpath flag in analysis window
    res[jobpath_flag_in_aw_var] = 0
    #if start date in analysis window
    res.loc[(pd.isna(res[rd_start_date_var]) == 0) &
            (res[rd_start_date_var] >= analysis_date_start) &
            (res[rd_start_date_var] <= analysis_date_end),
            jobpath_flag_in_aw_var] = 1
    #if interview date in analysis window
    res.loc[(pd.isna(res[rd_interview_date_var]) == 0) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end),
            jobpath_flag_in_aw_var] = 1
    #if end date in analysis window
    res.loc[(pd.isna(res[rd_end_date_var]) == 0) &
            (res[rd_end_date_var] >= analysis_date_start) &
            (res[rd_end_date_var] <= analysis_date_end),
            jobpath_flag_in_aw_var] = 1
    #if start date < analysis start and end date > analysis end
    res.loc[(pd.isna(res[rd_start_date_var]) == 0) &
            (res[rd_start_date_var] <= analysis_date_start) &
            (pd.isna(res[rd_end_date_var]) == 0) &
            (res[rd_end_date_var] >= analysis_date_end),
            jobpath_flag_in_aw_var] = 1
    #if start date < analysis start , cancellation > analysis start
    res.loc[(pd.isna(res[rd_start_date_var]) == 0) &
            (res[rd_start_date_var] <= analysis_date_start) &
            (pd.isna(res[rd_cancellation_date_var]) == 0) &
            (res[rd_cancellation_date_var] >= analysis_date_end),
            jobpath_flag_in_aw_var] = 1
    #if start date < analysis start , pause > analysis start
    res.loc[(pd.isna(res[rd_start_date_var]) == 0) &
            (res[rd_start_date_var] <= analysis_date_start) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (res[rd_pause_date_var] >= analysis_date_end),
            jobpath_flag_in_aw_var] = 1
            
    
    ## set cancelled flag in analysis window
    res[cancellation_flag_in_aw_var] = 0
    # if cancellation date defined and within analysis window
    res.loc[(pd.isna(res[rd_cancellation_date_var]) == 0) &
            (res[rd_cancellation_date_var] >=  analysis_date_start) &
            (res[rd_cancellation_date_var] <=  analysis_date_end),
            cancellation_flag_in_aw_var] = 1
    # if cancellation date not defined, cancellation reason defined and start date within analysis window
    res.loc[(pd.isna(res[rd_cancellation_date_var]) == 1) &
            (pd.isna(res[rd_cancellation_reason_var]) == 0) &
            (res[rd_start_date_var] >=  analysis_date_start) &
            (res[rd_start_date_var] <=  analysis_date_end),
            cancellation_flag_in_aw_var] = 1
    # if cancellation date not defined, cancellation reason defined and start date not defined
    res.loc[(pd.isna(res[rd_cancellation_date_var]) == 1) &
            (pd.isna(res[rd_cancellation_reason_var]) == 0) &
            (pd.isna(res[rd_start_date_var]) == 1),
            cancellation_flag_in_aw_var] = np.nan
    
    ## set jp_cancelled_before_start_in_aw_var
    res [jp_cancelled_before_start_in_aw_var] = 0
    res.loc[(res[cancellation_flag_in_aw_var] == 0) &
            (pd.isna(res[rd_interview_date_var])) == 1,
            jp_cancelled_before_start_in_aw_var] = 1
    
    ## set jp_cancelled_in_aw_start_before_aw_var
    res [jp_cancelled_in_aw_start_before_aw_var] = 0
    res.loc[(pd.isna(rd_cancellation_date_var)== 0) &
            (pd.isna(rd_interview_date_var)== 0) &
            (res[rd_cancellation_date_var] >= analysis_date_start) &
            (res[rd_cancellation_date_var] <= analysis_date_end) &
            (res[rd_interview_date_var] < analysis_date_start),
            jp_cancelled_in_aw_start_before_aw_var] = 1
    
    ## set jp_cancelled_in_aw_start_in_aw_var
    res[jp_cancelled_in_aw_start_in_aw_var] = 0
    res.loc[(pd.isna(rd_cancellation_date_var)== 0) &
            (pd.isna(rd_interview_date_var)== 0) &
            (res[rd_cancellation_date_var] >= analysis_date_start) &
            (res[rd_cancellation_date_var] <= analysis_date_end) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end), 
            jp_cancelled_in_aw_start_in_aw_var] = 1
    
    ## set jp_cancelled_afer_aw_start_in_aw_var
    res[jp_cancelled_afer_aw_start_in_aw_var] = 0
    res.loc[(pd.isna(rd_cancellation_date_var)== 0) &
            (pd.isna(rd_interview_date_var)== 0) &
            (res[rd_cancellation_date_var] >= analysis_date_end) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end), 
            jp_cancelled_afer_aw_start_in_aw_var] = 1
    
    ## set paused flag in analysis window
    res [paused_flag_in_aw_var] = 0
    # if pause date present and within analysis start/end
    res.loc[(pd.isna(res[rd_pause_date_var]) == 0) & 
            (res[rd_pause_date_var] <= analysis_date_end) & 
            (res[rd_pause_date_var] >= analysis_date_start),
            paused_flag_in_aw_var] = 1
    # if pause date present and <= analysis start and resume date present and >= analysis start
    res.loc[(pd.isna(res[rd_pause_date_var]) == 0) & 
            (res[rd_pause_date_var] <= analysis_date_start) &
            (pd.isna(res[rd_resume_date_var]) == 0) & 
            (res[rd_resume_date_var] >= analysis_date_start),
            paused_flag_in_aw_var] = 1
    # if pause date present and <= analysis start and resume date not present
    res.loc[(pd.isna(res[rd_pause_date_var]) == 0) & 
            (res[rd_pause_date_var] <= analysis_date_start) &
            (pd.isna(res[rd_resume_date_var]) == 1) ,
            paused_flag_in_aw_var] = 1
    
    
    ## count days paused in analysis window
    res[days_paused_in_aw_var] = 0
    # if missing pause date
    res.loc[ (res[paused_flag_in_aw_var] == 1) & (pd.isna(res[rd_pause_date_var]) == 1), days_paused_in_aw_var] = np.nan
    # if paused and pause date >= analysis start and resume <= analysis end 
    res.loc[(res[paused_flag_in_aw_var] == 1) & 
            (pd.isna(res[rd_pause_date_var]) == 0) & 
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (res[rd_pause_date_var] >= analysis_date_start) &
            (res[rd_resume_date_var] <= analysis_date_end),
            days_paused_in_aw_var] = (res[rd_resume_date_var] - res[rd_pause_date_var]).dt.days
    # if paused and pause date >= analysis start and resume >= analysis end 
    res.loc[(res[paused_flag_in_aw_var] == 1) & 
            (pd.isna(res[rd_pause_date_var]) == 0) & 
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (res[rd_pause_date_var] >= analysis_date_start) &
            (res[rd_resume_date_var] >= analysis_date_end),
            days_paused_in_aw_var] = ( res [analysis_end_date_var] - res[rd_pause_date_var]).dt.days
    # if paused and pause date <= analysis start and resume <= analysis end 
    res.loc[(res[paused_flag_in_aw_var] == 1) & 
            (pd.isna(res[rd_pause_date_var]) == 0) & 
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (res[rd_pause_date_var] <= analysis_date_start) &
            (res[rd_resume_date_var] <= analysis_date_end),
            days_paused_in_aw_var] = (res[rd_resume_date_var] - res [analysis_start_date_var]).dt.days
    # if paused and pause date <= analysis start and resume >= analysis end 
    res.loc[(res[paused_flag_in_aw_var] == 1) & 
            (pd.isna(res[rd_pause_date_var]) == 0) & 
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (res[rd_pause_date_var] <= analysis_date_start) &
            (res[rd_resume_date_var] >= analysis_date_end),
            days_paused_in_aw_var] = (res[analysis_end_date_var] - res [analysis_start_date_var]).dt.days
    # if paused and pause date <= analysis start and resume date not defined
    res.loc[(res[paused_flag_in_aw_var] == 1) & 
            (pd.isna(res[rd_pause_date_var]) == 0) & 
            (pd.isna(res[rd_resume_date_var]) == 1) &
            (res[rd_pause_date_var] <= analysis_date_start),
            days_paused_in_aw_var] = ( res[analysis_end_date_var] - res [analysis_start_date_var]).dt.days
    # if paused and pause date >= analysis start and resume&end date not defined 
    # if analysis start <= pause date <= analysis end 
    res.loc[(res[paused_flag_in_aw_var] == 1) & 
            (pd.isna(res[rd_pause_date_var]) == 0) & 
            (pd.isna(res[rd_resume_date_var]) == 1) &
            (pd.isna(res[rd_end_date_var]) == 1) &
            (res[rd_pause_date_var] >= analysis_date_start),
            days_paused_in_aw_var] = ( res [analysis_end_date_var] - res[rd_pause_date_var]).dt.days
    # if paused and pause date >= analysis start and resume not defined 
    # if analysis start <= pause date <= analysis end <= end date
    res.loc[(res[paused_flag_in_aw_var] == 1) & 
            (pd.isna(res[rd_pause_date_var]) == 0) & 
            (pd.isna(res[rd_resume_date_var]) == 1) &
            (pd.isna(res[rd_end_date_var]) == 0) &
            (res[rd_pause_date_var] >= analysis_date_start) &
            (res[rd_pause_date_var] <= analysis_date_end) & 
            (analysis_date_end <= res[rd_end_date_var]),
            days_paused_in_aw_var] = ( res [analysis_end_date_var] - res[rd_pause_date_var]).dt.days
    # if paused and pause date >= analysis start and resume not defined 
    # if analysis start <= pause date <= end date <= analysis end 
    res.loc[(res[paused_flag_in_aw_var] == 1) & 
            (pd.isna(res[rd_pause_date_var]) == 0) & 
            (pd.isna(res[rd_resume_date_var]) == 1) &
            (pd.isna(res[rd_end_date_var]) == 0) &
            (res[rd_pause_date_var] >= analysis_date_start) &
            (res[rd_pause_date_var] <= res[rd_end_date_var]) & 
            (res[rd_end_date_var] <= analysis_date_end ),
            days_paused_in_aw_var] = ( res [rd_end_date_var] - res[rd_pause_date_var]).dt.days

    
    ## count days in referral in analysis window
    res[days_before_streatment_start_in_aw_var] = 0
    # if start date and interview date not defined
    res.loc[(pd.isna(res[rd_start_date_var]) == 1) &
            (pd.isna(res[rd_interview_date_var]) == 1),
            days_before_streatment_start_in_aw_var] = np.nan
    # if start date defined, within analysis window and interview date not defined
    res.loc[(pd.isna(res[rd_start_date_var]) == 0) &
            (res[rd_start_date_var] >= analysis_date_start) &
            (res[rd_start_date_var] <= analysis_date_end) &
            (pd.isna(res[rd_interview_date_var]) == 1),
            days_before_streatment_start_in_aw_var] = np.nan
    # if start date defined, <= analysis window start and interview date not defined
    res.loc[(pd.isna(res[rd_start_date_var]) == 0) &
            (res[rd_start_date_var] <= analysis_date_start) &
            (pd.isna(res[rd_interview_date_var]) == 1),
            days_before_streatment_start_in_aw_var] = np.nan
    # if not paused
    # if start date defined, within analysis window and interview date within analysis window
    res.loc[(res[paused_flag_in_aw_var]== 0) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (res[rd_start_date_var] >= analysis_date_start) &
            (res[rd_start_date_var] <= analysis_date_end) &
            (pd.isna(res[rd_interview_date_var]) == 0) & 
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end),
            days_before_streatment_start_in_aw_var] = (res[rd_interview_date_var] - res[rd_start_date_var]).dt.days
    # if not paused
    # if start date defined, within analysis window and interview date >= analysis window end
    res.loc[(res[paused_flag_in_aw_var]== 0) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (res[rd_start_date_var] >= analysis_date_start) &
            (res[rd_start_date_var] <= analysis_date_end) &
            (pd.isna(res[rd_interview_date_var]) == 0) & 
            (res[rd_interview_date_var] >= analysis_date_end),
            days_before_streatment_start_in_aw_var] = (res[analysis_end_date_var] - res[rd_start_date_var]).dt.days
    # if not paused
    # if start date defined, <= analysis window start and interview date within analysis window
    res.loc[(res[paused_flag_in_aw_var]== 0) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (res[rd_start_date_var] <= analysis_date_start) &
            (pd.isna(res[rd_interview_date_var]) == 0) & 
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end),
            days_before_streatment_start_in_aw_var] = (res[rd_interview_date_var] - res[analysis_start_date_var]).dt.days
    # if not paused
    # if start date defined, <= analysis window start and interview date >= analysis window
    res.loc[(res[paused_flag_in_aw_var]== 0) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (res[rd_start_date_var] <= analysis_date_start) &
            (pd.isna(res[rd_interview_date_var]) == 0) & 
            (res[rd_interview_date_var] >= analysis_date_end),
            days_before_streatment_start_in_aw_var] = (res[analysis_end_date_var] - res[analysis_start_date_var]).dt.days
    # if paused
    # if start date <= pause date <= analysis start <= analysis end  <= resume date <= interview date
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (res[rd_pause_date_var] <= analysis_date_start) &
            (res[rd_resume_date_var] >= analysis_date_end),
            days_before_streatment_start_in_aw_var] = 0
    # if paused
    # if start date <= pause date <= analysis start <= analysis end , resume date not defined
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 1) &
            (res[rd_pause_date_var] <= analysis_date_start),
            days_before_streatment_start_in_aw_var] = 0
    
    # if paused
    # if start date <= analysis start  <= pause date <= analysis end <= resume date <= interview date 
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (res[rd_start_date_var] <= analysis_date_start) &
            (analysis_date_start <= res[rd_pause_date_var]) &
            (analysis_date_end <= res[rd_resume_date_var]),
            days_before_streatment_start_in_aw_var] = (res[rd_pause_date_var] - res[analysis_start_date_var]).dt.days
    # if paused
    # if start date <= analysis start  <= pause date <= analysis end, resume date not defined, interview date defined
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 1) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (res[rd_start_date_var] <= analysis_date_start) &
            (analysis_date_start <= res[rd_pause_date_var]),
            days_before_streatment_start_in_aw_var] = (res[rd_pause_date_var] - res[analysis_start_date_var]).dt.days
    # if paused
    # if analysis start <= start date <= pause date <= analysis end <= resume date <= interview date 
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (analysis_date_start <= res[rd_start_date_var]) &
            (res[rd_start_date_var] <= res[rd_pause_date_var] ) &
            (res[rd_pause_date_var] <= analysis_date_end) &
            (analysis_date_end <= res[rd_resume_date_var]),
            days_before_streatment_start_in_aw_var] = (res[rd_pause_date_var] - res[rd_start_date_var]).dt.days
    # if paused
    # if  analysis start <= start date <= pause date <= analysis end, resume date not defined, interview date defined
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 1) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (analysis_date_start <= res[rd_start_date_var]) &
            (res[rd_start_date_var] <= res[rd_pause_date_var] ) &
            (res[rd_pause_date_var] <= analysis_date_end),
            days_before_streatment_start_in_aw_var] = (res[rd_pause_date_var] - res[rd_start_date_var]).dt.days
    # if paused
    # if  analysis start <= start date <= pause date  <= resume date <= analysis end <= interview date 
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (analysis_date_start <= res[rd_start_date_var]) &
            (res[rd_start_date_var] <= res[rd_pause_date_var] ) &
            (res[rd_resume_date_var] <= analysis_date_end) &
            (analysis_date_end <= res[rd_interview_date_var]),
            days_before_streatment_start_in_aw_var] = (res[rd_pause_date_var] - res[rd_start_date_var]).dt.days +\
                                                      (res[analysis_end_date_var] - res[rd_resume_date_var]).dt.days  
    # if paused
    # if  analysis start <= start date <= pause date  <= resume date <= analysis end, interview date not defined
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 1) &
            (analysis_date_start <= res[rd_start_date_var]) &
            (res[rd_start_date_var] <= res[rd_pause_date_var] ) &
            (res[rd_resume_date_var] <= analysis_date_end),
            days_before_streatment_start_in_aw_var] = (res[rd_pause_date_var] - res[rd_start_date_var]).dt.days +\
                                                      (res[analysis_end_date_var] - res[rd_resume_date_var]).dt.days
    # if paused
    # if analysis start <= start date <= pause date  <= resume date <= interview date <= analysis end
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (analysis_date_start <= res[rd_start_date_var]) &
            (res[rd_start_date_var] <= res[rd_pause_date_var] ) &
            (res[rd_resume_date_var] <=  res[rd_interview_date_var]) &
            (res[rd_interview_date_var] <= analysis_date_end),
            days_before_streatment_start_in_aw_var] = (res[rd_pause_date_var] - res[rd_start_date_var]).dt.days +\
                                                      (res[rd_interview_date_var] - res[rd_resume_date_var]).dt.days  
    # if paused
    # if analysis start <= resume date <= start date  <= interview date <= analysis end
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (analysis_date_start <= res[rd_resume_date_var]) &
            (res[rd_resume_date_var] <= res[rd_start_date_var]) &
            (res[rd_interview_date_var] <= analysis_date_end),
            days_before_streatment_start_in_aw_var] = (res[rd_interview_date_var] - res[rd_start_date_var]).dt.days
    # if paused
    # if analysis start <= resume date <= start date <= analysis end <= interview date
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (analysis_date_start <= res[rd_resume_date_var]) &
            (res[rd_resume_date_var] <= res[rd_start_date_var]) &
            (res[rd_start_date_var] <= analysis_date_end)  &
            (analysis_date_end <= res[rd_interview_date_var]),
            days_before_streatment_start_in_aw_var] = (res[analysis_end_date_var] - res[rd_start_date_var]).dt.days
    # if paused
    # if analysis start <= resume date <= start date <= analysis end,  interview date not defined
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 1) &
            (analysis_date_start <= res[rd_resume_date_var]) &
            (res[rd_resume_date_var] <= res[rd_start_date_var]) &
            (res[rd_start_date_var] <= analysis_date_end),
            days_before_streatment_start_in_aw_var] = (res[analysis_end_date_var] - res[rd_start_date_var]).dt.days
    # if paused
    # if analysis start <= start date <= interview date <= pause date <= analysis end
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (analysis_date_start <= res[rd_start_date_var]) &
            (res[rd_interview_date_var] <= res[rd_pause_date_var]) &
            (res[rd_pause_date_var] <= analysis_date_end),
            days_before_streatment_start_in_aw_var] = (res[rd_interview_date_var] - res[rd_start_date_var]).dt.days
    # if paused
    # if start date <= analysis start <= interview date <= pause date <= analysis end
    res.loc[(res[paused_flag_in_aw_var]== 1) &
            (pd.isna(res[rd_start_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (res[rd_start_date_var] <= analysis_date_start) &
            (res[rd_interview_date_var] <= res[rd_pause_date_var]) &
            (res[rd_pause_date_var] <= analysis_date_end),
            days_before_streatment_start_in_aw_var] = (res[rd_interview_date_var] - res[analysis_start_date_var]).dt.days
    

    # count days in treatment 
    res[days_in_treatment_in_aw_var] = 0
    # if not cancelled and not paused in analysis window,
    # if interview and end date defined and interview date >= analysis start and end date <= analysis end
    res.loc[(res[cancellation_flag_in_aw_var]== 0) &
            (res[paused_flag_in_aw_var]== 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (pd.isna(res[rd_end_date_var]) == 0) & 
            (res[rd_end_date_var] <= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[rd_end_date_var] - res[rd_interview_date_var]).dt.days
    # if not cancelled and not paused in analysis window,
    # if interview and end date defined and interview date in analysis window, end date >= analysis end
    res.loc[(res[cancellation_flag_in_aw_var]== 0) &
            (res[paused_flag_in_aw_var]== 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end) &
            (pd.isna(res[rd_end_date_var]) == 0) & 
            (res[rd_end_date_var] >= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[analysis_end_date_var] - res[rd_interview_date_var]).dt.days
    # if not cancelled and not paused in analysis window,
    # if interview and end date defined and interview date <= analysis start and end date <= analysis end
    res.loc[(res[cancellation_flag_in_aw_var]== 0) &
            (res[paused_flag_in_aw_var]== 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (res[rd_interview_date_var] <= analysis_date_start) &
            (pd.isna(res[rd_end_date_var]) == 0) & 
            (res[rd_end_date_var] <= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[rd_end_date_var] - res[analysis_start_date_var]).dt.days
    # if not cancelled and not paused in analysis window,
    # if interview and end date defined and interview date <= analysis start and end date >= analysis end
    res.loc[(res[cancellation_flag_in_aw_var]== 0) &
            (res[paused_flag_in_aw_var]== 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (res[rd_interview_date_var] <= analysis_date_start) &
            (pd.isna(res[rd_end_date_var]) == 0) & 
            (res[rd_end_date_var] >= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[analysis_end_date_var] - res[analysis_start_date_var]).dt.days
    # if not cancelled and not paused in analysis window, 
    # if interview date in analysis window, end date not defined
    res.loc[(res[cancellation_flag_in_aw_var]== 0) &
            (res[paused_flag_in_aw_var]== 0) &
            (pd.isna(rd_interview_date_var) == 0) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end) &
            (pd.isna(rd_end_date_var)== 1),
           days_in_treatment_in_aw_var] = (res[analysis_end_date_var] - res[rd_interview_date_var]).dt.days
    # if not cancelled and not paused in analysis window, 
    # if interview date <= analysis window, end date not defined
    res.loc[(res[cancellation_flag_in_aw_var]== 0) &
            (res[paused_flag_in_aw_var]== 0) &
            (pd.isna(rd_interview_date_var) == 0) &
            (res[rd_interview_date_var] <= analysis_date_start) &
            (pd.isna(rd_end_date_var)== 1),
           days_in_treatment_in_aw_var] = (res[analysis_end_date_var] - res[analysis_start_date_var]).dt.days
    # if cancelled and not paused in analysis window and cancel date not defined
    res.loc[(res[cancellation_flag_in_aw_var] == 1) &
            (res[paused_flag_in_aw_var] == 0) &
            (pd.isna(res[rd_cancellation_date_var]) == 1),
           days_in_treatment_in_aw_var] = 0
    # if cancelled and not paused in analysis window and cancel date defined, interview date <= analysis start
    res.loc[(res[cancellation_flag_in_aw_var] == 1) &
            (res[paused_flag_in_aw_var]== 0) &
            (pd.isna(res[rd_cancellation_date_var]) == 0) &
            (res[rd_interview_date_var] <= analysis_date_start),
            days_in_treatment_in_aw_var] = (res[rd_cancellation_date_var] - res[analysis_start_date_var]).dt.days
    # if cancelled and not paused in analysis window and cancel date defined, interview date in analysis window
    res.loc[(res[cancellation_flag_in_aw_var] == 1) &
            (res[paused_flag_in_aw_var] == 0) &
            (pd.isna(res[rd_cancellation_date_var]) == 0) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[rd_cancellation_date_var] - res[rd_interview_date_var]).dt.days
    # if paused and not cancelled in analysis window, paused & resume date defined, 
    #    paused date <= analysis start, resumed date >= analysis end
    res.loc[(res[cancellation_flag_in_aw_var]== 0) &
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (res[rd_pause_date_var] <= analysis_date_start) &
            (res[rd_resume_date_var] >= analysis_date_end),
            days_in_treatment_in_aw_var] = 0
    # if paused and not cancelled in analysis window, paused date defined, resumed date not defined, 
    #    paused date <= analysis start
    res.loc[(res[cancellation_flag_in_aw_var]== 0) & 
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 1) &
            (res[rd_pause_date_var] <= analysis_date_start),
            days_in_treatment_in_aw_var] = 0
    # if paused and not cancelled in analysis window, paused & resume date defined, 
    #    analysis start <= interview date <= paused date <= analysis end <= resumed date
    res.loc[(res[cancellation_flag_in_aw_var]== 0) & 
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end) &
            (res[rd_interview_date_var] <= res[rd_pause_date_var]) &
            (res[rd_resume_date_var] >= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[rd_pause_date_var] - res[rd_interview_date_var]).dt.days
    # if paused and not cancelled in analysis window, paused date defined, resume date not defined
    #    analysis start <= interview date <= paused date <= analysis end
    res.loc[(res[cancellation_flag_in_aw_var]== 0) & 
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 1) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end) &
            (res[rd_interview_date_var] <= res[rd_pause_date_var]),
            days_in_treatment_in_aw_var] = (res[rd_pause_date_var] - res[rd_interview_date_var]).dt.days
    # if paused and not cancelled in analysis window, paused & resume date defined, 
    #    analysis start <= interview date <= paused date <= resumed date <= date end <= analysis end
    res.loc[(res[cancellation_flag_in_aw_var]== 0) &
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (pd.isna(res[rd_end_date_var]) == 0) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end) &
            (res[rd_interview_date_var] <= res[rd_pause_date_var]) &
            (res[rd_resume_date_var] <= res[rd_end_date_var])&
            (res[rd_end_date_var] <= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[rd_pause_date_var] - res[rd_interview_date_var]).dt.days + \
                                           (res[rd_end_date_var] - res[rd_resume_date_var]).dt.days
    # if paused and not cancelled in analysis window, paused & resume date defined, 
    #    analysis start <= interview date <= paused date <= resumed date <= analysis end <= date end 
    res.loc[(res[cancellation_flag_in_aw_var]== 0) & 
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (pd.isna(res[rd_end_date_var]) == 0) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end) &
            (res[rd_interview_date_var] <= res[rd_pause_date_var]) &
            (res[rd_resume_date_var] <= analysis_date_end) &
            (res[rd_end_date_var] >= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[rd_pause_date_var] - res[rd_interview_date_var]).dt.days + \
                                           (res[analysis_end_date_var] - res[rd_resume_date_var]).dt.days
    # if paused and not cancelled in analysis window, paused & resume date defined, 
    #    analysis start <= interview date <= paused date <= resumed date <= analysis end,  date end undefined
    res.loc[(res[cancellation_flag_in_aw_var]== 0) &
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (pd.isna(res[rd_end_date_var]) == 1) &
            (res[rd_interview_date_var] >= analysis_date_start) &
            (res[rd_interview_date_var] <= analysis_date_end) &
            (res[rd_interview_date_var] <= res[rd_pause_date_var]) &
            (res[rd_resume_date_var] <= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[rd_pause_date_var] - res[rd_interview_date_var]).dt.days + \
                                           (res[analysis_end_date_var] - res[rd_resume_date_var]).dt.days
    # if paused and not cancelled in analysis window, paused & resume date defined, 
    #    interview date <= analysis start <=  paused date <= analysis end <= resumed date
    res.loc[(res[cancellation_flag_in_aw_var]== 0) & 
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (res[rd_interview_date_var] <= analysis_date_start) &
            (analysis_date_start <= res[rd_pause_date_var]) &
            (res[rd_pause_date_var] <= analysis_date_end) &
            (analysis_date_end <= res[rd_resume_date_var]),
            days_in_treatment_in_aw_var] = (res[rd_pause_date_var] - res[analysis_start_date_var]).dt.days
    # if paused and not cancelled in analysis window, paused date defined, resumed date not defined
    #    interview date <= analysis start <=  paused date <= analysis end 
    res.loc[(res[cancellation_flag_in_aw_var]== 0) & 
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 1) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (res[rd_interview_date_var] <= analysis_date_start) &
            (analysis_date_start <= res[rd_pause_date_var]) &
            (res[rd_pause_date_var] <= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[rd_pause_date_var] - res[analysis_start_date_var]).dt.days
    # if paused and not cancelled in analysis window
    #    interview date <= analysis start <=  paused date <= resumed date <= end date <= analysis end
    res.loc[(res[cancellation_flag_in_aw_var]== 0) & 
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (pd.isna(res[rd_end_date_var]) == 0) &
            (res[rd_interview_date_var] <= analysis_date_start) &
            (analysis_date_start <= res[rd_pause_date_var]) &
            (res[rd_resume_date_var] <= res[rd_end_date_var]) &
            (res[rd_end_date_var] <= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[rd_pause_date_var] - res[analysis_start_date_var]).dt.days + \
                                           (res[rd_end_date_var] - res[rd_resume_date_var]).dt.days
    # if paused and not cancelled in analysis window
    #    interview date <= analysis start <=  paused date <= resumed date <= analysis end <= end date 
    res.loc[(res[cancellation_flag_in_aw_var]== 0) & 
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (pd.isna(res[rd_end_date_var]) == 0) &
            (res[rd_interview_date_var] <= analysis_date_start) &
            (analysis_date_start <= res[rd_pause_date_var]) &
            (res[rd_resume_date_var] <= analysis_date_end) &
            (analysis_date_end <= res[rd_end_date_var]),
            days_in_treatment_in_aw_var] = (res[rd_pause_date_var] - res[analysis_start_date_var]).dt.days + \
                                           (res[analysis_end_date_var] - res[rd_resume_date_var]).dt.days
    # if paused and not cancelled in analysis window
    #    interview date <= analysis start <=  paused date <= resumed date <= analysis end, end date undefined
    res.loc[(res[cancellation_flag_in_aw_var]== 0) & 
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_pause_date_var]) == 0) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (pd.isna(res[rd_end_date_var]) == 1) &
            (res[rd_interview_date_var] <= analysis_date_start) &
            (analysis_date_start <= res[rd_pause_date_var]) &
            (res[rd_resume_date_var] <= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[rd_pause_date_var] - res[analysis_start_date_var]).dt.days + \
                                           (res[analysis_end_date_var] - res[rd_resume_date_var]).dt.days
    # if paused and not cancelled in analysis window
    #    analysis start <= resumed date <= interview date <= end date <= analysis end
    res.loc[(res[cancellation_flag_in_aw_var]== 0) & 
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (pd.isna(res[rd_end_date_var]) == 0) &
            (analysis_date_start <= res[rd_resume_date_var]) &
            (res[rd_resume_date_var] <= res[rd_interview_date_var]) &
            (res[rd_interview_date_var] <= res[rd_end_date_var]) &
            (res[rd_end_date_var] <= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[rd_end_date_var] - res[rd_interview_date_var]).dt.days
    # if paused and not cancelled in analysis window
    #    analysis start <= resumed date <= interview date  <= analysis end <= end date
    res.loc[(res[cancellation_flag_in_aw_var]== 0) & 
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (pd.isna(res[rd_end_date_var]) == 0) &
            (analysis_date_start <= res[rd_resume_date_var]) &
            (res[rd_resume_date_var] <= res[rd_interview_date_var]) &
            (res[rd_interview_date_var] <= analysis_date_end) &
            (analysis_date_end <= res[rd_end_date_var]),
            days_in_treatment_in_aw_var] = (res[analysis_end_date_var] - res[rd_interview_date_var]).dt.days
    # if paused and not cancelled in analysis window
    #    analysis start <= resumed date <= interview date  <= analysis end, end date not defined
    res.loc[(res[cancellation_flag_in_aw_var]== 0) & 
            (res[paused_flag_in_aw_var] == 1) &
            (pd.isna(res[rd_resume_date_var]) == 0) &
            (pd.isna(res[rd_interview_date_var]) == 0) &
            (pd.isna(res[rd_end_date_var]) == 1) &
            (analysis_date_start <= res[rd_resume_date_var]) &
            (res[rd_resume_date_var] <= res[rd_interview_date_var]) &
            (res[rd_interview_date_var] <= analysis_date_end),
            days_in_treatment_in_aw_var] = (res[analysis_end_date_var] - res[rd_interview_date_var]).dt.days
    # if paused and cancelled in analysis window
                                                
    
    return res

def impute_end_phase1_jp(df):
    """Return a copy of ``df`` with an imputed end date for JobPath phase 1.

    The column named by ``imputed_ph1_end_date_var`` is filled by the rules
    below. They are applied in this order, so a later rule overwrites an
    earlier one for any row it matches:

    1. default: interview date (start of phase 1) plus
       ``imputed_ph1_end_date_offset_months`` months; a missing interview
       date yields a missing imputed end date;
    2. cancellation date present: the cancellation date;
    3. pause date present: the pause date;
    4. resume date present: resume date plus the same month offset, minus
       the time that elapsed between the interview and the pause (missing
       when the pause date is missing).

    The input frame is not modified.
    """
    res = df.copy()
    phase1_length = DateOffset(months=imputed_ph1_end_date_offset_months)

    # Rule 1: default value (NULL/NaT interview dates simply stay NaT).
    res[imputed_ph1_end_date_var] = res[rd_interview_date_var] + phase1_length

    # Rule 2: a recorded cancellation ends the episode.
    has_cancellation = res[rd_cancellation_date_var].notna()
    res.loc[has_cancellation, imputed_ph1_end_date_var] = \
        res.loc[has_cancellation, rd_cancellation_date_var]

    # Rule 3: a recorded pause ends the episode (unless resumed, see rule 4).
    has_pause = res[rd_pause_date_var].notna()
    res.loc[has_pause, imputed_ph1_end_date_var] = \
        res.loc[has_pause, rd_pause_date_var]

    # Rule 4: after a resumption the remaining treatment time is the full
    # phase length minus what was already served before the pause.
    has_resume = res[rd_resume_date_var].notna()
    resumed_rows = res.loc[has_resume]
    served_before_pause = resumed_rows[rd_pause_date_var] - resumed_rows[rd_interview_date_var]
    res.loc[has_resume, imputed_ph1_end_date_var] = \
        resumed_rows[rd_resume_date_var] + phase1_length - served_before_pause

    return res


def summary_post_processing(df):
    """Return a copy of ``df`` with a single text category per row.

    The column named by ``jobpath_category_in_aw_var`` starts as an empty
    string and is then overwritten by every rule whose flag column equals 1
    (or, for the "no jp" rule, whose two flag columns both equal 0). Rules
    run in the order listed, so the last matching rule determines the final
    label. Rows matching no rule keep the empty string.
    """
    res = df.copy()
    res[jobpath_category_in_aw_var] = ''

    # (flag column, label) pairs applied before the "neither" rule.
    leading_rules = [
        (completed_ph1_jp_before_aw_var, 'completed jp before aw'),
        (started_jp_after_aw_var, 'started jp after aw'),
    ]
    # (flag column, label) pairs applied after it; order is significant.
    # NOTE(review): the second label below says "started before aw" although
    # its flag name says "started_in_aw" - confirm which one is intended.
    trailing_rules = [
        (jp_started_before_aw_completed_ph1_in_aw_var, 'started before aw completed during aw'),
        (jp_started_in_aw_completed_ph1_after_aw_var, 'started before aw finished after aw'),
        (jp_ph1_completed_in_aw_firstQ_year2_var, 'Q1 Y2 complete'),
        (jp_ph1_completed_in_aw_secondQ_year2_var, 'Q2 Y2 complete'),
        (jp_ph1_completed_in_aw_thirdQ_year2_var, 'Q3 Y2 complete'),
        (jp_ph1_completed_in_aw_fourthQ_year2_var, 'Q4 Y2 complete'),
        (jp_cancelled_before_start_in_aw_var, 'cancelled before start in aw'),
        (jp_cancelled_in_aw_start_before_aw_var, 'cancelled in aw started before aw'),
        (jp_cancelled_in_aw_start_in_aw_var, 'cancelled in aw started during aw'),
        (jp_cancelled_afer_aw_start_in_aw_var, 'cancelled post aw'),
    ]

    for flag_column, label in leading_rules:
        res.loc[res[flag_column] == 1, jobpath_category_in_aw_var] = label

    # Neither started after nor completed before the analysis window.
    no_jp_either_side = (res[started_jp_after_aw_var] == 0) & \
                        (res[completed_ph1_jp_before_aw_var] == 0)
    res.loc[no_jp_either_side, jobpath_category_in_aw_var] = 'no jp before or after aw'

    for flag_column, label in trailing_rules:
        res.loc[res[flag_column] == 1, jobpath_category_in_aw_var] = label

    return res

# ---------------------------------------------------------------------------
# Step 1: load the referral CSV extracts, parse dates and drop duplicates.
# ---------------------------------------------------------------------------
start_time = time.time()
print ('Reading Referral Data and ETL')
# Read every file first and concatenate once: growing a frame with
# DataFrame.append inside a loop copies all rows on each pass and the method
# no longer exists in pandas >= 2.0.
referral_frames = [pd.read_csv(filename, low_memory=False)
                   for filename in referral_data_csvfilenames]
if referral_frames:
    referral_data = pd.concat(referral_frames, ignore_index=True)
else:
    # pd.concat rejects an empty list; keep the original empty-frame result.
    referral_data = pd.DataFrame()

referral_data.rename(columns={rd_uid_var: uid_var}, inplace=True)
for elem in rd_date_fields:
    # Unparseable dates become NaT instead of raising.
    referral_data[elem] = pd.to_datetime(referral_data[elem], errors ='coerce', format= rd_date_format)

print ('Removing Duplicates in Referral Data')
referral_data = remove_dups_in_referral_data(referral_data)

elapsed_time = time.time() - start_time
print ('Elapsed time: '+ time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

# ---------------------------------------------------------------------------
# Step 2: for each cluster table build the JobPath summary, join it with the
# cluster and flat JLD data, add per-period flags and export the result.
# ---------------------------------------------------------------------------
start_time = time.time()
if (len(clusters_sql_tables) != len(analysis_date_end_strings)) or \
   (len(clusters_sql_tables) != len(analysis_date_start_strings)) or \
   (len(clusters_sql_tables) != len(flat_jld_sql_tables)):
    print ('Mismatched input parameters!\nPlease Check: clusters_sql_tables, flat_jld_sql_tables, analysis_date_end_strings, analysis_date_start_strings')
else:
    # Text columns to clean, built once as a de-duplicated copy. Appending to
    # the shared rd_string_fields list inside the table loop made it grow on
    # every table (and on every re-run of this cell), so the same column was
    # cleaned repeatedly; on Python 3 the second pass stringifies the bytes
    # produced by the first pass and corrupts the values.
    summary_string_fields = []
    for field in list(rd_string_fields) + [jobpath_category_in_aw_var]:
        if field not in summary_string_fields:
            summary_string_fields.append(field)

    # Raw string avoids the invalid-escape warning of '\W+'; compiled once.
    non_word_run = re.compile(r'\W+')

    for i in range(len(clusters_sql_tables)):
        print('Processing table: %s' %(clusters_sql_tables[i]))

        analysis_date_start= dt.datetime.strptime(analysis_date_start_strings[i], analysis_dates_string_format)
        analysis_date_end = dt.datetime.strptime(analysis_date_end_strings[i], analysis_dates_string_format)
        print('\tAnalysis Start: %s' %(analysis_date_start.strftime('%d/%m/%Y')))
        print('\tAnalysis End: %s' %(analysis_date_end.strftime('%d/%m/%Y')))

        print ('\tCreating JP Summary')
        summary = impute_end_phase1_jp(referral_data)
        summary = summarise_data_in_aw(summary,analysis_date_start, analysis_date_end)
        summary = summarise_data_before_aw(summary, analysis_date_start)
        summary = summarise_data_after_aw(summary, analysis_date_end)
        summary = summary_post_processing(summary)

        # Format strings: collapse runs of non-word characters to one space,
        # then UTF-8 encode and strip.
        # NOTE(review): on Python 3 this leaves bytes objects in the column -
        # confirm the CSV/SQL consumers expect that.
        for elem in summary_string_fields:
            summary[elem] = summary[elem].astype(str).apply(
                lambda x: non_word_run.sub(' ', x).encode('utf-8').strip())

        print ('\tRead Cluster Data: %s' %(clusters_sql_tables[i]))
        clusters = read_data_from_sql(clusters_sql_tables[i])

        print ('\tLeft Join Cluster Data and Summary')
        summary = pd.merge(clusters, summary, on=uid_var, how='left')

        print('\tRead Flat JLD Data: %s' %(flat_jld_sql_tables[i]))
        fjld_data = read_data_from_sql(flat_jld_sql_tables[i])
        fjld_data = fjld_data[flat_jld_selected_variables]

        print ('\tLeft Join Summary Data and Flat JLD Data')
        summary = pd.merge(summary,fjld_data, on=uid_var, how='left')

        # Eligibility flag at the start of the analysis window (period 0).
        suffix = '_P0'
        summary[jp_prefix_elegible_var+suffix] = 0
        summary.loc[summary[jp_eligible_target_var] >= jp_eligible_target_var_min_val,
                    jp_prefix_elegible_var+suffix] = 1

        for p in range(number_of_periods):
            print('\t\tProcessing period: %d' %(p+1))

            # Period p covers [period_start, period_end).
            period_start = analysis_date_start +DateOffset(months=periods_lenght_month*p)
            period_end = analysis_date_start +DateOffset(months=periods_lenght_month*(p+1))
            print('\t\tPeriod Start: %s' %(period_start.strftime('%d/%m/%Y')))
            print('\t\tPeriod End: %s' %(period_end.strftime('%d/%m/%Y')))

            suffix = '_P'+str(p+1)
            # Eligible once the target value plus the months elapsed by the
            # end of this period reaches the minimum.
            summary[jp_prefix_elegible_var+suffix] = 0
            summary.loc[ summary[jp_eligible_target_var] + (p+1)*periods_lenght_month >= jp_eligible_target_var_min_val,
                        jp_prefix_elegible_var+suffix] = 1

            # Started flag: the start date falls inside this period.
            summary[jp_prefx_started_var+suffix] = 0
            summary.loc[(summary[jp_prefx_started_target_var].notna()) &
                        (summary[jp_prefx_started_target_var] >= period_start) &
                        (summary[jp_prefx_started_target_var] < period_end),
                        jp_prefx_started_var+suffix ] = 1

            print('\t\tExtracting info from JLD: %s' %(jld_sql_table))
            summary = check_jld(summary, period_start, period_end, suffix)

            elapsed_time = time.time() - start_time
            print ('\t\tElapsed time: '+ time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

        if export_to_csv:
            print('\tExport to CSV')
            summary.to_csv(csv_path+clusters_sql_tables[i]+out_suffix+'.csv', index=False)
        if upload_to_sql:
            print('\tUpload to SQL')
            upload_df_to_sql(summary, clusters_sql_tables[i]+out_suffix)


print ('ALL DONE')
elapsed_time = time.time() - start_time
print ('Elapsed time: '+ time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

In [6]:
# Show the column labels of the most recently built `summary` frame.
summary.columns

Index(['cluster', 'ppsn', 'Amended Referral Status', 'Cancellationsubcategory',
       'Claim Office Code', 'Claim Office Name', 'Date of Cancellation',
       'Date of Interview', 'Date_paused', 'Dateresumed',
       ...
       'StartDate_P4', 'ada_code_rank_P4', 'family_flag_rank_P4',
       'hist_lls_P4', 'hist_lr_P4', 'lr_flag_P4', 'marital_status_rank_P4',
       'occupation_rank_P4', 'status_simple_P4', 'status_P4'],
      dtype='object', length=115)