In [None]:
import csv
import datetime
import pickle
import sys
import timeit

import pandas as pd

In [None]:
# CSV of flag-start events; loaded below with 'DateStartedFlagging' parsed as dates.
INPUT_IN_FILE = 'input/danWhyStopFlaggingIn.txt'
# CSV of flag-stop events; loaded below with 'DateStoppedFlagging' parsed as dates.
INPUT_OUT_FILE = 'input/danWhyStopFlaggingOut.txt'
# NOTE(review): not referenced by any cell visible in this notebook section — confirm it is still needed.
PREPROCESS_OUTPUT_FILE = 'preprocess_start.txt'
# NOTE(review): not referenced by any visible cell; presumably a train/test split fraction — confirm.
TEST_SIZE = 0.2

In [None]:
# Remember when this run began and print it next to the invoking program name.
# NOTE(review): under a Jupyter kernel sys.argv[0] is presumably the kernel
# launcher path rather than this notebook's name — confirm that label is intended.
script_start_time = datetime.datetime.now()
print('{} started at {}'.format(sys.argv[0], script_start_time))

In [None]:
# Load the flag-start and flag-stop tables, keeping only events dated strictly
# after 2009-03-30, and time how long the load takes.
print('Reading data...', end='')
start_time = timeit.default_timer()

# Flag starts: parse the start date column, then filter on it in one chain.
in_df = pd.read_csv(
    INPUT_IN_FILE,
    parse_dates=['DateStartedFlagging'],
).loc[lambda frame: frame.DateStartedFlagging > '2009-03-30', :]

# Flag stops: same treatment on the stop date column.
out_df = pd.read_csv(
    INPUT_OUT_FILE,
    parse_dates=['DateStoppedFlagging'],
).loc[lambda frame: frame.DateStoppedFlagging > '2009-03-30', :]

# Observation window: earliest retained start through latest retained stop.
FLAGGING_START_DATE = in_df.DateStartedFlagging.min()
FLAGGING_END_DATE = out_df.DateStoppedFlagging.max()

print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

In [None]:
def generate_flagging_periods(patid):
    """Build one row per flagging period for a single patient.

    Reads the module-level frames ``in_df`` (flag starts) and ``out_df``
    (flag stops) and the module-level ``FLAGGING_END_DATE``. Every start row
    belonging to ``patid`` is paired with the first stop row of that patient,
    in ``out_df`` row order, whose ``DateStoppedFlagging`` is strictly later
    than the start date.

    When no such stop row exists the period is reported as unresolved:
    ``ReasonStoppedFlagging`` is ``'NOT_RESOLVED'``, ``DateStoppedFlagging``
    is ``FLAGGING_END_DATE``, ``YearStoppedFlagging`` is the year of that
    date, and ``AgeAtStopFlagging`` is NaN because no stop was observed.
    (Previously these two fields were left unassigned, which either raised
    ``UnboundLocalError`` or silently reused the previous period's values.)

    Parameters
    ----------
    patid
        Value compared against the ``PatID`` column of both frames.

    Returns
    -------
    pandas.DataFrame
        Columns ``PatID``, ``DateStartedFlagging``, ``AgeAtFlagging``,
        ``DateStoppedFlagging``, ``AgeAtStopFlagging``,
        ``ReasonStoppedFlagging``, ``FlagCount``, ``YearStartedFlagging``,
        ``YearStoppedFlagging``; one row per start row of the patient, with
        a default integer index. ``FlagCount`` numbers the patient's periods
        from 1 in ``in_df`` row order. An empty frame with these columns is
        returned when the patient has no start rows.
    """
    columns = [
        'PatID',
        'DateStartedFlagging',
        'AgeAtFlagging',
        'DateStoppedFlagging',
        'AgeAtStopFlagging',
        'ReasonStoppedFlagging',
        'FlagCount',
        'YearStartedFlagging',
        'YearStoppedFlagging',
    ]

    patient_starts = in_df.loc[in_df.PatID == patid, :]
    patient_stops = out_df.loc[out_df.PatID == patid, :]
    # Stop date reported for periods that never receive a matching stop row.
    censor_date = pd.Timestamp(FLAGGING_END_DATE)

    records = []
    starts = zip(patient_starts.DateStartedFlagging, patient_starts.AgeAtFlagging)
    for flag_count, (start_date, start_age) in enumerate(starts, start=1):
        # Candidate stops are those strictly after this start; the first one
        # in row order wins (no sorting is applied here).
        later_stops = patient_stops.loc[patient_stops.DateStoppedFlagging > start_date, :]
        if later_stops.empty:
            stop_date = censor_date
            stop_age = float('nan')
            reason = 'NOT_RESOLVED'
        else:
            first_stop = later_stops.iloc[0]
            stop_date = first_stop['DateStoppedFlagging']
            stop_age = first_stop['AgeAtStopFlagging']
            reason = first_stop['ReasonStoppedFlagging']

        records.append({
            'PatID': patid,
            'DateStartedFlagging': start_date,
            'AgeAtFlagging': start_age,
            'DateStoppedFlagging': stop_date,
            'AgeAtStopFlagging': stop_age,
            'ReasonStoppedFlagging': reason,
            'FlagCount': flag_count,
            'YearStartedFlagging': start_date.year,
            # Always derived from the stop date actually reported in this row.
            'YearStoppedFlagging': stop_date.year,
        })

    # Build the frame once instead of concatenating one-row frames.
    return pd.DataFrame(records, columns=columns)

# Generate the flagging periods of every patient that has at least one start
# row, stack them, and index the result by patient and period boundaries.
print('Preprocessing patients...', end='')
start_time = timeit.default_timer()

in_out_df_list = []
for patid in in_df.PatID.unique():
    patient_periods = generate_flagging_periods(patid)
    in_out_df_list.append(patient_periods)

# Drop the per-patient row labels before promoting the three key columns
# to a MultiIndex.
in_out_df = (
    pd.concat(in_out_df_list)
    .reset_index(drop=True)
    .set_index(['PatID', 'DateStartedFlagging', 'DateStoppedFlagging'])
)

print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

In [None]:
# Render the periods ordered by their index; sort_index() returns a sorted
# copy, so in_out_df itself keeps its original row order.
# NOTE(review): `display` is not imported here — presumably the IPython
# builtin available inside a notebook kernel; confirm before running as a script.
display(in_out_df.sort_index())

In [None]:
# Persist the (unsorted) indexed periods frame as a pickle for later use.
# NOTE(review): assumes the 'output/' directory already exists — confirm,
# since nothing in the visible cells creates it.
in_out_df.to_pickle('output/generate-flagging-periods.pkl')