In [1]:
import pandas as pd

In [2]:
# Input paths: a pickled flagging-period DataFrame (loaded with pd.read_pickle)
# and two delimited text exports (loaded with pd.read_csv).
INPUT_FLAGGING_DF_FILE = 'output/generate-flagging-periods.pkl'
INPUT_RECORDS_FILE = 'input/danWhyStopFlaggingRecordsWithoutDuplicateGroupsOrCodes.txt'
INPUT_DEMOGRAPHICS_FILE = 'input/danWhyStopFlaggingDemographics.txt'

# Output path: where the DataFrame returned by generate_relevant_df is pickled.
OUTPUT_RELEVANT_DF_FILE = 'output/relevant_df.pkl'

# Margin subtracted from a period's start date and added to its stop date
# when selecting the records that belong to that flagging period.
WINDOW = pd.DateOffset(months=1)

In [3]:
# Flagging periods produced by an earlier step.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
flagging_df = pd.read_pickle(INPUT_FLAGGING_DF_FILE)

demographics_df = pd.read_csv(INPUT_DEMOGRAPHICS_FILE, index_col=['PatID'])
# Encode Sex numerically (F -> 0, M -> 1). The result is assigned back to the
# column instead of calling replace(..., inplace=True) on `demographics_df.Sex`:
# an in-place update of a Series obtained through attribute access is chained
# assignment, which warns on recent pandas and silently leaves the DataFrame
# untouched once Copy-on-Write is enabled.
demographics_df['Sex'] = demographics_df['Sex'].replace(['F', 'M'], [0, 1])

records_df = pd.read_csv(
    INPUT_RECORDS_FILE,
    index_col=['EntryDate'],
    parse_dates=['EntryDate'],
    encoding="ISO-8859-1",
)
# Keep only records entered after 2009-03-30.
# NOTE(review): the reason for this cutoff is not documented here - confirm.
records_df = records_df.loc[records_df.index > '2009-03-30', :]
RECORDS_START_DATE, RECORDS_END_DATE = records_df.index.min(), records_df.index.max()

In [4]:
def generate_relevant_df(all_flagging_df, all_records_df, window=None):
    """Collect, for each flagging period, the read codes recorded around it.

    For every row of ``all_flagging_df`` the records of the same patient whose
    ``EntryDate`` lies strictly between ``start - window`` and
    ``stop + window`` are selected, and their read codes are joined into one
    string by ``generate_readcode_text``. Periods without any matching record
    are skipped.

    Parameters
    ----------
    all_flagging_df : pandas.DataFrame
        One row per flagging period. Each row's index tuple is read
        positionally as (patient id, start date, stop date); the
        ``ReasonStoppedFlagging`` column is copied to the output.
    all_records_df : pandas.DataFrame
        Records with an index (level) named ``EntryDate`` and the columns
        ``PatID`` and ``ReadCode``.
    window : optional
        Offset subtracted from the start date and added to the stop date.
        Defaults to the module-level ``WINDOW`` when omitted.

    Returns
    -------
    pandas.DataFrame
        Columns ``PatID``, ``StartDate``, ``StopDate``, ``Reason`` and
        ``ReadCodeText``, one row per period that has at least one record.
        The frame is empty (same columns) when no period matches.
    """
    if window is None:
        window = WINDOW

    output_columns = ['PatID', 'StartDate', 'StopDate', 'Reason', 'ReadCodeText']

    # Use the frame passed in, not a notebook-level global, and hoist the
    # loop-invariant lookups out of the per-period loop.
    entry_dates = all_records_df.index.get_level_values('EntryDate')
    patient_ids = all_records_df.PatID

    rows = []
    for row in all_flagging_df.itertuples():
        pt, start_date, stop_date = row.Index[0], row.Index[1], row.Index[2]
        is_pt_and_relevant = (
            (patient_ids == pt)
            & (entry_dates > (start_date - window))
            & (entry_dates < (stop_date + window))
        )
        relevant_records_df = all_records_df.loc[is_pt_and_relevant, :]
        if relevant_records_df.empty:
            continue
        rows.append({
            'PatID': pt,
            'StartDate': start_date,
            'StopDate': stop_date,
            'Reason': row.ReasonStoppedFlagging,
            'ReadCodeText': generate_readcode_text(relevant_records_df),
        })

    # Build the frame once from plain records; this also yields an empty frame
    # with the expected columns instead of failing when nothing matched.
    return pd.DataFrame(rows, columns=output_columns)
        
def generate_readcode_text(relevant_records_df):
    """Return the ``ReadCode`` values of the given records as one space-separated string.

    The column is rendered with ``Series.to_string`` (no header, no index),
    giving one line per record; each line is stripped of the padding added by
    the renderer before the lines are joined with single spaces.
    """
    rendered = relevant_records_df.ReadCode.to_string(header=False, index=False)
    stripped_codes = map(str.strip, rendered.splitlines())
    return ' '.join(stripped_codes)

# Build the per-period read-code table from the flagging periods and records
# loaded earlier in the notebook.
relevant_df = generate_relevant_df(flagging_df, records_df)

In [5]:
# Persist the result as a pickle at OUTPUT_RELEVANT_DF_FILE.
relevant_df.to_pickle(OUTPUT_RELEVANT_DF_FILE)