In [None]:
from collections import defaultdict
import datetime
import pickle
import sys
import timeit

import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# --- Inputs -----------------------------------------------------------------
# Pickled flagging-period DataFrame (loaded with pd.read_pickle below).
INPUT_FLAGGING_DF_FILE = 'output/generate-flagging-periods.pkl'
# CSV of patient records; read with EntryDate as a parsed-date index.
INPUT_RECORDS_FILE = 'input/danWhyStopFlaggingRecordsWithoutDuplicateGroupsOrCodes.txt'
# CSV of patient demographics; read with PatID as the index.
INPUT_DEMOGRAPHICS_FILE = 'input/danWhyStopFlaggingDemographics.txt'

# --- Outputs (all written by the final cell) ---------------------------------
OUTPUT_TRAIN_DF_FILE = 'output/train-df.pkl'
OUTPUT_TEST_DF_FILE = 'output/test-df.pkl'
OUTPUT_TOK_FILE = 'output/tok.pkl'
OUTPUT_X_TRAIN_FILE = 'output/encoded-X-train.pkl'
OUTPUT_X_TEST_FILE = 'output/encoded-X-test.pkl'
OUTPUT_Y_TRAIN_FILE = 'output/encoded-y-train.pkl'
OUTPUT_Y_TEST_FILE = 'output/encoded-y-test.pkl'

# --- Tunables ------------------------------------------------------------------
# Margin subtracted from each flagging period's start date and added to its
# stop date when selecting the records that belong to the period.
WINDOW = pd.DateOffset(months=1)
# Fraction of rows held out for testing (passed to train_test_split).
TEST_SIZE = 0.2
# Length every Read-code sequence is padded/truncated to (pad_sequences maxlen).
TIMESTEPS = 200

In [None]:
# Remember when the run began and announce it.
# NOTE(review): inside a notebook kernel sys.argv[0] is usually the kernel
# launcher path rather than this notebook's name -- confirm that is acceptable.
script_start_time = datetime.datetime.now()
print('{} started at {}'.format(sys.argv[0], script_start_time))

In [None]:
print('Reading data...', end='')
start_time = timeit.default_timer()

# NOTE(review): unpickling can execute arbitrary code. This file is presumably
# produced by our own upstream flagging-period step -- never point it at an
# untrusted file.
flagging_df = pd.read_pickle(INPUT_FLAGGING_DF_FILE)

demographics_df = pd.read_csv(INPUT_DEMOGRAPHICS_FILE, index_col=['PatID'])
# Encode sex numerically (F -> 0, M -> 1). The result is assigned back to the
# column instead of calling replace(inplace=True) on `demographics_df.Sex`:
# an in-place call on an attribute-accessed column operates on a temporary
# under pandas Copy-on-Write and would silently leave the frame unchanged.
demographics_df['Sex'] = demographics_df['Sex'].replace(['F', 'M'], [0, 1])

records_df = pd.read_csv(
    INPUT_RECORDS_FILE,
    index_col=['EntryDate'],
    parse_dates=['EntryDate'],
    encoding="ISO-8859-1",
)
# Keep only records entered after the cutoff date.
# NOTE(review): the reason for 2009-03-30 is not visible here -- confirm and
# consider promoting it to a named constant in the configuration cell.
records_df = records_df.loc[records_df.index > '2009-03-30', :]
RECORDS_START_DATE, RECORDS_END_DATE = records_df.index.min(), records_df.index.max()

print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

In [None]:
def generate_relevant_df(all_flagging_df, all_records_df, window=None):
    """Collect, per flagging period, the patient's Read codes around that period.

    Parameters
    ----------
    all_flagging_df : pandas.DataFrame
        One row per flagging period. Each row's index entry must provide the
        patient id, start date and stop date at positions 0, 1 and 2, and the
        frame must have a ``ReasonStoppedFlagging`` column.
    all_records_df : pandas.DataFrame
        Patient records with an index level named ``EntryDate`` and the
        columns ``PatID`` and ``ReadCode``.
    window : date offset, optional
        Margin subtracted from the start date and added to the stop date when
        selecting records (both bounds exclusive). Defaults to the
        module-level ``WINDOW``.

    Returns
    -------
    pandas.DataFrame
        Columns ``PatID``, ``StartDate``, ``StopDate``, ``Reason`` and
        ``ReadCodeText``, one row per flagging period that has at least one
        matching record. Periods without records are skipped; if none match,
        an empty frame with these columns is returned.
    """
    if window is None:
        window = WINDOW
    columns = ['PatID', 'StartDate', 'StopDate', 'Reason', 'ReadCodeText']

    # Loop-invariant lookups. These must come from the frame that was passed
    # in, not from a notebook-level global, so the function honours its
    # arguments.
    entry_dates = all_records_df.index.get_level_values('EntryDate')
    patient_ids = all_records_df.PatID

    rows = []
    for row in all_flagging_df.itertuples():
        pt, start_date, stop_date = row.Index[0], row.Index[1], row.Index[2]
        in_window = (entry_dates > (start_date - window)) & (entry_dates < (stop_date + window))
        # Plain boolean array: positional selection, no index alignment needed.
        is_pt_and_relevant = (patient_ids == pt).values & in_window
        relevant_records_df = all_records_df.loc[is_pt_and_relevant, :]
        if relevant_records_df.empty:
            continue
        rows.append({
            'PatID': pt,
            'StartDate': start_date,
            'StopDate': stop_date,
            'Reason': row.ReasonStoppedFlagging,
            'ReadCodeText': generate_readcode_text(relevant_records_df),
        })

    # Build the frame once from plain records; this also yields a correctly
    # shaped empty frame when nothing matched (pd.concat([]) would raise).
    return pd.DataFrame(rows, columns=columns)
        
def generate_readcode_text(relevant_records_df):
    """Return the frame's ``ReadCode`` values as one space-separated string.

    The column is rendered with ``to_string`` (no header, no index), and each
    rendered line is stripped of its alignment padding before joining.
    """
    rendered = relevant_records_df.ReadCode.to_string(header=False, index=False)
    return ' '.join(line.strip() for line in rendered.splitlines())

In [None]:
# Build the per-flagging-period frame of Read-code text from the loaded
# flagging periods and patient records, reporting how long it took.
print('Generating relevant DataFrame...', end='')
start_time = timeit.default_timer()
relevant_df = generate_relevant_df(flagging_df, records_df)
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

In [None]:
print('Tokenising...', end='')
start_time = timeit.default_timer()

relevant_texts = relevant_df['ReadCodeText']

# Keep tokens exactly as written: no character filtering, no lower-casing.
tok = Tokenizer(filters='', lower=False)
tok.fit_on_texts(relevant_texts)
# One more than the number of distinct tokens (Keras word indices start at 1).
VOCAB_SIZE = 1 + len(tok.word_index)

# Text -> integer ids -> fixed-length rows of TIMESTEPS ids.
relevant_sequences = tok.texts_to_sequences(relevant_texts)
padded_sequences = pad_sequences(relevant_sequences, maxlen=TIMESTEPS)
# Store one padded row per DataFrame row.
relevant_df['ReadCodeSequence'] = list(padded_sequences)

print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

In [None]:
print('Creating training and testing sets...', end='')
start_time = timeit.default_timer()

# Separate features from the 'Reason' label WITHOUT mutating relevant_df.
# Popping the column in place would make this cell non-idempotent: a second
# run would fail with KeyError because 'Reason' is already gone.
train_df, test_df, y_train, y_test = train_test_split(
    relevant_df.drop(columns=['Reason']),
    relevant_df['Reason'],
    test_size=TEST_SIZE,
    random_state=1337,  # fixed seed so the split is reproducible
)

print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

In [None]:
print('Preparing data...', end='')
start_time = timeit.default_timer()

# Stack the per-row id sequences of each split into a single NumPy array.
X_train = np.array(train_df['ReadCodeSequence'].tolist())
X_test = np.array(test_df['ReadCodeSequence'].tolist())

print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

In [None]:
# Show the held-out labels as this cell's output.
# NOTE(review): this renders label values into the saved notebook; consider
# y_test.head() or y_test.value_counts() if the full Series is not needed.
y_test

In [None]:
# Fitted tokenizer, pickled with the highest available protocol.
with open(OUTPUT_TOK_FILE, 'wb') as handle:
    pickle.dump(tok, handle, pickle.HIGHEST_PROTOCOL)

# Training split: source rows, encoded inputs, labels.
train_df.to_pickle(OUTPUT_TRAIN_DF_FILE)
X_train.dump(OUTPUT_X_TRAIN_FILE)
y_train.to_pickle(OUTPUT_Y_TRAIN_FILE)

# Test split: source rows, encoded inputs, labels.
test_df.to_pickle(OUTPUT_TEST_DF_FILE)
X_test.dump(OUTPUT_X_TEST_FILE)
y_test.to_pickle(OUTPUT_Y_TEST_FILE)