In [None]:
import csv
import datetime
import pickle
import sys
import timeit

import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Fraction of unique patients passed to train_test_split as the test share
TEST_SIZE = 0.2

# CSV extracts loaded with pd.read_csv further down:
# rows from the "in" file end up labelled True, rows from the "out" file False
INPUT_IN_FILE = 'input/danExtractIn.txt'
INPUT_OUT_FILE = 'input/danExtractOut.txt'

In [None]:
# Capture the wall-clock start of the run and announce it together with
# whatever the interpreter reports as the program name (sys.argv[0])
script_start_time = datetime.datetime.now()
start_banner = '{} started at {}'.format(sys.argv[0], script_start_time)
print(start_banner)

In [None]:
# Read data
# Both extracts are CSV files; EntryDate is parsed as a date and used as the index.
print('Reading data...', end='')
start_time = timeit.default_timer()
in_df = pd.read_csv(INPUT_IN_FILE, index_col=['EntryDate'], parse_dates=['EntryDate'])
out_df = pd.read_csv(INPUT_OUT_FILE, index_col=['EntryDate'], parse_dates=['EntryDate'])
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

# Flag every recorded code as present: CodeValue is overwritten with True on
# every row, regardless of the value it held in the file.
# Bracket assignment is used so that a column is always written (attribute-style
# assignment would silently set a plain attribute if the column were absent).
in_df['CodeValue'] = True
out_df['CodeValue'] = True

# Remove biochemistry Read codes (those starting with '4'), making it more suitable for Apriori.
# na=False treats a missing ReadCode as "does not start with '4'"; without it the
# mask contains NaN and the `~` negation raises a TypeError.
in_df = in_df.loc[~in_df['ReadCode'].str.startswith('4', na=False)]
out_df = out_df.loc[~out_df['ReadCode'].str.startswith('4', na=False)]

In [None]:
# Pivot the tables of the main DataFrames so that Read codes are on the x-axis and each row represents a patient record
print('Pivoting table and filling missing values...', end='')
start_time = timeit.default_timer()
# pivot_table aggregates with the mean by default, so a (PatID, ReadCode) cell is
# 1.0 where the patient has that code and NaN where they do not.
in_pivot_df = in_df.pivot_table(index='PatID', columns='ReadCode', values='CodeValue')
out_pivot_df = out_df.pivot_table(index='PatID', columns='ReadCode', values='CodeValue')

# Add label column
in_pivot_df['label'] = True
out_pivot_df['label'] = False

# Move PatID out of the index into a regular column so it survives the concat below
in_pivot_df = in_pivot_df.reset_index()
out_pivot_df = out_pivot_df.reset_index()

# Merge the in and out DataFrames (codes seen in only one of them become NaN in the other's rows)
merged_df = pd.concat([in_pivot_df, out_pivot_df])

# Replace the concatenated index with a fresh 0..n-1 counter named ID, then add PatID as a second index level
merged_df = merged_df.reset_index(drop=True)
merged_df.index.names = ['ID']
merged_df = merged_df.set_index('PatID', append=True)

# Shuffle the rows; the seed is fixed so that re-running the notebook gives the same order
SHUFFLE_SEED = 42
merged_df = merged_df.sample(frac=1, random_state=SHUFFLE_SEED)

# Replace NaN values with False and cast everything to bool.
# Without the cast the feature columns would hold a mix of 1.0 and False
# rather than genuine booleans.
merged_df = merged_df.fillna(value=False).astype(bool)

print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

# Print some stats
print('DataFrame info:')
merged_df.info()
print('In DataFrame shape: {} rows, {} columns'.format(*in_pivot_df.shape))
print('Out DataFrame shape: {} rows, {} columns'.format(*out_pivot_df.shape))

In [None]:
# Get a list of all patients for train-test splitting
print('Performing train-test splitting...', end='')
start_time = timeit.default_timer()
# TODO: Once we've merged DataFrames there will be duplicate PatIDs, so do the train-test split before this happens
# The split is done on unique PatIDs, so every row belonging to a given patient
# lands on the same side of the split.
pat_ids = merged_df.index.get_level_values('PatID')
all_pts = pd.unique(pat_ids.values)

# Split patients into train and test; TEST_SIZE is the test share.
# The seed is fixed so the pickled splits are reproducible between runs.
SPLIT_SEED = 42
train_pts, test_pts = train_test_split(all_pts, test_size=TEST_SIZE, random_state=SPLIT_SEED)

is_train_pt = pat_ids.isin(train_pts)
is_test_pt = pat_ids.isin(test_pts)

# Sanity check: every row must belong to exactly one of the two splits
assert (is_train_pt ^ is_test_pt).all(), 'train/test masks do not partition the rows'

X_train = merged_df[is_train_pt]
X_test = merged_df[is_test_pt]

# Split features and labels (pop removes 'label' from X_* and returns it)
y_train, y_test = X_train.pop('label'), X_test.pop('label')
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

In [None]:
# Persist the four splits as pickles; the paths are bare filenames, so they are
# resolved against the current working directory. Features are written first, then labels.
for split_obj, pickle_path in (
    (X_train, 'X_train_without_biochem.pkl'),
    (X_test, 'X_test_without_biochem.pkl'),
    (y_train, 'y_train_without_biochem.pkl'),
    (y_test, 'y_test_without_biochem.pkl'),
):
    split_obj.to_pickle(pickle_path)