# Preprocessing the i2b2 data without any blinding

In [76]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [233]:
%autoreload

import os, pandas as pd, numpy as np
import pickle
import re
import sys
import glob
from tqdm import tqdm
import ast
sys.path.append('../../../')
from relation_extraction.data import utils
import nltk
from ast import literal_eval
import itertools
# Root directory of the i2b2 resources. The original absolute path stays as the
# default so existing setups keep working; set the I2B2_RESOURCE_PATH environment
# variable to run this notebook against a copy of the data stored elsewhere.
RESOURCE_PATH = os.environ.get(
    "I2B2_RESOURCE_PATH", "/data/medg/misc/geeticka/relation_extraction/i2b2"
)
# Output folder for the files written by this notebook; always resolved through res().
outdir = 'pre-processed/original/'


def res(path):
    """Return `path` joined onto RESOURCE_PATH.

    Args:
        path: A path (file or directory) relative to the resource root.

    Returns:
        The result of os.path.join(RESOURCE_PATH, path).
    """
    return os.path.join(RESOURCE_PATH, path)
from relation_extraction.data.converters.converter_i2b2 import get_filename_with_extension, \
get_filename_without_extension, get_concept_dictionary, get_dataset_dataframe, write_dataframe, read_dataframe,\
check_equality_of_written_and_read_df, write_into_txt, combine

# Read through the data

In [206]:
# Locations of the dataset pieces, all resolved under RESOURCE_PATH via res().
# The trailing slashes matter: the loading cells build sub-folder paths by plain
# string concatenation (e.g. beth_training + "concept/").
beth_training = res('concept_assertion_relation_training_data/beth/')
partners_training = res('concept_assertion_relation_training_data/partners/')
test_reference = res('reference_standard_for_test_data/')
test = res('test_data/')

In [207]:
# Build the Beth training dataframe from its concept/, rel/ and txt/ sub-folders.
df_train_beth = get_dataset_dataframe(beth_training + "concept/", beth_training + 'rel/', beth_training + 'txt/')

100%|██████████| 73/73 [00:02<00:00, 24.35it/s]


In [208]:
# Build the Partners training dataframe from its concept/, rel/ and txt/ sub-folders.
df_train_partners = get_dataset_dataframe(partners_training + "concept/", partners_training + "rel/", partners_training + 'txt/')

100%|██████████| 97/97 [00:02<00:00, 38.94it/s]


In [209]:
# Build the test dataframe. The layout differs from the training folders: the
# reference standard uses 'concepts/' (plural, not 'concept/'), and the third
# argument is the test_data/ folder itself rather than a 'txt/' sub-folder.
df_test = get_dataset_dataframe(test_reference + "concepts/", test_reference + "rel/", test)

100%|██████████| 256/256 [00:09<00:00, 27.51it/s]


In [210]:
len(df_train_beth)

1973

In [211]:
len(df_train_partners)

1147

In [212]:
len(df_test)

6293

The combined train set (1973 + 1147 = 3120 examples) is only about half the size of the test set (6293 examples) - that may be a problem.

## Testing for empty entities

In [213]:
def get_empty_entity_rows(df):
    """Return the positional row numbers whose e1 or e2 has no word indices.

    Each entry of the 'metadata' column is expected to be a mapping with
    metadata['e1']['word_index'] and metadata['e2']['word_index']. A row is
    reported when either of those values is falsy (e.g. an empty list).

    Args:
        df: DataFrame with a 'metadata' column. It is not modified.

    Returns:
        A list of 0-based row positions (not index labels), suitable for
        use with df.iloc.
    """
    # Nothing to inspect in an empty frame; it yields an empty result.
    if len(df) == 0:
        return []

    empty_entity_rows = []
    # Enumerate the metadata column directly instead of copying the frame,
    # inserting a helper 'row_num' column and calling apply() for its side
    # effect. This also works when df already has a column named 'row_num'.
    for row_num, metadata in enumerate(df['metadata']):
        e1 = metadata['e1']['word_index']
        e2 = metadata['e2']['word_index']
        if not e1 or not e2:
            empty_entity_rows.append(row_num)
    return empty_entity_rows

def get_empty_rows_array(empty_entity_rows, df):
    """Collect the flagged rows of `df` as a list of records and as a DataFrame.

    Args:
        empty_entity_rows: Iterable of 0-based row positions into `df`.
        df: DataFrame with the columns e1, e2, original_sentence,
            tokenized_sentence and metadata.

    Returns:
        A tuple (records, frame). `records` is a list with one
        [position, original_sentence, e1, e2, metadata, tokenized_sentence]
        entry per requested position; `frame` holds the same records with
        the position stored under 'index_original'.
    """
    output_columns = ['index_original', 'original_sentence' , 'e1', 'e2', 'metadata', 'tokenized_sentence']
    records = []
    for position in empty_entity_rows:
        # Look the row up once and read every field from it.
        row = df.iloc[position]
        first_entity, second_entity = row.e1, row.e2
        sentence = row.original_sentence
        tokens = row.tokenized_sentence
        meta = row.metadata
        records.append([position, sentence, first_entity, second_entity, meta, tokens])
    summary = pd.DataFrame(data=records, columns=output_columns)
    return records, summary

In [214]:
def get_empty_vals(df):
    """Find the rows of `df` with an empty entity and return their details.

    Chains get_empty_entity_rows (which rows are affected) into
    get_empty_rows_array (what those rows contain).

    Returns:
        The (records, frame) tuple produced by get_empty_rows_array.
    """
    return get_empty_rows_array(get_empty_entity_rows(df), df)

In [215]:
get_empty_vals(df_train_beth)

([], Empty DataFrame
 Columns: [index_original, original_sentence, e1, e2, metadata, tokenized_sentence]
 Index: [])

In [216]:
get_empty_vals(df_train_partners)

([], Empty DataFrame
 Columns: [index_original, original_sentence, e1, e2, metadata, tokenized_sentence]
 Index: [])

In [217]:
get_empty_vals(df_test)

([], Empty DataFrame
 Columns: [index_original, original_sentence, e1, e2, metadata, tokenized_sentence]
 Index: [])

## Write into CSV format

In [218]:
df_train_beth.iloc[0].metadata

{'e1': {'word': 'chest x-ray', 'word_index': [('0', '1')]},
 'e2': {'word': 'left lower lobe infiltrate', 'word_index': [('7', '10')]},
 'entity_replacement': {'0:1': 'test', '7:10': 'problem'},
 'sentence_id': '16',
 'filename': 'record-18'}

In [219]:
# Create the output directory (and any missing parents). exist_ok=True keeps the
# cell safe to re-run and avoids the check-then-create race of os.path.exists().
os.makedirs(res(outdir), exist_ok=True)

In [220]:
write_dataframe(df_train_beth, res(outdir + 'train_beth_original.csv'))

In [221]:
df_train_beth_copy = read_dataframe(res(outdir + 'train_beth_original.csv'))

In [222]:
# The returned tuple holds two checks: the first uses the pandas .equals method, the second compares the frames manually, column by column
check_equality_of_written_and_read_df(df_train_beth, df_train_beth_copy)

(True, True)

In [223]:
write_dataframe(df_train_partners, res(outdir + 'train_partners_original.csv'))

In [224]:
df_train_partners_copy = read_dataframe(res(outdir + 'train_partners_original.csv'))

In [225]:
check_equality_of_written_and_read_df(df_train_partners, df_train_partners_copy)

(True, True)

In [226]:
write_dataframe(df_test, res(outdir + 'test_original.csv'))

In [227]:
df_test_copy = read_dataframe(res(outdir + 'test_original.csv'))

In [228]:
check_equality_of_written_and_read_df(df_test, df_test_copy)

(True, True)

## Write into txt format

In [229]:
write_into_txt(df_train_beth, res(outdir + 'train_beth_original.txt'))

Unique relations: 	 ['TeRP' 'TrAP' 'PIP' 'TrCP' 'TrWP' 'TrIP' 'TeCP' 'TrNAP']


In [230]:
write_into_txt(df_train_partners, res(outdir + 'train_partners_original.txt'))

Unique relations: 	 ['TeRP' 'TrAP' 'PIP' 'TeCP' 'TrCP' 'TrNAP' 'TrIP' 'TrWP']


In [231]:
write_into_txt(df_test, res(outdir + 'test_original.txt'))

Unique relations: 	 ['TrCP' 'TeRP' 'TrAP' 'PIP' 'TrWP' 'TrNAP' 'TrIP' 'TeCP']


## Combine the train data of beth and partners

In [234]:
combine(res, outdir, 'train_beth_original', 'train_partners_original', 'train_original.txt')