# Preprocessing the i2b2 data without any blinding

In [2]:
%load_ext autoreload

In [32]:
%autoreload

import os, pandas as pd, numpy as np
import pickle
import re
import sys
import glob
from tqdm import tqdm
import ast
from collections import Counter
sys.path.append('../../../')
from relation_extraction.data import utils
import nltk
from ast import literal_eval
import itertools
# Root directory that holds the i2b2 relation-extraction resources.
RESOURCE_PATH = "/data/medg/misc/geeticka/relation_extraction/i2b2"
# Output sub-directory (relative to RESOURCE_PATH) for the files written by this notebook.
outdir = 'pre-processed/original/'


def res(path):
    """Return ``path`` joined onto ``RESOURCE_PATH``."""
    resolved = os.path.join(RESOURCE_PATH, path)
    return resolved
from relation_extraction.data.converters.converter_i2b2 import (
    check_equality_of_written_and_read_df,
    combine,
    get_dataset_dataframe,
    read_dataframe,
    write_dataframe,
    write_into_txt,
)
# from relation_extraction.data.converters.converter_i2b2 import get_filename_with_extension, \
# get_filename_without_extension, get_concept_dictionary, get_dataset_dataframe_classification, write_dataframe, read_dataframe,\
# check_equality_of_written_and_read_df, write_into_txt, combine, get_line_number_and_word_number, read_rel_line,\
# get_entity_replacement_dictionary, get_dataset_dataframe_extraction

TODO: figure out how to handle the experiments for the extraction vs. classification cases. The eval script also needs to be updated at this point.

# Read through the data

In [305]:
# Directories of the raw i2b2 data, resolved against RESOURCE_PATH via res().
# Each path keeps its trailing slash because later cells append sub-directory
# names (e.g. "concept/", "rel/") by plain string concatenation.
beth_training = res('concept_assertion_relation_training_data/beth/')
partners_training = res('concept_assertion_relation_training_data/partners/')
test_reference = res('reference_standard_for_test_data/')
test = res('test_data/')

In [306]:
# Build the dataset dataframe for the beth training split from its
# concept/, rel/ and txt/ sub-directories.
df_train_beth = get_dataset_dataframe(beth_training + "concept/", beth_training + 'rel/', beth_training + 'txt/')

  4%|▍         | 3/73 [00:00<00:06, 11.11it/s]

Message from append_existing_relations(): The relation pair  {'85:8;85:9', '85:5;85:5'} is not present in the artificial relations pair and their respective types are  treatment treatment


 82%|████████▏ | 60/73 [00:02<00:00, 20.91it/s]

Message from append_existing_relations(): The relation pair  {'74:15;74:15', '74:0;74:0'} is not present in the artificial relations pair and their respective types are  test test


100%|██████████| 73/73 [00:03<00:00, 21.74it/s]


In [307]:
# Build the dataset dataframe for the partners training split from its
# concept/, rel/ and txt/ sub-directories.
df_train_partners = get_dataset_dataframe(partners_training + "concept/", partners_training + "rel/", partners_training + 'txt/')

100%|██████████| 97/97 [00:02<00:00, 37.19it/s]


In [308]:
# Build the dataset dataframe for the test split. The arguments differ from
# the training calls: the concept and relation annotations come from the
# reference-standard directory (note "concepts/", plural, versus "concept/"
# for training), and the third argument is the `test` directory itself
# rather than a "txt/" sub-directory.
df_test = get_dataset_dataframe(test_reference + "concepts/", test_reference + "rel/", test)

 87%|████████▋ | 223/256 [00:08<00:01, 25.10it/s]

Message from append_existing_relations(): The relation pair  {'158:0;158:4', '158:18;158:18'} is not present in the artificial relations pair and their respective types are  test treatment


100%|██████████| 256/256 [00:10<00:00, 25.39it/s]


In [309]:
len(df_train_beth)

5796

In [310]:
len(df_train_partners)

4435

In [311]:
len(df_test)

19114

The combined training set (5,796 + 4,435 = 10,231 rows) is significantly smaller than the test set (19,114 rows), which may be a problem.

## Testing for empty entities

In [312]:
def get_empty_entity_rows(df):
    """Return the positions of rows in which either entity has no word index.

    A row is reported when ``metadata['e1']['word_index']`` or
    ``metadata['e2']['word_index']`` is falsy (for example an empty list).

    Args:
        df: DataFrame with a ``metadata`` column whose values are
            dictionaries containing ``'e1'`` and ``'e2'`` entries, each with
            a ``'word_index'`` key.

    Returns:
        list[int]: 0-based row positions (suitable for ``df.iloc``), in
        ascending order. Empty when no row qualifies or ``df`` has no rows.
    """
    # A frame without rows has nothing to report, whatever its columns are.
    if len(df) == 0:
        return []
    # Walk the metadata column directly instead of copying the whole frame
    # and inserting a temporary ``row_num`` column: this avoids the copy and
    # does not fail when ``df`` already has a column named ``row_num``.
    return [
        position
        for position, metadata in enumerate(df['metadata'])
        if not metadata['e1']['word_index'] or not metadata['e2']['word_index']
    ]

def get_empty_rows_array(empty_entity_rows, df):
    """Gather the rows at the given positions into a list and a DataFrame.

    Args:
        empty_entity_rows: iterable of 0-based row positions into ``df``.
        df: DataFrame providing the ``original_sentence``, ``e1``, ``e2``,
            ``metadata`` and ``tokenized_sentence`` columns.

    Returns:
        tuple: ``(records, frame)`` where ``records`` is a list of
        ``[position, original_sentence, e1, e2, metadata, tokenized_sentence]``
        lists and ``frame`` is the same data as a DataFrame whose first
        column is named ``index_original``.
    """
    column_names = [
        'index_original', 'original_sentence', 'e1', 'e2', 'metadata', 'tokenized_sentence',
    ]
    records = []
    for position in empty_entity_rows:
        # Fetch the row once and read every field from it.
        row = df.iloc[position]
        records.append([
            position,
            row.original_sentence,
            row.e1,
            row.e2,
            row.metadata,
            row.tokenized_sentence,
        ])
    frame = pd.DataFrame(data=records, columns=column_names)
    return records, frame

In [313]:
def get_empty_vals(df):
    """Find the rows of ``df`` with an empty entity and return their details.

    Chains ``get_empty_entity_rows`` and ``get_empty_rows_array``.

    Returns:
        The ``(list, DataFrame)`` pair produced by ``get_empty_rows_array``.
    """
    positions = get_empty_entity_rows(df)
    return get_empty_rows_array(positions, df)

In [314]:
get_empty_vals(df_train_beth)

([], Empty DataFrame
 Columns: [index_original, original_sentence, e1, e2, metadata, tokenized_sentence]
 Index: [])

In [315]:
get_empty_vals(df_train_partners)

([], Empty DataFrame
 Columns: [index_original, original_sentence, e1, e2, metadata, tokenized_sentence]
 Index: [])

In [316]:
get_empty_vals(df_test)

([], Empty DataFrame
 Columns: [index_original, original_sentence, e1, e2, metadata, tokenized_sentence]
 Index: [])

## Write into CSV format

In [317]:
df_train_beth.iloc[0].metadata

{'e1': {'word': 'chest x-ray', 'word_index': [('0', '1')]},
 'e2': {'word': 'left lower lobe infiltrate', 'word_index': [('7', '10')]},
 'entity_replacement': {'0:1': 'test', '7:10': 'problem'},
 'sentence_id': '16',
 'filename': 'record-18'}

In [318]:
# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory already exists and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(res(outdir), exist_ok=True)

In [319]:
write_dataframe(df_train_beth, res(outdir + 'train_beth_original.csv'))

In [320]:
df_train_beth_copy = read_dataframe(res(outdir + 'train_beth_original.csv'))

In [321]:
# Verify that the CSV round trip preserved the dataframe. The result is a
# pair: the first element is the check done with the pd.equals method, the
# second is a manual check done column by column.
check_equality_of_written_and_read_df(df_train_beth, df_train_beth_copy)

(True, True)

In [322]:
write_dataframe(df_train_partners, res(outdir + 'train_partners_original.csv'))

In [323]:
df_train_partners_copy = read_dataframe(res(outdir + 'train_partners_original.csv'))

In [324]:
check_equality_of_written_and_read_df(df_train_partners, df_train_partners_copy)

(True, True)

In [325]:
write_dataframe(df_test, res(outdir + 'test_original.csv'))

In [326]:
df_test_copy = read_dataframe(res(outdir + 'test_original.csv'))

In [327]:
check_equality_of_written_and_read_df(df_test, df_test_copy)

(True, True)

## Write into txt format

In [328]:
write_into_txt(df_train_beth, res(outdir + 'train_beth_original.txt'))

Unique relations: 	 ['TeRP' 'TrAP' 'PIP' 'TrCP' 'TrWP' 'TrIP' 'PP-None' 'TrP-None' 'TeP-None'
 'TeCP' 'TrNAP']


In [329]:
write_into_txt(df_train_partners, res(outdir + 'train_partners_original.txt'))

Unique relations: 	 ['TeRP' 'TrAP' 'PP-None' 'TeP-None' 'PIP' 'TeCP' 'TrP-None' 'TrCP' 'TrNAP'
 'TrIP' 'TrWP']


In [330]:
write_into_txt(df_test, res(outdir + 'test_original.txt'))

Unique relations: 	 ['TrCP' 'TeRP' 'TrAP' 'PIP' 'TrWP' 'PP-None' 'TrP-None' 'TrNAP' 'TrIP'
 'TeP-None' 'TeCP']


## Combine the train data of beth and partners

In [331]:
# NOTE(review): presumably merges the beth and partners training files written
# above (under res(outdir)) into 'train_original.txt' - confirm against
# converter_i2b2.combine.
combine(res, outdir, 'train_beth_original', 'train_partners_original', 'train_original.txt')