# Exploration of the i2b2 data to make sure that concepts don't span multiple lines; if any do, print them

In [2]:
%load_ext autoreload

In [11]:
%autoreload

import os, random, pandas as pd, numpy as np
import pickle
import sys
import glob
from tqdm import tqdm
import ast
sys.path.append('../../')
# sys.path.append('../ddi_preprocess')
from relation_extraction.data import utils
import nltk
from collections import Counter
import itertools
from ast import literal_eval # to convert the string tuple form to an actual tuple
RESOURCE_PATH = "/data/medg/misc/geeticka/relation_extraction/i2b2"
outdir = 'pre-processed/original/'
def res(path):
    """Return `path` joined onto RESOURCE_PATH."""
    return os.path.join(RESOURCE_PATH, path)
from relation_extraction.data.converters.converter_i2b2 import get_dataset_dataframe, read_dataframe
#outdir = 'pre-processed/original/'

## First look at training data from beth, then partners and then go through the test data

In [11]:
beth_training_concepts = res('concept_assertion_relation_training_data/beth/concept/')

In [12]:
partners_training_concepts = res('concept_assertion_relation_training_data/partners/concept/')

In [13]:
test_concepts = res('reference_standard_for_test_data/concepts/')

In [14]:
def find_span_multiple_lines(directory):
    """Print every concept annotation whose start and end fall on different lines.

    Every ``*.con`` file directly inside ``directory`` is scanned line by line.
    For each line, the text before the first ``||`` is split on single spaces;
    the last two tokens are treated as ``<line>:<offset>`` positions and their
    ``<line>`` parts are compared. Lines where they differ are printed.
    (Presumably this is the i2b2 concept format ``c="..." 12:3 12:5||t="..."``;
    confirm against the data release notes.)

    Parameters
    ----------
    directory : str
        Folder containing the ``.con`` files. A trailing path separator is
        optional.

    Returns
    -------
    None
        Findings are reported on stdout only.
    """
    # os.path.join tolerates a directory given with or without a trailing
    # separator; plain string concatenation silently matched nothing without it.
    total_files_to_read = glob.glob(os.path.join(directory, '*.con'))
    if not total_files_to_read:
        print("There are no files to read! Check your directory")
        return
    for file in tqdm(total_files_to_read):
        with open(file, 'r') as read_file:
            for line in read_file:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no annotation
                    # and used to crash the position lookup with IndexError.
                    continue
                first_part = line.split('||')[0]
                tokens = first_part.split(' ')
                if len(tokens) < 2:
                    # Not enough tokens to hold a start and an end position.
                    print("Skipping malformed concept line", line, "in", file)
                    continue
                pos1, pos2 = tokens[-2], tokens[-1]
                line1 = pos1.split(':')[0]
                line2 = pos2.split(':')[0]
                if line1 != line2:
                    print("Concept spans multiple lines", line)

In [15]:
find_span_multiple_lines(beth_training_concepts)

100%|██████████| 73/73 [00:00<00:00, 195.91it/s]


In [16]:
find_span_multiple_lines(partners_training_concepts)

100%|██████████| 97/97 [00:00<00:00, 203.45it/s]


In [17]:
find_span_multiple_lines(test_concepts)

100%|██████████| 256/256 [00:01<00:00, 200.08it/s]


Now, we have confirmed that no concept spans multiple lines. This makes the job easier. 

## Checking how much entity overlap exists between train and test

In [4]:
df_train_beth = read_dataframe(res(outdir + 'train_beth_original.csv'))

In [5]:
df_train_partners = read_dataframe(res(outdir + 'train_partners_original.csv'))

In [6]:
df_train = pd.concat([df_train_beth, df_train_partners], axis=0)

In [7]:
len(df_train) == len(df_train_beth) + len(df_train_partners)

True

In [8]:
df_test = read_dataframe(res(outdir + 'test_original.csv'))

### Getting the e1 and e2 as a tuple in a separate column

In [9]:
def get_dict_with_df(df, dict_of_e1_e2):
    """Add a lower-cased ``(e1, e2)`` column to ``df`` and tally the pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have string columns ``e1`` and ``e2``. It is modified in place:
        a new column ``e1_and_e2`` holds the tuple ``(e1.lower(), e2.lower())``
        for every row.
    dict_of_e1_e2 : dict or collections.Counter
        Running tally, updated in place. A pair is counted under whichever
        orientation is already a key; ``(a, b)`` and ``(b, a)`` therefore
        share one entry. Unseen pairs are inserted as ``(e1, e2)``.

    Returns
    -------
    tuple
        ``(df, dict_of_e1_e2)``, the same two objects that were passed in.
    """
    # Build the pairs with zip rather than df.apply(axis=1): on an empty frame
    # apply can hand back a DataFrame, which cannot be assigned to one column.
    pairs = [(e1.lower(), e2.lower()) for e1, e2 in zip(df['e1'], df['e2'])]
    df['e1_and_e2'] = pairs

    for pair in pairs:
        flipped = (pair[1], pair[0])
        # Only fall back to the flipped orientation when it is the one
        # already being tracked; otherwise count under the pair as seen.
        if pair not in dict_of_e1_e2 and flipped in dict_of_e1_e2:
            key = flipped
        else:
            key = pair
        # .get keeps this working for a plain dict as well as a Counter.
        dict_of_e1_e2[key] = dict_of_e1_e2.get(key, 0) + 1
    return df, dict_of_e1_e2

In [12]:
# One tally shared by both splits: the same Counter is passed to both calls below.
dict_of_e1_e2 = Counter()
# get_dict_with_df returns the very object it was given, so dict_of_e1_and_e2 is
# just a second name for dict_of_e1_e2 (later cells read dict_of_e1_e2).
df_train, dict_of_e1_and_e2 = get_dict_with_df(df_train, dict_of_e1_e2)
df_test, dict_of_e1_and_e2 = get_dict_with_df(df_test, dict_of_e1_e2)

### Mapping the pairs to an index

In [13]:
# Pairs ordered from most to least frequent; a pair's position in that
# ordering is used as its integer id.
ls = dict_of_e1_e2.most_common()
needed_dict = {pair: rank for rank, (pair, _) in enumerate(ls)}

def convert_pair_to_dict(row, needed_dict):
    """Return the integer id of the entity pair stored in ``row['e1_and_e2']``.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series for one DataFrame row)
        Must provide the key ``'e1_and_e2'`` holding a 2-tuple.
    needed_dict : dict
        Maps pair tuples to integer ids. The pair is looked up as given and,
        if absent, with its two elements swapped.

    Returns
    -------
    The value stored in ``needed_dict`` for the pair (either orientation).

    Raises
    ------
    KeyError
        If neither orientation of the pair is a key of ``needed_dict``.
    """
    pair = row['e1_and_e2']
    if pair in needed_dict:
        return needed_dict[pair]
    swapped = (pair[1], pair[0])
    if swapped in needed_dict:
        return needed_dict[swapped]
    # Previously this case printed a message and then crashed with an
    # UnboundLocalError on the return; fail with an explicit error instead.
    raise KeyError(
        'This scenario should not have happened: pair %r not found in either orientation' % (pair,)
    )
# Attach each row's pair id; extra keyword arguments to DataFrame.apply are
# forwarded to the applied function.
df_train['pair_map'] = df_train.apply(convert_pair_to_dict, axis=1, needed_dict=needed_dict)
df_test['pair_map'] = df_test.apply(convert_pair_to_dict, axis=1, needed_dict=needed_dict)

In [14]:
# Distinct pair ids seen in each split.
train_pair_maps = set(df_train['pair_map'])
test_pair_maps = set(df_test['pair_map'])

In [15]:
intersecting_pairs = train_pair_maps.intersection(test_pair_maps)

In [16]:
len(intersecting_pairs)

623

In [17]:
len(train_pair_maps)

9168

In [18]:
len(test_pair_maps)

17256

The numbers above mean that, among the unique entity pairs in the train and test data, 623 pairs overlap. This constitutes about 6.8% of the unique train pairs (623/9168) and 3.6% of the unique test pairs (623/17256).

## In terms of what the model sees we need to do an overlap calculation of overall data (without taking out uniques)

In [19]:
# Number of training examples whose pair id also occurs in the test split.
# Vectorised membership test instead of a row-by-row iterrows() loop.
train_overlaps = int(df_train['pair_map'].isin(intersecting_pairs).sum())

In [20]:
# Number of test examples whose pair id also occurs in the training split.
# Vectorised membership test instead of a row-by-row iterrows() loop.
test_overlaps = int(df_test['pair_map'].isin(intersecting_pairs).sum())

In [21]:
train_overlaps

1182

In [22]:
test_overlaps

1479

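### This means that 1,479 test examples (about 7.8% of the 19,005 test examples) and 1,182 training examples (about 11.6% of the 10,207 training examples) contain an entity pair that occurs in both splits. (Dividing by the number of unique pairs instead, 17,256 for test and 9,168 for train, gives 8.5% and 12.8%.) This still does not explain why concept blinding performs well. Is this a low overlap to have?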

## Get some statistics on the data

In [23]:
# Examples per relation type in the Beth training split (count of original_sentence values).
train_beth_relation_distr = df_train_beth.groupby(['relation_type'])[['original_sentence']].count()

In [24]:
# Examples per relation type in the Partners training split (count of original_sentence values).
train_partners_relation_distr = df_train_partners.groupby(['relation_type'])[['original_sentence']].count()

In [25]:
# Examples per relation type in the test split (count of original_sentence values).
test_relation_distr = df_test.groupby(['relation_type'])[['original_sentence']].count()

In [26]:
train_beth_relation_distr + train_partners_relation_distr + test_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
PIP,2203
PP-None,12506
TeCP,504
TeP-None,2964
TeRP,3053
TrAP,2617
TrCP,526
TrIP,203
TrNAP,174
TrP-None,4462


In [27]:
train_beth_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
PIP,519
PP-None,2464
TeCP,98
TeP-None,527
TeRP,564
TrAP,583
TrCP,117
TrIP,34
TrNAP,41
TrP-None,832


In [28]:
train_partners_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
PIP,236
PP-None,1954
TeCP,68
TeP-None,464
TeRP,429
TrAP,302
TrCP,67
TrIP,17
TrNAP,21
TrP-None,870


In [29]:
train_beth_relation_distr + train_partners_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
PIP,755
PP-None,4418
TeCP,166
TeP-None,991
TeRP,993
TrAP,885
TrCP,184
TrIP,51
TrNAP,62
TrP-None,1702


In [30]:
test_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
PIP,1448
PP-None,8088
TeCP,338
TeP-None,1973
TeRP,2060
TrAP,1732
TrCP,342
TrIP,152
TrNAP,112
TrP-None,2760
