# Exploration of the i2b2 data to make sure that concepts don't span multiple lines; any concepts that do are printed

In [3]:
%load_ext autoreload

In [79]:
%autoreload

import os, random, pandas as pd, numpy as np
import pickle
import sys
import glob
from tqdm import tqdm
import ast
sys.path.append('../../')
# sys.path.append('../ddi_preprocess')
from relation_extraction.data import utils
import nltk
from collections import Counter
import itertools
from ast import literal_eval # to convert the string tuple form to an actual tuple
# Root directory of the i2b2 relation-extraction resources. It defaults to the
# original cluster location; set the I2B2_RESOURCE_PATH environment variable to
# run the notebook against a copy stored somewhere else.
RESOURCE_PATH = (
    os.environ.get("I2B2_RESOURCE_PATH")
    or "/data/medg/misc/geeticka/relation_extraction/i2b2"
)
# Sub-directory, relative to RESOURCE_PATH, that the pre-processed CSVs are read from.
outdir = 'pre-processed/original/'


def res(path):
    """Return `path` resolved against RESOURCE_PATH.

    Parameters
    ----------
    path : str
        Path relative to the resource root.

    Returns
    -------
    str
        `os.path.join(RESOURCE_PATH, path)`.
    """
    return os.path.join(RESOURCE_PATH, path)
from relation_extraction.data.converters.converter_i2b2 import get_dataset_dataframe, read_dataframe
from relation_extraction.data.data_exploration import get_entity_pair_dict_with_df, \
get_entity_dict_df_pair_map
#outdir = 'pre-processed/original/'

## First look at training data from beth, then partners and then go through the test data

In [5]:
beth_training_concepts = res('concept_assertion_relation_training_data/beth/concept/')

In [6]:
partners_training_concepts = res('concept_assertion_relation_training_data/partners/concept/')

In [7]:
test_concepts = res('reference_standard_for_test_data/concepts/')

In [8]:
def find_span_multiple_lines(directory):
    """Print every concept annotation whose start and end are on different lines.

    Every non-blank line of each `*.con` file directly inside `directory` is
    expected to have, before its first `||`, a text that ends with two
    space-separated `line:token` positions. Only the `line` parts of those two
    positions are compared; a line is reported when they differ.

    Parameters
    ----------
    directory : str
        Directory holding the `.con` files, with or without a trailing path
        separator.

    Returns
    -------
    None
        Findings are reported with `print`; nothing is returned.
    """
    # os.path.join copes with a missing trailing separator; plain string
    # concatenation would then search next to the directory, not inside it.
    concept_files = glob.glob(os.path.join(directory, '*.con'))
    if not concept_files:
        print("There are no files to read! Check your directory")
        return
    for concept_file in tqdm(concept_files):
        with open(concept_file, 'r') as read_file:
            for line in read_file:
                line = line.strip()
                if not line:
                    # A blank line carries no annotation and would otherwise
                    # raise IndexError when the positions are picked out.
                    continue
                tokens = line.split('||')[0].split(' ')
                start_line = tokens[-2].split(':')[0]
                end_line = tokens[-1].split(':')[0]
                if start_line != end_line:
                    print("Concept spans multiple lines", line)

In [9]:
find_span_multiple_lines(beth_training_concepts)

100%|██████████| 73/73 [00:03<00:00, 24.02it/s]


In [10]:
find_span_multiple_lines(partners_training_concepts)

100%|██████████| 97/97 [00:03<00:00, 25.34it/s]


In [11]:
find_span_multiple_lines(test_concepts)

100%|██████████| 256/256 [00:10<00:00, 24.46it/s]


Now, we have confirmed that no concept spans multiple lines. This makes the job easier. 

## Checking how much entity overlap exists between train and test

In [73]:
df_train_beth = read_dataframe(res(outdir + 'train_beth_original.csv'))

In [74]:
df_train_partners = read_dataframe(res(outdir + 'train_partners_original.csv'))

In [75]:
df_train = pd.concat([df_train_beth, df_train_partners], axis=0)

In [76]:
len(df_train) == len(df_train_beth) + len(df_train_partners)

True

In [77]:
df_test = read_dataframe(res(outdir + 'test_original.csv'))

### Getting the e1 and e2 as a tuple in a separate column and mapping the pairs to an index

In [80]:
needed_dict, df_train, df_test = get_entity_dict_df_pair_map(df_train, df_test)

### Gather statistics

In [82]:
# Distinct `pair_map` values present in each split; a set can be built
# straight from the column without an intermediate list.
train_pair_maps = set(df_train['pair_map'])
test_pair_maps = set(df_test['pair_map'])

In [83]:
intersecting_pairs = train_pair_maps.intersection(test_pair_maps)

Percent of the unique train entity pairs that also appear in the test data

In [86]:
len(intersecting_pairs)/ len(train_pair_maps) * 100

6.795375218150087

Percent of the unique test entity pairs that also appear in the train data

In [85]:
len(intersecting_pairs)/ len(test_pair_maps) * 100

3.6103384330088084

The above means that, counting unique entity pairs only, train and test share 623 pairs, which constitutes 6.8% of the unique train pairs and 3.6% of the unique test pairs.

## In terms of what the model sees we need to do an overlap calculation of overall data (without taking out uniques)

In [26]:
# Number of training rows whose `pair_map` value is one of the pairs shared by
# train and test. Iterating the column directly avoids building a Series per
# row, which is what iterrows() does.
train_overlaps = sum(
    1 for pair in df_train['pair_map'] if pair in intersecting_pairs
)

In [27]:
# Number of test rows whose `pair_map` value is one of the pairs shared by
# train and test. Iterating the column directly avoids building a Series per
# row, which is what iterrows() does.
test_overlaps = sum(
    1 for pair in df_test['pair_map'] if pair in intersecting_pairs
)

In [29]:
train_overlaps/ len(df_train) * 100

11.553122861890333

In [30]:
test_overlaps/ len(df_test) * 100

7.737783823375537

### This means that 7.74% of the test examples and 11.55% of the training examples contain an entity pair that occurs in both train and test. This still does not explain why concept blinding performs well. Is this a low overlap to have?

In [56]:
# Every distinct entity that occurs as e1 or e2 in the training data.
entity_set_train = set(df_train['e1']) | set(df_train['e2'])

In [57]:
# Every distinct entity that occurs as e1 or e2 in the test data.
entity_set_test = set(df_test['e1']) | set(df_test['e2'])

In [58]:
# Entities that occur in both the training and the test data, as a list.
entity_list_intersect = list(entity_set_train & entity_set_test)

In [59]:
# Number of training rows in which e1 or e2 is an entity shared by train and
# test. The shared entities are copied into a set first so that each membership
# test is a hash lookup rather than a scan of the whole list, and the two
# columns are walked directly instead of going through iterrows().
shared_entities = set(entity_list_intersect)
train_overlaps_indiv_entity = sum(
    1
    for e1, e2 in zip(df_train['e1'], df_train['e2'])
    if e1 in shared_entities or e2 in shared_entities
)

In [60]:
# Number of test rows in which e1 or e2 is an entity shared by train and test.
# The shared entities are copied into a set first so that each membership test
# is a hash lookup rather than a scan of the whole list, and the two columns
# are walked directly instead of going through iterrows().
shared_entities = set(entity_list_intersect)
test_overlaps_indiv_entity = sum(
    1
    for e1, e2 in zip(df_test['e1'], df_test['e2'])
    if e1 in shared_entities or e2 in shared_entities
)

In [61]:
train_overlaps_indiv_entity/ len(df_train) * 100

73.04271332225589

In [62]:
test_overlaps_indiv_entity/ len(df_test) * 100

64.49722716333578

There is high individual entity overlap in the i2b2 dataset

## Get some statistics on the data

In [31]:
# Rows per relation type in the beth training data (non-null `original_sentence` count).
train_beth_relation_distr = df_train_beth.groupby(['relation_type'])[['original_sentence']].count()

In [32]:
# Rows per relation type in the partners training data (non-null `original_sentence` count).
train_partners_relation_distr = df_train_partners.groupby(['relation_type'])[['original_sentence']].count()

In [33]:
# Rows per relation type in the test data (non-null `original_sentence` count).
test_relation_distr = df_test.groupby(['relation_type'])[['original_sentence']].count()

In [34]:
# Per-relation totals over beth + partners + test. `add(..., fill_value=0)`
# keeps a relation type that is absent from one split; plain `+` would turn
# its total into NaN. The counts are cast back to integers afterwards.
(
    train_beth_relation_distr
    .add(train_partners_relation_distr, fill_value=0)
    .add(test_relation_distr, fill_value=0)
    .astype('int64')
)

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
PIP,2203
PP-None,12506
TeCP,504
TeP-None,2964
TeRP,3053
TrAP,2617
TrCP,526
TrIP,203
TrNAP,174
TrP-None,4462


In [35]:
train_beth_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
PIP,519
PP-None,2464
TeCP,98
TeP-None,527
TeRP,564
TrAP,583
TrCP,117
TrIP,34
TrNAP,41
TrP-None,832


In [36]:
train_partners_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
PIP,236
PP-None,1954
TeCP,68
TeP-None,464
TeRP,429
TrAP,302
TrCP,67
TrIP,17
TrNAP,21
TrP-None,870


In [37]:
# Per-relation totals over the two training sources. `add(..., fill_value=0)`
# keeps a relation type that is absent from one source; plain `+` would turn
# its total into NaN. The counts are cast back to integers afterwards.
(
    train_beth_relation_distr
    .add(train_partners_relation_distr, fill_value=0)
    .astype('int64')
)

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
PIP,755
PP-None,4418
TeCP,166
TeP-None,991
TeRP,993
TrAP,885
TrCP,184
TrIP,51
TrNAP,62
TrP-None,1702


In [30]:
test_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
PIP,1448
PP-None,8088
TeCP,338
TeP-None,1973
TeRP,2060
TrAP,1732
TrCP,342
TrIP,152
TrNAP,112
TrP-None,2760
