# Exploration of the DDI data: entity overlap between train and test, context and sentence lengths, and relation-type distribution

In [33]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [100]:
%autoreload

import os, random, pandas as pd, numpy as np
import pickle
import sys
import glob
from tqdm import tqdm
import ast
sys.path.append('../../')
# sys.path.append('../ddi_preprocess')
from relation_extraction.data import utils
import nltk
from collections import Counter
import itertools
from ast import literal_eval # to convert the string tuple form to an actual tuple
RESOURCE_PATH = "/data/medg/misc/geeticka/relation_extraction/ddi"
outdir = 'pre-processed/original/'


def res(path):
    """Return `path` joined onto RESOURCE_PATH using os.path.join."""
    resolved = os.path.join(RESOURCE_PATH, path)
    return resolved
from relation_extraction.data.converters.converter_ddi import get_dataset_dataframe, read_dataframe
from relation_extraction.data.data_exploration import get_entity_pair_dict_with_df, \
get_entity_dict_df_pair_map, length_of_context, length_of_sentence
#outdir = 'pre-processed/original/'

## Checking how much entity overlap exists between train and test

In [50]:
# Resolve the CSV path under RESOURCE_PATH first, then load it with the project reader.
train_drugbank_csv = res(outdir + 'train_drugbank_original.csv')
df_train_drugbank = read_dataframe(train_drugbank_csv)

In [51]:
# Resolve the CSV path under RESOURCE_PATH first, then load it with the project reader.
train_medline_csv = res(outdir + 'train_medline_original.csv')
df_train_medline = read_dataframe(train_medline_csv)

In [52]:
# Resolve the CSV path under RESOURCE_PATH first, then load it with the project reader.
test_drugbank_csv = res(outdir + 'test_drugbank_original.csv')
df_test_drugbank = read_dataframe(test_drugbank_csv)

In [53]:
# Resolve the CSV path under RESOURCE_PATH first, then load it with the project reader.
test_medline_csv = res(outdir + 'test_medline_original.csv')
df_test_medline = read_dataframe(test_medline_csv)

In [54]:
# Stack the two training frames row-wise (pd.concat defaults to axis=0).
# Each frame keeps its own index labels, so labels may repeat in the result.
train_frames = [df_train_drugbank, df_train_medline]
df_train = pd.concat(train_frames)

In [55]:
# Sanity check: the combined frame has exactly as many rows as its two sources together.
df_train.shape[0] == df_train_drugbank.shape[0] + df_train_medline.shape[0]

True

In [56]:
# Stack the two test frames row-wise (pd.concat defaults to axis=0).
# Each frame keeps its own index labels, so labels may repeat in the result.
test_frames = [df_test_drugbank, df_test_medline]
df_test = pd.concat(test_frames)

In [57]:
# Sanity check: the combined frame has exactly as many rows as its two sources together.
df_test.shape[0] == df_test_drugbank.shape[0] + df_test_medline.shape[0]

True

### Getting the e1 and e2 as a tuple in a separate column and mapping the pairs to an index

In [58]:
# Rebinds df_train/df_test to the frames returned by the project helper, so re-running
# this cell feeds the already-processed frames back in.
# Later cells read a `pair_map` column from both frames — presumably added here
# (helper source not shown; TODO confirm).
needed_dict, df_train, df_test = get_entity_dict_df_pair_map(df_train, df_test)

### Gather statistics

In [59]:
# Distinct `pair_map` values observed in each split.
train_pair_maps = set(df_train['pair_map'])
test_pair_maps = set(df_test['pair_map'])

In [60]:
# `pair_map` values that occur in both the train and the test split.
intersecting_pairs = train_pair_maps & test_pair_maps

Percent of the unique train entity pairs that also appear in the test data

In [61]:
# Share (in %) of the distinct train `pair_map` values that also occur in the test split.
len(intersecting_pairs)/ len(train_pair_maps) * 100

6.5292359567782965

Percent of the unique test entity pairs that also appear in the train data

In [62]:
# Share (in %) of the distinct test `pair_map` values that also occur in the train split.
len(intersecting_pairs)/ len(test_pair_maps) * 100

25.319465081723624

The above means that, among the unique pairs of the train and test data, 852 pairs overlap, which constitutes 6.5% of the train pairs and 25.3% of the test pairs (only in terms of unique pairs).

## In terms of what the model sees we need to do an overlap calculation of overall data (without taking out uniques)

In [63]:
# Number of training rows whose `pair_map` value is one of the train/test shared pairs.
# Duplicated pairs are counted once per row, not once per distinct pair.
train_overlaps = sum(
    1 for pair in df_train['pair_map'] if pair in intersecting_pairs
)

In [64]:
# Number of test rows whose `pair_map` value is one of the train/test shared pairs.
# Duplicated pairs are counted once per row, not once per distinct pair.
test_overlaps = sum(
    1 for pair in df_test['pair_map'] if pair in intersecting_pairs
)

In [65]:
# Share (in %) of all training rows whose `pair_map` value also occurs in the test split.
train_overlaps/ len(df_train) * 100

13.78514576366976

In [66]:
# Share (in %) of all test rows whose `pair_map` value also occurs in the train split.
test_overlaps/ len(df_test) * 100

28.598807495741056

### This means that about 28.6% of the test examples contain an entity pair that also appears in the training data, and about 13.8% of the training examples contain an entity pair that also appears in the test data. This still does not explain why concept blinding performs well for i2b2 but not for DDI.

In [67]:
# Every distinct value found in the `e1` or `e2` column of the training frame.
entity_set_train = {
    entity for column in ('e1', 'e2') for entity in df_train[column]
}

In [68]:
# Every distinct value found in the `e1` or `e2` column of the test frame.
entity_set_test = {
    entity for column in ('e1', 'e2') for entity in df_test[column]
}

In [69]:
# Entities that occur in both splits, materialised as a list.
shared_entity_set = entity_set_train & entity_set_test
entity_list_intersect = list(shared_entity_set)

In [70]:
# Count training rows in which at least one of `e1` / `e2` belongs to
# `entity_list_intersect`.
# Membership is tested against a set: `entity_list_intersect` is a list, so testing
# against it directly costs a linear scan for every row.
shared_entities = set(entity_list_intersect)
train_overlaps_indiv_entity = sum(
    1
    for e1, e2 in zip(df_train['e1'], df_train['e2'])
    if e1 in shared_entities or e2 in shared_entities
)

In [71]:
# Count test rows in which at least one of `e1` / `e2` belongs to
# `entity_list_intersect`.
# Membership is tested against a set: `entity_list_intersect` is a list, so testing
# against it directly costs a linear scan for every row.
shared_entities = set(entity_list_intersect)
test_overlaps_indiv_entity = sum(
    1
    for e1, e2 in zip(df_test['e1'], df_test['e2'])
    if e1 in shared_entities or e2 in shared_entities
)

In [72]:
# Share (in %) of training rows where `e1` or `e2` is an entity found in both splits.
train_overlaps_indiv_entity/ len(df_train) * 100

74.69034050770028

In [73]:
# Share (in %) of test rows where `e1` or `e2` is an entity found in both splits.
test_overlaps_indiv_entity/ len(df_test) * 100

91.24787052810903

In [101]:
# Apply the two project helpers row by row (axis=1) and store the results as new
# columns on both frames, train first and then test.
# Presumably the helpers return per-row lengths, per their names — TODO confirm.
for frame in (df_train, df_test):
    frame['context'] = frame.apply(length_of_context, axis=1)
    frame['sentence'] = frame.apply(length_of_sentence, axis=1)

In [102]:
# Mean of the `context` column over all training rows.
np.mean(df_train['context'].tolist())

15.139735317665897

In [103]:
# Mean of the `context` column over all test rows.
np.mean(df_test['context'].tolist())

15.201235093696763

In [104]:
# Mean of the `sentence` column over all training rows.
np.mean(df_train['sentence'].tolist())

48.620826072622805

In [105]:
# Mean of the `sentence` column over all test rows.
np.mean(df_test['sentence'].tolist())

47.6616269165247

### Average length of sentence is 48 and average length of context is 15 words

## Get some statistics on the data

In [75]:
# Per relation type, the number of non-null `original_sentence` entries in the
# MedLine training frame (one-column DataFrame indexed by relation_type).
train_medline_relation_distr = (
    df_train_medline
    .groupby('relation_type')[['original_sentence']]
    .count()
)

In [76]:
# Per relation type, the number of non-null `original_sentence` entries in the
# DrugBank training frame (one-column DataFrame indexed by relation_type).
train_drugbank_relation_distr = (
    df_train_drugbank
    .groupby('relation_type')[['original_sentence']]
    .count()
)

In [77]:
# Per relation type, the number of non-null `original_sentence` entries in the
# MedLine test frame (one-column DataFrame indexed by relation_type).
test_medline_relation_distr = (
    df_test_medline
    .groupby('relation_type')[['original_sentence']]
    .count()
)

In [78]:
# Per relation type, the number of non-null `original_sentence` entries in the
# DrugBank test frame (one-column DataFrame indexed by relation_type).
test_drugbank_relation_distr = (
    df_test_drugbank
    .groupby('relation_type')[['original_sentence']]
    .count()
)

In [79]:
# Relation-type counts summed over all four splits.
# `+` between DataFrames aligns on the index and yields NaN for any relation type that
# is missing from one of the splits; stacking the frames and summing per index label
# treats a missing type as contributing 0 and keeps the counts as integers.
pd.concat([
    train_medline_relation_distr,
    train_drugbank_relation_distr,
    test_medline_relation_distr,
    test_drugbank_relation_distr,
]).groupby(level=0).sum()

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
advise,863
effect,1591
int,228
mechanism,1299
none,21948


In [84]:
# Relation-type counts summed over the two training splits.
# `+` between DataFrames aligns on the index and yields NaN for any relation type that
# is missing from one of the splits; stacking the frames and summing per index label
# treats a missing type as contributing 0 and keeps the counts as integers.
pd.concat([
    train_medline_relation_distr,
    train_drugbank_relation_distr,
]).groupby(level=0).sum()

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
advise,696
effect,1308
int,146
mechanism,1054
none,18029


In [85]:
# Relation-type counts summed over the two test splits.
# `+` between DataFrames aligns on the index and yields NaN for any relation type that
# is missing from one of the splits; stacking the frames and summing per index label
# treats a missing type as contributing 0 and keeps the counts as integers.
pd.concat([
    test_medline_relation_distr,
    test_drugbank_relation_distr,
]).groupby(level=0).sum()

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
advise,167
effect,283
int,82
mechanism,245
none,3919


In [80]:
# Display the per-relation-type counts computed from df_train_medline.
train_medline_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
advise,8
effect,130
int,9
mechanism,52
none,1347


In [81]:
# Display the per-relation-type counts computed from df_train_drugbank.
train_drugbank_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
advise,688
effect,1178
int,137
mechanism,1002
none,16682


In [82]:
# Display the per-relation-type counts computed from df_test_drugbank.
test_drugbank_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
advise,160
effect,232
int,80
mechanism,224
none,3612


In [83]:
# Display the per-relation-type counts computed from df_test_medline.
test_medline_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
advise,7
effect,51
int,2
mechanism,21
none,307
