# Exploration of the semeval2010 data to make sure that concepts don't span multiple lines and if they do, print

In [1]:
%load_ext autoreload

In [25]:
%autoreload

import os, random, pandas as pd, numpy as np
import pickle
import sys
import glob
from tqdm import tqdm
import ast
sys.path.append('../../')
# sys.path.append('../ddi_preprocess')
from relation_extraction.data import utils
import nltk
from collections import Counter
import itertools
from ast import literal_eval # to convert the string tuple form to an actual tuple
RESOURCE_PATH = "/data/medg/misc/geeticka/relation_extraction/semeval2010"
outdir = 'pre-processed/original/'
def res(path): return os.path.join(RESOURCE_PATH, path)
from relation_extraction.data.converters.converter_semeval2010 import get_dataset_dataframe, read_dataframe
from relation_extraction.data.data_exploration import get_entity_pair_dict_with_df, \
get_entity_dict_df_pair_map
#outdir = 'pre-processed/original/'

[nltk_data] Downloading package wordnet to
[nltk_data]     /afs/csail.mit.edu/u/g/geeticka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Checking how much entity overlap exists between train and test

In [26]:
df_train = read_dataframe(res(outdir + 'train_original.csv'))

In [27]:
df_test = read_dataframe(res(outdir + 'test_original.csv'))

### Getting the e1 and e2 as a tuple in a separate column and mapping the pairs to an index

In [28]:
needed_dict, df_train, df_test = get_entity_dict_df_pair_map(df_train, df_test)

### Gather statistics

In [29]:
train_pair_maps = set(list(df_train['pair_map']))
test_pair_maps = set(list(df_test['pair_map']))

In [30]:
intersecting_pairs = train_pair_maps.intersection(test_pair_maps)

Percent of train data that has the overlapping relation pairs

In [31]:
len(intersecting_pairs)/ len(train_pair_maps) * 100

2.6167735182519953

Percent of test data that has overlapping relation pairs

In [33]:
len(intersecting_pairs)/ len(test_pair_maps) * 100

7.487832272557095

Above means that out of unique pairs of train and test data, we have overlap of 200 pairs which consitutes 2.6% of train data and 7.4% of the test data (only in terms of unique pairs)

## In terms of what the model sees we need to do an overlap calculation of overall data (without taking out uniques)

In [34]:
train_overlaps = 0
for index, row in df_train.iterrows():
    if row['pair_map'] in intersecting_pairs:
        train_overlaps += 1

In [35]:
test_overlaps = 0
for index, row in df_test.iterrows():
    if row['pair_map'] in intersecting_pairs:
        test_overlaps += 1

In [37]:
train_overlaps/ len(df_train) * 100

3.5374999999999996

In [38]:
test_overlaps/ len(df_test) * 100

8.428413691571587

### This means that the number of test examples that have overlaps from train are 8.5% of the test data. This is 3.5% of the examples from the training data. This might explain why concept blinding does not perform well because there isn't much overlap in the first place, compared to i2b2. 

## Get some statistics on the data

In [39]:
train_relation_distr = df_train.groupby(['relation_type']).count()[['original_sentence']]

In [40]:
test_relation_distr = df_train.groupby(['relation_type']).count()[['original_sentence']]

In [41]:
train_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
"Cause-Effect(e1,e2)",344
"Cause-Effect(e2,e1)",659
"Component-Whole(e1,e2)",470
"Component-Whole(e2,e1)",471
"Content-Container(e1,e2)",374
"Content-Container(e2,e1)",166
"Entity-Destination(e1,e2)",844
"Entity-Destination(e2,e1)",1
"Entity-Origin(e1,e2)",568
"Entity-Origin(e2,e1)",148


In [42]:
test_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
"Cause-Effect(e1,e2)",344
"Cause-Effect(e2,e1)",659
"Component-Whole(e1,e2)",470
"Component-Whole(e2,e1)",471
"Content-Container(e1,e2)",374
"Content-Container(e2,e1)",166
"Entity-Destination(e1,e2)",844
"Entity-Destination(e2,e1)",1
"Entity-Origin(e1,e2)",568
"Entity-Origin(e2,e1)",148


In [43]:
train_relation_distr + test_relation_distr

Unnamed: 0_level_0,original_sentence
relation_type,Unnamed: 1_level_1
"Cause-Effect(e1,e2)",688
"Cause-Effect(e2,e1)",1318
"Component-Whole(e1,e2)",940
"Component-Whole(e2,e1)",942
"Content-Container(e1,e2)",748
"Content-Container(e2,e1)",332
"Entity-Destination(e1,e2)",1688
"Entity-Destination(e2,e1)",2
"Entity-Origin(e1,e2)",1136
"Entity-Origin(e2,e1)",296
