# Exploration of the semeval2010 data to make sure that concepts don't span multiple lines and, if they do, print them

In [1]:
%load_ext autoreload

In [5]:
%autoreload

import os, random, pandas as pd, numpy as np
import pickle
import sys
import glob
from tqdm import tqdm
import ast
sys.path.append('../../')
# sys.path.append('../ddi_preprocess')
from relation_extraction.data import utils
import nltk
from collections import Counter
import itertools
from ast import literal_eval # to convert the string tuple form to an actual tuple
RESOURCE_PATH = "/data/medg/misc/geeticka/relation_extraction/semeval2010"
outdir = 'pre-processed/original/'


def res(path):
    """Return `path` joined onto RESOURCE_PATH."""
    full_path = os.path.join(RESOURCE_PATH, path)
    return full_path
from relation_extraction.data.converters.converter_semeval2010 import get_dataset_dataframe, read_dataframe
#outdir = 'pre-processed/original/'

## Checking how much entity overlap exists between train and test

In [6]:
df_train = read_dataframe(res(outdir + 'train_original.csv'))

In [7]:
df_test = read_dataframe(res(outdir + 'test_original.csv'))

### Getting the e1 and e2 as a tuple in a separate column

In [8]:
def get_dict_with_df(df, dict_of_e1_e2):
    """Add a lower-cased (e1, e2) tuple column to `df` and tally the pairs.

    A pair and its reverse are treated as the same pair: when (e1, e2) is
    not yet a key of `dict_of_e1_e2` but (e2, e1) is, the count goes to the
    existing reversed key instead of creating a new one.

    Args:
        df: DataFrame with string columns `e1` and `e2`. It is modified in
            place: a new column `e1_and_e2` holds the lower-cased pair.
        dict_of_e1_e2: Counter-like mapping from pair to count (missing keys
            must default to 0, since new pairs are added with `+= 1`). It
            is updated in place, so counts accumulate across calls that
            share the same mapping.

    Returns:
        A tuple of the same `df` and the same `dict_of_e1_e2` passed in.
    """
    # Build the pairs directly from the two columns. This avoids a row-wise
    # `df.apply`, which is slow and does not produce a Series of tuples when
    # the frame has no rows.
    pairs = [(e1.lower(), e2.lower()) for e1, e2 in zip(df['e1'], df['e2'])]
    df['e1_and_e2'] = pairs

    for pair in pairs:
        flipped = (pair[1], pair[0])
        # Reuse the reversed key only if this orientation is not yet known.
        if pair not in dict_of_e1_e2 and flipped in dict_of_e1_e2:
            dict_of_e1_e2[flipped] += 1
        else:
            dict_of_e1_e2[pair] += 1
    return df, dict_of_e1_e2

In [9]:
# A single Counter is shared by both calls so that pair frequencies accumulate
# over train and test; get_dict_with_df updates it in place and returns the
# same object, so dict_of_e1_and_e2 is just a second name for dict_of_e1_e2.
dict_of_e1_e2 = Counter()
df_train, dict_of_e1_and_e2 = get_dict_with_df(df=df_train, dict_of_e1_e2=dict_of_e1_e2)
df_test, dict_of_e1_and_e2 = get_dict_with_df(df=df_test, dict_of_e1_e2=dict_of_e1_e2)

### Mapping the pairs to an index

In [10]:
# Rank the pairs from most to least frequent; a pair's rank is its index.
ls = dict_of_e1_e2.most_common()
needed_dict = {pair: rank for rank, (pair, _count) in enumerate(ls)}

def convert_pair_to_dict(row, needed_dict):
    """Return the index that `needed_dict` assigns to a row's entity pair.

    The pair is looked up as stored in `row['e1_and_e2']` and, failing that,
    in reversed order, so (e1, e2) and (e2, e1) map to the same index.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with an `e1_and_e2`
            entry holding a 2-tuple.
        needed_dict: Mapping from pair tuple to integer index.

    Returns:
        The index stored for the pair or for its reverse.

    Raises:
        KeyError: If neither orientation of the pair is in `needed_dict`.
    """
    pair = row['e1_and_e2']
    if pair in needed_dict:
        return needed_dict[pair]

    flipped = (pair[1], pair[0])
    if flipped in needed_dict:
        return needed_dict[flipped]

    # Fail loudly with the offending pair rather than printing a message
    # and then tripping over an unassigned local on return.
    raise KeyError(
        'This scenario should not have happened: pair {!r} is not in the '
        'pair index in either order'.format(pair)
    )
# Tag every row of both splits with the index of its (order-insensitive) pair.
df_train['pair_map'] = df_train.apply(convert_pair_to_dict, axis=1, needed_dict=needed_dict)
df_test['pair_map'] = df_test.apply(convert_pair_to_dict, axis=1, needed_dict=needed_dict)

In [11]:
# Distinct pair indices that occur in each split.
train_pair_maps = set(df_train['pair_map'])
test_pair_maps = set(df_test['pair_map'])

In [12]:
# Pair indices present in both the train and the test split.
intersecting_pairs = train_pair_maps & test_pair_maps

In [13]:
# Number of distinct pair indices shared by train and test.
len(intersecting_pairs)

200

In [14]:
# Number of distinct pair indices in the train split.
len(train_pair_maps)

7643

In [15]:
# Number of distinct pair indices in the test split.
len(test_pair_maps)

2671

The above means that, out of the unique pairs in the train and test data, 200 pairs overlap, which constitutes 2.6% of the unique train pairs (200/7643) and 7.5% of the unique test pairs (200/2671).

## In terms of what the model sees we need to do an overlap calculation of overall data (without taking out uniques)

In [16]:
# Count the train rows whose pair index also occurs in the test split.
# Only the `pair_map` column is needed, so iterate it directly instead of
# building a Series for every row with `iterrows`.
train_overlaps = sum(
    1 for pair_idx in df_train['pair_map'] if pair_idx in intersecting_pairs
)

In [17]:
# Count the test rows whose pair index also occurs in the train split.
# Only the `pair_map` column is needed, so iterate it directly instead of
# building a Series for every row with `iterrows`.
test_overlaps = sum(
    1 for pair_idx in df_test['pair_map'] if pair_idx in intersecting_pairs
)

In [18]:
# Train rows (duplicates included) whose pair index is also present in test.
train_overlaps

283

In [19]:
# Test rows (duplicates included) whose pair index is also present in train.
test_overlaps

229

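### This means that 229 test examples have an entity pair that also occurs in train, and 283 training examples have a pair that also occurs in test. Measured against the unique-pair counts above, that is about 8.6% for test (229/2671) and 3.7% for train (283/7643); note that these denominators are unique pairs, not the total number of examples. This might explain why concept blinding does not perform well: there isn't much overlap in the first place, compared to i2b2.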

## Get some statistics on the data

In [23]:
# train_beth_relation_distr = df_train_beth.groupby(['relation_type']).count()[['original_sentence']]

In [24]:
# train_partners_relation_distr = df_train_partners.groupby(['relation_type']).count()[['original_sentence']]

In [25]:
# test_relation_distr = df_test.groupby(['relation_type']).count()[['original_sentence']]

In [20]:
# train_beth_relation_distr + train_partners_relation_distr + test_relation_distr

In [21]:
# train_beth_relation_distr

In [22]:
# train_partners_relation_distr

In [23]:
# train_beth_relation_distr + train_partners_relation_distr

In [24]:
# test_relation_distr