In [1]:
# Folder containing the CrossRE json files (note the trailing slash).
data_path = 'CrossRE/crossre_data/'

# CrossRE domains to pool together.
topics = [
    'ai',
    'literature',
    'music',
    'news',
    'politics',
    'science',
]

# Relation label -> class index; the index is the label's position in this tuple.
label2idx = {
    label: index
    for index, label in enumerate((
        'part-of',
        'physical',
        'usage',
        'role',
        'social',
        'general-affiliation',
        'compare',
        'temporal',
        'artifact',
        'origin',
        'topic',
        'opposite',
        'cause-effect',
        'win-defeat',
        'type-of',
        'named',
        'related-to',
    ))
}

batch_size = 32

In [2]:
import sys

import numpy as np
import pandas as pd
import torch

import preprocessing

from collections import defaultdict
from torch.utils.data import DataLoader

## Read Data

We created our own data loader. It is identical to the one in the CrossRE project, with the only difference being that it draws examples from all 6 domains rather than from a single one.

In [3]:
# NOTE(review): disabled earlier approach that read every topic's json-lines file into
# pandas DataFrames. It appears to be superseded by get_all_crossre below - confirm before deleting.
# train_data = pd.DataFrame(columns=['doc_key', 'sentence', 'ner', 'relations'])
# dev_data = pd.DataFrame(columns=['doc_key', 'sentence', 'ner', 'relations'])
# test_data = pd.DataFrame(columns=['doc_key', 'sentence', 'ner', 'relations'])
# for t in topics:
#     train_data = pd.concat((train_data, pd.read_json(f'{data_path}{t}-train.json', lines=True)), axis=0, ignore_index=True)
#     dev_data = pd.concat((dev_data, pd.read_json(f'{data_path}{t}-dev.json', lines=True)), axis=0, ignore_index=True)
#     test_data = pd.concat((test_data, pd.read_json(f'{data_path}{t}-test.json', lines=True)), axis=0, ignore_index=True)

In [4]:
#train_data = preprocessing.prepare_data(f'{data_path}{topics[0]}-train.json', label2idx, 32)
def get_all_crossre(data_path, topics, batch_size = 32, dataset='train', label_map=None, shuffle=False):
    """Build one DataLoader over a single split of every requested CrossRE topic.

    For each topic, the file ``<data_path>/<topic>-<dataset>.json`` is parsed with
    ``preprocessing.read_json_file``; the returned lists are concatenated in topic
    order and wrapped in ``preprocessing.DatasetMapper``.

    Args:
        data_path: Folder holding the CrossRE json files. A trailing path
            separator is optional.
        topics: Iterable of topic names used as file-name prefixes.
        batch_size: Batch size handed to the DataLoader.
        dataset: Split name used in the file name (this notebook uses
            'train', 'dev' and 'test').
        label_map: Relation-label -> index mapping passed to the reader.
            When None, the notebook-level ``label2idx`` is used, so existing
            calls behave as before.
        shuffle: Forwarded to the DataLoader. The default (False) keeps the
            examples in file order, grouped topic by topic.

    Returns:
        A ``torch.utils.data.DataLoader`` over the pooled examples.
    """
    import os  # local import: only needed to build the per-topic file paths

    if label_map is None:
        # Fall back to the notebook-level mapping instead of requiring it explicitly.
        label_map = label2idx

    sentences, entities_1, entities_2, relations = [], [], [], []
    for topic in topics:
        # os.path.join tolerates data_path with or without a trailing separator.
        file_path = os.path.join(data_path, f'{topic}-{dataset}.json')
        s, e_1, e_2, r = preprocessing.read_json_file(file_path, label_map)
        sentences += s
        entities_1 += e_1
        entities_2 += e_2
        relations += r

    pooled = preprocessing.DatasetMapper(sentences, entities_1, entities_2, relations)
    return DataLoader(pooled, batch_size=batch_size, shuffle=shuffle)

In [5]:
# One DataLoader per split, each pooling every topic listed in `topics`.
train_data = get_all_crossre(data_path, topics, batch_size=batch_size, dataset='train')
dev_data = get_all_crossre(data_path, topics, batch_size=batch_size, dataset='dev')
# test_data = get_all_crossre(data_path, topics, dataset='test')

# Baseline Predictions

Once we had our own data loader working (see above), we had to make some changes to the scripts `main.py` and `preprocessing.py` to support its functionality and its extra / different parameters: instead of passing separate data paths for train, dev and test, we pass the path to the data folder, the list of topics, and the dataset name ("train", "dev" or "test").

We ran the following two commands from within the CrossRE folder:

```bash
python main.py --exp_path predictions/cross_domain --data_path crossre_data/ -rs 4012

python main.py --exp_path predictions/cross_domain --data_path crossre_data/ -rs 4012 --prediction_only
```

The first command trains the model with the default parameters (50 epochs, batch size 32, etc.).

The second command finds the test data and produces predictions.