In [1]:
# Root folder of the CrossRE json files and the topics that are loaded from it.
data_path = 'CrossRE/crossre_data/'
topics = ['ai', 'literature', 'music', 'news', 'politics', 'science']

# Relation label -> class index; a label's index is its position in this list.
label2idx = {
    label: index
    for index, label in enumerate([
        'part-of', 'physical', 'usage', 'role', 'social',
        'general-affiliation', 'compare', 'temporal', 'artifact',
        'origin', 'topic', 'opposite', 'cause-effect',
        'win-defeat', 'type-of', 'named', 'related-to',
    ])
}

# Batch size handed to the data loaders built further down.
batch_size = 32

In [2]:
import sys

import numpy as np
import pandas as pd
import torch

import preprocessing

from collections import defaultdict
from torch.utils.data import DataLoader

## Read Data

In [3]:
# Columns every split frame starts with, in this order.
_CROSSRE_COLUMNS = ['doc_key', 'sentence', 'ner', 'relations']


def _read_split(split):
    """Read the `split` file of every topic and stack them into one DataFrame.

    Args:
        split: split name used in the file name ('train', 'dev' or 'test').

    Returns:
        A DataFrame with a fresh 0..n-1 index whose first columns are
        `_CROSSRE_COLUMNS`, followed by any extra columns found in the files.
        With no topics configured, an empty frame with those columns.
    """
    # Collect all frames first and concatenate once: growing a frame with
    # pd.concat inside the loop copies the accumulated rows on every iteration.
    frames = [
        pd.read_json(f'{data_path}{topic}-{split}.json', lines=True)
        for topic in topics
    ]
    if not frames:
        # pd.concat raises on an empty list; keep the empty-frame result instead.
        return pd.DataFrame(columns=_CROSSRE_COLUMNS)

    combined = pd.concat(frames, axis=0, ignore_index=True)
    # Keep the declared columns in front, exactly as the seeded empty frame did.
    extra_columns = [c for c in combined.columns if c not in _CROSSRE_COLUMNS]
    return combined.reindex(columns=_CROSSRE_COLUMNS + extra_columns)


train_data = _read_split('train')
dev_data = _read_split('dev')
test_data = _read_split('test')

In [4]:
#train_data = preprocessing.prepare_data(f'{data_path}{topics[0]}-train.json', label2idx, 32)
def get_all_crossre(data_path, topics, batch_size = 32, dataset='train', shuffle=False):
    """Build one DataLoader over the given split of all requested topics.

    Labels are encoded with the module-level `label2idx` mapping.

    Args:
        data_path: prefix prepended to every file name.
        topics: iterable of topic names; one file `{topic}-{dataset}.json` is read per topic.
        batch_size: batch size passed to the DataLoader.
        dataset: split name used in the file name (e.g. 'train', 'dev', 'test').
        shuffle: passed to the DataLoader. The default (False) keeps the
            examples in file order, which means batches are grouped topic by
            topic; pass True to mix topics, e.g. for training.

    Returns:
        A DataLoader wrapping `preprocessing.DatasetMapper` built from the
        concatenated sentences, first entities, second entities and relations.
    """
    sentences, entities_1, entities_2, relations = [], [], [], []
    for topic in topics:
        # read_json_file is unpacked into four values that are accumulated
        # side by side — presumably parallel per-example lists; verify in preprocessing.
        topic_sentences, topic_entities_1, topic_entities_2, topic_relations = \
            preprocessing.read_json_file(f'{data_path}{topic}-{dataset}.json', label2idx)
        sentences.extend(topic_sentences)
        entities_1.extend(topic_entities_1)
        entities_2.extend(topic_entities_2)
        relations.extend(topic_relations)

    mapped = preprocessing.DatasetMapper(sentences, entities_1, entities_2, relations)
    return DataLoader(mapped, batch_size=batch_size, shuffle=shuffle)

def run(classifier, criterion, optimizer, dataset, mode='train', return_predictions=False):
    """Run one pass over `dataset` in training or evaluation mode.

    Args:
        classifier: model with `train()` / `eval()`; called as
            `classifier(list(sentences), entities_1, entities_2)` and expected
            to return a mapping with the keys 'flat_logits' and 'labels'.
        criterion: callable loss taking `(flat_logits, labels)`; must also
            provide `get_classification_report(predicted_labels, labels)`
            returning a mapping with 'accuracy', 'macro avg' and 'weighted avg'.
        optimizer: optimizer stepped after every batch; only used in 'train' mode.
        dataset: iterable yielding `(sentences, entities_1, entities_2, labels)` batches.
        mode: 'train' (forward + backward pass) or 'eval' (forward pass without gradients).
        return_predictions: if True, also collect one predicted label per
            example under `stats['predictions']`.

    Returns:
        A defaultdict(list) holding one entry per batch for 'loss', 'micro-f1',
        'macro-f1' and 'weighted-f1', plus 'predictions' when requested.

    Raises:
        ValueError: if `mode` is neither 'train' nor 'eval'.
    """
    # Reject unknown modes up front; otherwise no forward pass would run and
    # the statistics code below would fail on an unbound `loss`.
    if mode not in ('train', 'eval'):
        raise ValueError(f"mode must be 'train' or 'eval', got {mode!r}")

    stats = defaultdict(list)
    is_training = mode == 'train'

    # switch the model to the matching mode
    if is_training:
        classifier.train()
    else:
        classifier.eval()

    # iterate over batches (1-based counter for the progress line)
    for batch_idx, (sentences, entities_1, entities_2, labels) in enumerate(dataset, start=1):
        if is_training:
            # zero out previous gradients
            optimizer.zero_grad()

        # forward pass and loss; gradients are tracked only while training
        with torch.set_grad_enabled(is_training):
            predictions = classifier(list(sentences), entities_1, entities_2)
            loss = criterion(predictions['flat_logits'], labels)

        if is_training:
            # propagate loss
            loss.backward()
            optimizer.step()

        # calculate and store per-batch metrics
        stats['loss'].append(float(loss.detach()))
        evaluation_metrics = criterion.get_classification_report(predictions['labels'], labels)
        # The report's 'accuracy' entry is what gets stored as micro-f1
        # (presumably equal for single-label classification — confirm in criterion).
        stats['micro-f1'].append(evaluation_metrics['accuracy'])
        stats['macro-f1'].append(evaluation_metrics['macro avg']['f1-score'])
        stats['weighted-f1'].append(evaluation_metrics['weighted avg']['f1-score'])

        # store predictions
        if return_predictions:
            # iterate over input items
            for sidx in range(predictions['labels'].shape[0]):
                # drop -1 entries and append the remaining value as a Python scalar;
                # .item() requires exactly one value to remain per example
                predicted_labels = predictions['labels'][sidx]
                stats['predictions'].append(predicted_labels[predicted_labels != -1].item())

        # print running statistics (means over the batches seen so far)
        sys.stdout.write(
                f"\r[{mode.capitalize()} | Batch {batch_idx}] "
                f"Micro-f1: {np.mean(stats['micro-f1']):.4f}, "
                f"Macro-f1: {np.mean(stats['macro-f1']):.4f}, "
                f"Weighted-f1: {np.mean(stats['weighted-f1']):.4f}, "
                f"Loss: {np.mean(stats['loss']):.4f}")
        sys.stdout.flush()

    # return the cursor to the start of the line
    print("\r", end='')

    return stats

In [5]:
# One loader per split, each covering every topic; the test split stays disabled.
train_data = get_all_crossre(data_path, topics, batch_size=batch_size, dataset='train')
dev_data = get_all_crossre(data_path, topics, batch_size=batch_size, dataset='dev')
# test_data = get_all_crossre(data_path, topics, dataset='test')

In [6]:
# Sanity check: print the first example of every training batch.
for batch in train_data:
    sentences, entities_1, entities_2, labels = batch
    first_example = (sentences[0], entities_1[0], entities_2[0], labels[0])
    print(*first_example)

Popular approaches of <E1:product> opinion-based recommender system </E1:product> utilize various techniques including <E2:field> text mining </E2:field> , information retrieval , sentiment analysis ( see also Multimodal sentiment analysis ) and deep learning X.Y. Feng , H. Zhang , Y.J. Ren , P.H. Shang , Y. Zhu , Y.C. Liang , R.C. Guan , D. Xu , ( 2019 ) , , 21 ( 5 ) : e12957 . tensor(3) tensor(12) tensor(0)
Several of these programs are available online , such as Google Translate and the <E2:product> SYSTRAN system </E2:product> that powers AltaVista 's <E1:product> BabelFish </E1:product> ( now Yahoo 's Babelfish as of 9 May 2008 ) . tensor(22) tensor(14) tensor(2)
Examples of <E2:field> supervised learning </E2:field> are <E1:algorithm> Naive Bayes classifier </E1:algorithm> , Support vector machine , mixtures of Gaussians , and network . tensor(7) tensor(2) tensor(14)
Voice user interfaces that interpret and manage conversational state are challenging to design due to the inherent

  self.handle: torch.Tensor = torch.zeros(1)


# Music Baseline

In [7]:
# Load only the music topic for the baseline. Reuse the notebook-wide batch_size
# instead of repeating the literal 32 (assumes the third argument of
# prepare_data is the batch size — TODO confirm against preprocessing.py).
music_train_data = preprocessing.prepare_data(f'{data_path}music-train.json', label2idx, batch_size)