In [1]:
!pip install datasets

!pip install transformers
!pip install sentencepiece

!pip install sacrebleu



In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/webnlg

/content/drive/MyDrive/webnlg


In [4]:
import datasets, re
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from torch.nn.parallel import DataParallel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

model_reasoning = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
model_reasoning = DataParallel(model_reasoning)

model_output = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
model_output = DataParallel(model_output)

new_tokens = ['<H>', '<R>', '<T>', '[SEP]']
new_tokens_vocab = {}
new_tokens_vocab['additional_special_tokens'] = []
for idx, t in enumerate(new_tokens):
    new_tokens_vocab['additional_special_tokens'].append(t)
num_added_toks = tokenizer.add_special_tokens(new_tokens_vocab)

tokenizer.add_tokens("[MASK]")
tokenizer.mask_token = "[MASK]"
tokenizer.mask_token_id = tokenizer.convert_tokens_to_ids("[MASK]")

In [5]:
tokenizer

T5TokenizerFast(name_or_path='google/flan-t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '[MASK]', 'additional_special_tokens': ['<H>', '<R>', '<T>', '[SEP]']}, clean_up_tokenization_spaces=True)

In [6]:
class WebNLGDatasetReasoning(Dataset):
    def __init__(self, data):
        self.data = data
        self.prefix = "translate from Graph to Text: "

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # preprocess the input graph
        try:
            triples = item['modified_triple_sets']['mtriple_set']
            input_text = self.prefix
            for outer_list in triples:
                for triple in outer_list:
                    triple_txt = triple.split("|")
                    input_text += " <H> " + triple_txt[0] + " <R> " + triple_txt[1] + " <T> " + triple_txt[2] + " [SEP]"

        except (KeyError, IndexError):
            print("1 - WebNLGDatasetReasoning")
            print(item['modified_triple_sets']['mtriple_set'])
            print(item['modified_triple_sets']['mtriple_set'][0])
            print(triples)
            input_text = self.prefix
        # preprocess the target text
        try:
            target_text = item['reasoning']
        except (KeyError, IndexError):
            print("2 - WebNLGDatasetReasoning")
            print(item)
            #print(item['original_triple_sets']['otriple_set'])
            target_text = ""
        #print(item)
        #print(input_text)
        # encode the inputs and targets using the tokenizer
        input_ids = tokenizer.encode(input_text, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
        target_ids = tokenizer.encode(target_text, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
        #print(input_text)
        #print(target_text)
        return input_ids.squeeze(0), target_ids.squeeze(0)


In [7]:
class WebNLGDatasetOutput(Dataset):
    def __init__(self, data):
        self.data = data
        self.prefix = "translate from Graph to Text: "

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # preprocess the input graph
        try:
            input_text = item['reasoning']
        except (KeyError, IndexError):
            print("1 - WebNLGDatasetOutput")
            print(item['original_triple_sets']['otriple_set'])
            print(item['original_triple_sets']['otriple_set'][0])
            print(item['reasoning'])
            input_text = self.prefix
        # preprocess the target text
        try:
            target_text = item['lex']['text'][0]
        except (KeyError, IndexError):
            print("2 - WebNLGDatasetOutput")
            print(item)
            #print(item['original_triple_sets']['otriple_set'])
            target_text = ""
        #print(item)
        #print(input_text)
        # encode the inputs and targets using the tokenizer
        input_ids = tokenizer.encode(input_text, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
        target_ids = tokenizer.encode(target_text, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
        #print(input_text)
        #print(target_text)
        return input_ids.squeeze(0), target_ids.squeeze(0)


In [8]:
MAX_INPUT_LENGTH = 128
MAX_TARGET_LENGTH = 128
tokenizer.model_max_length = MAX_INPUT_LENGTH
model_reasoning.module.config.max_length = MAX_TARGET_LENGTH
model_output.module.config.max_length = MAX_TARGET_LENGTH

# set up the device (GPU or CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_reasoning.to(device)
model_output.to(device)

DataParallel(
  (module): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=384, bias=False)
                (k): Linear(in_features=512, out_features=384, bias=False)
                (v): Linear(in_features=512, out_features=384, bias=False)
                (o): Linear(in_features=384, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 6)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseGatedActDense(
                (wi_0): Linear(in_features=512, out_features=1024, bias=False)
                (wi_1): Linear(

In [9]:
import pickle

with open('final_dataset.pkl', 'rb') as f:
  final_dataset = pickle.load(f)

In [None]:
final_dataset[0]

In [10]:
import random

random.Random(32).shuffle(final_dataset)

In [11]:
dataset_raw_train = final_dataset[:700]
dataset_raw_val = final_dataset[700:900]
dataset_raw_test = final_dataset[900:]

In [12]:
import pandas as pd
dataset_raw_train[0]

{'category': 'Athlete',
 'size': 5,
 'eid': 'Id59',
 'original_triple_sets': {'otriple_set': [['Moscow | leaderName | Sergey_Sobyanin',
    'Aleksandr_Chumakov | placeOfBirth | Moscow',
    'Aleksandr_Chumakov | clubs | FC_Torpedo_Moscow',
    'Aleksandr_Chumakov | team | Soviet_Union_national_football_team',
    'FC_Torpedo_Moscow | manager | Valery_Petrakov']]},
 'modified_triple_sets': {'mtriple_set': [['Moscow | leader | Sergey_Sobyanin',
    'Aleksandr_Chumakov | birthPlace | Moscow',
    'Aleksandr_Chumakov | club | FC_Torpedo_Moscow',
    'Aleksandr_Chumakov | club | Soviet_Union_national_football_team',
    'FC_Torpedo_Moscow | manager | Valery_Petrakov']]},
 'shape': '(X (X) (X (X)) (X (X)))',
 'shape_type': 'mixed',
 'lex': {'comment': ['good', 'good', 'good'],
  'lid': ['Id1', 'Id2', 'Id3'],
  'text': ['Aleksandr Chumakov was born in Moscow, Russia which is led by Sergey Sobyanin. His club was FC Torpedo Moscow where Valery Petrakov is the manager, but he also plays for the 

In [None]:
dataset_mul_ref_train = []

for i in range(len(dataset_raw_train)):
  sample = dataset_raw_train[i]
  target_list = sample['lex']['text']
  reason = sample['output']
  mod_text_list = []
  for a in target_list:
    b = reason + " [SEP]" + a
    mod_text_list.append(b)
  sample['text'] = mod_text_list
  dataset_mul_ref_train.append(sample)

dataset_mul_ref_val = []

for i in range(len(dataset_raw_val)):
  sample = dataset_raw_val[i]
  target_list = sample['lex']['text']
  reason = sample['output']
  mod_text_list = []
  for a in target_list:
    b = reason + " [SEP]" + a
    mod_text_list.append(b)
  sample['text'] = mod_text_list
  dataset_mul_ref_val.append(sample)

dataset_mul_ref_test = []

for i in range(len(dataset_raw_test)):
  sample = dataset_raw_test[i]
  target_list = sample['lex']['text']
  reason = sample['output']
  mod_text_list = []
  for a in target_list:
    b = reason + " [SEP]" + a
    mod_text_list.append(b)
  sample['text'] = mod_text_list
  dataset_mul_ref_test.append(sample)

In [13]:
train_dataset = datasets.Dataset.from_pandas(pd.DataFrame(data=dataset_raw_train))
val_dataset = datasets.Dataset.from_pandas(pd.DataFrame(data=dataset_raw_val))
test_dataset = datasets.Dataset.from_pandas(pd.DataFrame(data=dataset_raw_test))

In [None]:
train_dataset[0]

In [14]:
dataset_train_reasoning = WebNLGDatasetReasoning(train_dataset)
dataset_val_reasoning = WebNLGDatasetReasoning(val_dataset)
dataset_test_reasoning = WebNLGDatasetReasoning(test_dataset)
dataset_train_output = WebNLGDatasetOutput(train_dataset)
dataset_val_output = WebNLGDatasetOutput(val_dataset)
dataset_test_output = WebNLGDatasetOutput(test_dataset)

In [15]:
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn
import torch
import numpy as np

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=3, verbose=False):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf

    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), 'checkpoint.pt')
        self.val_loss_min = val_loss


# Adaptive pretraining

For STA, we fine-tuned the PLMs on a small amount of labeled data from the target task using a maximum likelihood estimation (MLE) objective. This involves training the model to maximize the likelihood of generating the correct output given the input graph and labeled data. This process helps to further adapt the PLM to the specific requirements of the target task and improve its performance on that task.

In [None]:
import random

pretrain_texts = []
for sample in dataset_raw_train:
    try:
        text = sample['lex']['text'][0]
        pretrain_texts.append(text)
    except (KeyError, IndexError):
        continue

tokenized_inputs = tokenizer(pretrain_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
input_ids = tokenized_inputs['input_ids']
attention_mask = tokenized_inputs['attention_mask']

pretrain_data = torch.utils.data.TensorDataset(input_ids, attention_mask)

pretrain_loader = torch.utils.data.DataLoader(pretrain_data, batch_size=int(60), shuffle=True)

pretrain_optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
pretrain_criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

pretrain_epochs = 30  # Set the number of pre-training epochs
masking_prob = 0.15  # Probability of masking a token


# Prepare validation data
val_texts = []
for sample in dataset_raw_val:
    try:
        text = sample['text'][0]
        val_texts.append(text)
    except (KeyError, IndexError):
        continue

tokenized_inputs_val = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
input_ids_val = tokenized_inputs_val['input_ids']
attention_mask_val = tokenized_inputs_val['attention_mask']

val_data = TensorDataset(input_ids_val, attention_mask_val)

val_loader = DataLoader(val_data, batch_size=int(60), shuffle=True)

early_stopping = EarlyStopping(patience=3, verbose=True)

if tokenizer.mask_token is None:
    # Manually set a mask token if not already defined
    tokenizer.add_tokens("[MASK]")
    tokenizer.mask_token = "[MASK]"
    tokenizer.mask_token_id = tokenizer.convert_tokens_to_ids("[MASK]")

for epoch in range(pretrain_epochs):
    running_loss = 0.0
    for inputs, attention_mask in pretrain_loader:
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)
        batch_size, seq_length = inputs.shape

        # Create a mask for randomly selected tokens
        mask = torch.rand(inputs.shape) < masking_prob

        # Randomly replace selected tokens with [MASK] token
        masked_inputs = inputs.clone()
        masked_inputs[mask] = tokenizer.mask_token_id

        pretrain_optimizer.zero_grad()
        outputs = model(input_ids=masked_inputs, attention_mask=attention_mask, decoder_input_ids=inputs)

        # Compute the loss only for the masked tokens
        masked_logits = outputs.logits[mask]
        masked_labels = inputs[mask]
        loss = pretrain_criterion(masked_logits.view(-1, masked_logits.size(-1)), masked_labels.view(-1))

        loss.backward()
        pretrain_optimizer.step()
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(pretrain_data)
    print(f"Pretrain Epoch {epoch+1}/{pretrain_epochs} - loss: {epoch_loss:.4f}")

    # Validation
    model.eval()
    val_running_loss = 0.0
    for val_inputs, val_attention_mask in val_loader:
        val_inputs = val_inputs.to(device)
        val_attention_mask = val_attention_mask.to(device)
        batch_size, seq_length = val_inputs.shape

        mask = torch.rand(val_inputs.shape) < masking_prob
        masked_inputs = val_inputs.clone()
        masked_inputs[mask] = tokenizer.mask_token_id

        with torch.no_grad():
            outputs = model(input_ids=masked_inputs, attention_mask=val_attention_mask, decoder_input_ids=val_inputs)
            masked_logits = outputs.logits[mask]
            masked_labels = val_inputs[mask]
            val_loss = pretrain_criterion(masked_logits.view(-1, masked_logits.size(-1)), masked_labels.view(-1))

        val_running_loss += val_loss.item() * val_inputs.size(0)

    epoch_val_loss = val_running_loss / len(val_data)
    print(f"Val Epoch {epoch+1}/{pretrain_epochs} - loss: {epoch_val_loss:.4f}")

    early_stopping(epoch_val_loss, model)

    if early_stopping.early_stop:
        print("Early stopping")
        break


For LMA, we first fine-tuned the PLMs on a small amount of task-specific data using a masked language modeling objective. This involves randomly masking some tokens in the input sequence and training the model to predict the masked tokens based on the context provided by the unmasked tokens. This process helps to adapt the PLM to the specific characteristics of the target task and improve its performance on that task.

# Finetuning

In [16]:
# set up the data loader
#train_data = WebNLGDataset(dataset)
batch_size = 32
train_loader_reasoning = DataLoader(dataset_train_reasoning, batch_size=batch_size, shuffle=True)
val_loader_reasoning = DataLoader(dataset_val_reasoning, batch_size=batch_size, shuffle=True)

num_epochs = 20
optimizer = torch.optim.AdamW(model_reasoning.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
early_stopping = EarlyStopping(patience=2, verbose=True)

for epoch in range(num_epochs):
    model_reasoning.train()
    running_loss = 0.0
    for inputs, targets in train_loader_reasoning:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        outputs = model_reasoning(inputs, labels=targets)
        loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), targets.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(dataset_train_reasoning)
    print(f"Train loss: {epoch_loss:.4f}")

    model_reasoning.eval()
    running_val_loss = 0.0
    with torch.no_grad():
        for val_inputs, val_targets in val_loader_reasoning:
            val_inputs = val_inputs.to(device)
            val_targets = val_targets.to(device)
            val_outputs = model_reasoning(val_inputs, labels=val_targets)
            val_loss = criterion(val_outputs.logits.view(-1, val_outputs.logits.size(-1)), val_targets.view(-1))
            running_val_loss += val_loss.item() * val_inputs.size(0)
    epoch_val_loss = running_val_loss / len(dataset_val_reasoning)
    print(f"Val loss: {epoch_val_loss:.4f}")

    early_stopping(epoch_val_loss, model_reasoning)

    if early_stopping.early_stop:
        print("Early stopping")
        break

Train loss: 1.2191
Val loss: 0.6961
Validation loss decreased (inf --> 0.696091).  Saving model ...
Train loss: 0.8757
Val loss: 0.6067
Validation loss decreased (0.696091 --> 0.606667).  Saving model ...
Train loss: 0.7590
Val loss: 0.5436
Validation loss decreased (0.606667 --> 0.543637).  Saving model ...
Train loss: 0.6843
Val loss: 0.5030
Validation loss decreased (0.543637 --> 0.503049).  Saving model ...
Train loss: 0.6181
Val loss: 0.4725
Validation loss decreased (0.503049 --> 0.472490).  Saving model ...
Train loss: 0.5699
Val loss: 0.4475
Validation loss decreased (0.472490 --> 0.447453).  Saving model ...
Train loss: 0.5253
Val loss: 0.4293
Validation loss decreased (0.447453 --> 0.429292).  Saving model ...
Train loss: 0.4941
Val loss: 0.4115
Validation loss decreased (0.429292 --> 0.411468).  Saving model ...
Train loss: 0.4717
Val loss: 0.4001
Validation loss decreased (0.411468 --> 0.400082).  Saving model ...
Train loss: 0.4372
Val loss: 0.3809
Validation loss decrease

In [22]:
# set up the data loader
batch_size = 32
train_loader_output = DataLoader(dataset_train_output, batch_size=batch_size, shuffle=True)
val_loader_output = DataLoader(dataset_val_output, batch_size=batch_size, shuffle=True)

num_epochs = 20
optimizer = torch.optim.AdamW(model_output.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
early_stopping = EarlyStopping(patience=2, verbose=True)

for epoch in range(num_epochs):
    model_output.train()
    running_loss = 0.0
    for inputs, targets in train_loader_output:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        outputs = model_output(inputs, labels=targets)
        loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), targets.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(dataset_train_output)
    print(f"Epoch {epoch+1}/{num_epochs} - loss: {epoch_loss:.4f}")

    model_output.eval()
    running_val_loss = 0.0
    with torch.no_grad():
        for val_inputs, val_targets in val_loader_output:
            val_inputs = val_inputs.to(device)
            val_targets = val_targets.to(device)
            val_outputs = model_output(val_inputs, labels=val_targets)
            val_loss = criterion(val_outputs.logits.view(-1, val_outputs.logits.size(-1)), val_targets.view(-1))
            running_val_loss += val_loss.item() * val_inputs.size(0)
    epoch_val_loss = running_val_loss / len(dataset_val_output)
    print(f"Val loss: {epoch_val_loss:.4f}")

    early_stopping(epoch_val_loss, model_output)

    if early_stopping.early_stop:
        print("Early stopping")
        break

Epoch 1/20 - loss: 1.3365
Val loss: 0.9906
Validation loss decreased (inf --> 0.990579).  Saving model ...
Epoch 2/20 - loss: 1.0979
Val loss: 0.9321
Validation loss decreased (0.990579 --> 0.932098).  Saving model ...
Epoch 3/20 - loss: 1.0402
Val loss: 0.9202
Validation loss decreased (0.932098 --> 0.920215).  Saving model ...
Epoch 4/20 - loss: 0.9916
Val loss: 0.8988
Validation loss decreased (0.920215 --> 0.898799).  Saving model ...
Epoch 5/20 - loss: 0.9604
Val loss: 0.8839
Validation loss decreased (0.898799 --> 0.883905).  Saving model ...
Epoch 6/20 - loss: 0.9338
Val loss: 0.8772
Validation loss decreased (0.883905 --> 0.877161).  Saving model ...
Epoch 7/20 - loss: 0.8983
Val loss: 0.8725
Validation loss decreased (0.877161 --> 0.872455).  Saving model ...
Epoch 8/20 - loss: 0.8777
Val loss: 0.8622
Validation loss decreased (0.872455 --> 0.862167).  Saving model ...
Epoch 9/20 - loss: 0.8538
Val loss: 0.8646
EarlyStopping counter: 1 out of 2
Epoch 10/20 - loss: 0.8326
Val l

In [None]:
model.load_state_dict(torch.load('/kaggle/working/checkpoint.pt'))

In [28]:
# Save the entire model
torch.save(model_reasoning, 'model_T5_flan_small_2020_reasoning_v2_es')
print("Model saved successfully.")
torch.save(model_output, 'model_T5_flan_small_2020_output_v2_es')
print("Model saved successfully.")

Model saved successfully.
Model saved successfully.


In [None]:
# Load the model
#model = torch.load('/kaggle/input/models/model_T5_flan_small_multi')

# Print a confirmation message
print("Model loaded successfully.")

# Inference

## are we accounting for the multiple texts targets in the bleu? it doesn't look like it

In [17]:
batch_size=32

In [18]:
from sacrebleu import corpus_bleu
from random import sample
from tqdm import tqdm

test_loader_reasoning = DataLoader(dataset_test_reasoning, batch_size=batch_size, shuffle=False)
# also we use the test_dataset to get the multiple references
"""
# load the WebNLG validation dataset
validation_dataset = load_dataset('web_nlg', 'release_v3.0_en')['test']
validation_dataset = [sample for sample in validation_dataset if sample['lex']['text']] # filter out samples with empty targets
validation_dataset = validation_dataset[:5]
# Select a subset of the validation dataset
#subset_size = 10  # Choose the desired subset size
#validation_subset = sample(list(validation_dataset), subset_size)
validation_data = WebNLGDataset(validation_dataset)

# set up the validation data loader
validation_loader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)
"""
# switch model to evaluation mode
model_reasoning.eval()

# generate predictions for the validation dataset
predictions_reasoning = []
references_reasoning = []
with torch.no_grad():
    for idx, (inputs, targets) in enumerate(tqdm(test_loader_reasoning, desc='Validation Progress', leave=False)):
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model_reasoning.module.generate(inputs, max_length=MAX_TARGET_LENGTH, num_beams=4)
        # convert token IDs to strings
        predicted_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        target_texts = tokenizer.batch_decode(targets, skip_special_tokens=True)
        # append predicted and target texts for BLEU evaluation
        predictions_reasoning.extend(predicted_texts)
        references_reasoning.extend([np.array(test_dataset)[i]["reasoning"] for i in range(idx*batch_size,min((idx+1)*batch_size, len(test_dataset)))])



In [23]:
test_loader_output = DataLoader(dataset_test_output, batch_size=batch_size, shuffle=False)
# also we use the test_dataset to get the multiple references
"""
# load the WebNLG validation dataset
validation_dataset = load_dataset('web_nlg', 'release_v3.0_en')['test']
validation_dataset = [sample for sample in validation_dataset if sample['lex']['text']] # filter out samples with empty targets
validation_dataset = validation_dataset[:5]
# Select a subset of the validation dataset
#subset_size = 10  # Choose the desired subset size
#validation_subset = sample(list(validation_dataset), subset_size)
validation_data = WebNLGDataset(validation_dataset)

# set up the validation data loader
validation_loader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)
"""
# switch model to evaluation mode
model_output.eval()

# generate predictions for the validation dataset
predictions_output = []
references_output = []
multiple_references_output = []
with torch.no_grad():
    for idx, (inputs, targets) in enumerate(tqdm(test_loader_output, desc='Validation Progress', leave=False)):
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model_output.module.generate(inputs, max_length=MAX_TARGET_LENGTH, num_beams=4)
        # convert token IDs to strings
        predicted_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        target_texts = tokenizer.batch_decode(targets, skip_special_tokens=True)
        # append predicted and target texts for BLEU evaluation
        predictions_output.extend(predicted_texts)
        multiple_references_output.extend([np.array(test_dataset)[i]["lex"]["text"] for i in range(idx*batch_size,min((idx+1)*batch_size, len(test_dataset)))])
        references_output.extend([np.array(test_dataset)[i]["lex"]["text"][0] for i in range(idx*batch_size,min((idx+1)*batch_size, len(test_dataset)))])



In [26]:
# switch models to evaluation mode
model_reasoning.eval()
model_output.eval()

# generate predictions for the validation dataset
predictions_reasoning = []
predictions_output = []
references_reasoning = []
references_output = []
multiple_references_output = []

# Use tqdm for progress bar
with torch.no_grad():
    for idx, (inputs, targets) in enumerate(tqdm(test_loader_reasoning, desc='Validation Progress', leave=False)):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # pass inputs through the reasoning model
        outputs_reasoning = model_reasoning.module.generate(inputs, max_length=MAX_TARGET_LENGTH, num_beams=4)

        # convert token IDs to strings
        predicted_texts_reasoning = tokenizer.batch_decode(outputs_reasoning, skip_special_tokens=True)
        target_texts_reasoning = tokenizer.batch_decode(targets, skip_special_tokens=True)

        # append predicted and target texts for BLEU evaluation
        predictions_reasoning.extend(predicted_texts_reasoning)
        references_reasoning.extend([np.array(test_dataset)[i]["reasoning"] for i in range(idx*batch_size,min((idx+1)*batch_size, len(test_dataset)))])

        # feed the reasoning model's outputs into the output model
        outputs_output = model_output.module.generate(outputs_reasoning, max_length=MAX_TARGET_LENGTH, num_beams=4)

        # convert token IDs to strings
        predicted_texts_output = tokenizer.batch_decode(outputs_output, skip_special_tokens=True)
        target_texts_output = tokenizer.batch_decode(targets, skip_special_tokens=True)

        # append predicted and target texts for BLEU evaluation
        predictions_output.extend(predicted_texts_output)
        references_output.extend([np.array(test_dataset)[i]["lex"]["text"][0] for i in range(idx*batch_size,min((idx+1)*batch_size, len(test_dataset)))])
        multiple_references_output.extend([np.array(test_dataset)[i]["lex"]["text"] for i in range(idx*batch_size,min((idx+1)*batch_size, len(test_dataset)))])



In [29]:
i=2
print(test_dataset[i]['modified_triple_sets']['mtriple_set'])
print('----')
print("reasoning prediction: ", predictions_reasoning[i])
print('----')
print("reasoning reference: ", references_reasoning[i])
print('----')
print("output prediction: ", predictions_output[i])
print('----')
print("output reference: ", references_output[i])
print('----')

[['Allen_Forrest | genre | Rhythm_and_blues', 'Allen_Forrest | birthYear | 1981', 'Allen_Forrest | birthPlace | "Fort Campbell, KY, raised in Dothan, AL"', 'Allen_Forrest | background | "solo_singer"', 'Allen_Forrest | birthPlace | Fort_Campbell']]
----
reasoning prediction:  Allen Forrest's genre is rhythm and blues. Allen Forrest was born in 1981. Allen Forrest was born in Fort Campbell, KY, raised in Dothan, AL. Allen Forrest has a background as a solo singer. Allen Forrest was born in Fort Campbell, KY.
----
reasoning reference:  Allen Forrest's genre is Rhythm and blues. Allen Forrest was born in 1981. Allen Forrest was born in Fort Campbell, KY and raised in Dothan, AL. Allen Forrest has a background as a solo singer.
----
output prediction:  Allen Forrest was born in 1981 in Fort Campbell, KY, raised in Dothan, AL. He is a rhythm and blues performer who was born in 1981 in Fort Campbell, KY. He has a background as a solo singer and is a rhythm and blues performer.
----
output re

In [30]:
# Writing predictions to a .txt file
with open("predictions_reasoning_version2_es", "w") as f:
    for prediction in predictions_reasoning:
        f.write(prediction + "\n")
# Writing predictions to a .txt file
with open("predictions_output_version2_es", "w") as f:
    for prediction in predictions_output:
        f.write(prediction + "\n")

In [31]:
# calculate BLEU scores
#bleu = corpus_bleu(predictions, [references])
"""
multiple_references = []
for i in range(len(validation_dataset)):
    multiple_references.append(validation_dataset[i]['lex']['text'])
"""
bleu = corpus_bleu(predictions_reasoning, references_reasoning)
# bleu_multiple = corpus_bleu(predictions, multiple_references)

print(f"BLEU score reasoning: {bleu.score}")
# print(f"BLEU score with multiple references: {bleu_multiple.score}")

bleu = corpus_bleu(predictions_output, references_output)
bleu_multiple = corpus_bleu(predictions_output, multiple_references_output)

print(f"BLEU score output: {bleu.score}")
print(f"BLEU score with multiple references: {bleu_multiple.score}")

BLEU score reasoning: 0.02135099363252779
BLEU score output: 0.025183204076820096
BLEU score with multiple references: 77.68917956332257


In [None]:
# Getting the maximum length of the sublists in multiple_references
max_length = max(len(sublist) for sublist in multiple_references)

# Writing multiple_references to separate .txt files
for i in range(max_length):
    with open(f"references{i}", "w") as f:
        for ref_list in multiple_references:
            # Writing the ith element if it exists, otherwise an empty line
            if i < len(ref_list):
                f.write(ref_list[i] + "\n")
            else:
                f.write("\n")

In [None]:
# First, determine the maximum length of sublists
max_len = max(len(refs) for refs in multiple_references)

# Then pad all sublists to that length
padded_references = [refs * (max_len // len(refs)) + refs[:max_len % len(refs)] for refs in multiple_references]

bleu = corpus_bleu(predictions, references)
bleu_multiple = corpus_bleu(predictions, padded_references)

print(f"BLEU score: {bleu.score}")
print(f"BLEU score with padded references: {bleu_multiple.score}")

In [None]:
len(multiple_references_output)

In [None]:
padded_references

In [32]:
from datasets import load_metric

metric = load_metric('sacrebleu')

# First, determine the maximum length of sublists
max_len = max(len(refs) for refs in multiple_references_output)

# Then pad all sublists to that length
padded_references = []
for ref in multiple_references_output:
  for i in range(max_len-len(ref)):
    ref.append("")
  padded_references.append(ref)
# padded_references = [refs * (max_len // len(refs)) + refs[:max_len % len(refs)] for refs in multiple_references_output]

# Now 'padded_references' is a list of lists, where each sublist has the same length.
# We can now compute the SacreBLEU score.

# Note the change in the compute line
score = metric.compute(predictions=predictions_output, references = padded_references)

print(f"SacreBLEU score: {score['score']}")

  metric = load_metric('sacrebleu')


SacreBLEU score: 41.807585882110935


In [33]:
from sacrebleu import corpus_chrf
# Calculate CHR F++ scores
chrf = corpus_chrf(predictions_output, [references_output])
chrf_multiple = corpus_chrf(predictions_output, multiple_references_output)
print(f"CHR F++ score: {chrf.score}")
print(f"CHR F++ score with multiple references: {chrf_multiple.score}")

CHR F++ score: 62.58863281228343
CHR F++ score with multiple references: 55.839157211548326


In [34]:
from sacrebleu import corpus_chrf
# Calculate CHR F++ scores
chrf = corpus_chrf(predictions_output, [references_output])
chrf_multiple = corpus_chrf(predictions_output, padded_references)
print(f"CHR F++ score: {chrf.score}")
print(f"CHR F++ score with multiple references: {chrf_multiple.score}")

CHR F++ score: 62.58863281228343
CHR F++ score with multiple references: 55.839157211548326


In [35]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13


In [36]:
from datasets import load_metric
import numpy as np


metric = load_metric('bertscore')

assert len(predictions_output) == len(references_output), "The number of predictions and references should be the same."

# Compute the score
score = metric.compute(predictions=predictions_output, references=references_output, lang='en')

print(f"BERTScore: {np.mean(score['precision'])}")

Downloading builder script:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: 0.9233890253305436


In [None]:
from transformers import T5Tokenizer
from datasets import load_dataset

# Define the tokenizer
#tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Load the WebNLG dataset
dataset = load_dataset('web_nlg', 'release_v3.0_en')['test']
dataset = [sample for sample in dataset if sample['lex']['text']]

# Create an instance of WebNLGDataset
webnlg_dataset = WebNLGDataset(dataset)

# Define the index of the example you want to test
example_index = 70

# Get the input and target texts for the example at the specified index
input_text, target_text = webnlg_dataset[example_index]

# Decode the input and target texts using the tokenizer
decoded_input_text = tokenizer.decode(input_text, skip_special_tokens=True)
decoded_target_text = tokenizer.decode(target_text, skip_special_tokens=True)

# Print the preprocessed input and target texts
print("Input Text:", decoded_input_text)
print("Target Text:", decoded_target_text)


## seeing how many empty targets there are in the testing set

In [None]:
dataset = load_dataset('web_nlg', 'release_v3.0_en')['test']
count_empty_text = 0
for sample in dataset:
    if not sample['lex']['text']:
        count_empty_text += 1

print(f"Number of samples with empty 'lex' 'text' field: {count_empty_text}")


In [None]:
total_samples = len(dataset)
print(f"Total number of samples in the test dataset: {total_samples}")