In [1]:
'''
!pip install datasets

!pip install transformers 
!pip install sentencepiece

!pip install sacrebleu
'''

'\n!pip install datasets\n\n!pip install transformers \n!pip install sentencepiece\n\n!pip install sacrebleu\n'

In [28]:
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from torch.nn.parallel import DataParallel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
model = DataParallel(model)

In [29]:
new_tokens = ['<H>', '<R>', '<T>']
new_tokens_vocab = {}
new_tokens_vocab['additional_special_tokens'] = []
for idx, t in enumerate(new_tokens):
    new_tokens_vocab['additional_special_tokens'].append(t)
num_added_toks = tokenizer.add_special_tokens(new_tokens_vocab)

tokenizer.add_tokens("[MASK]")
tokenizer.mask_token = "[MASK]"
tokenizer.mask_token_id = tokenizer.convert_tokens_to_ids("[MASK]")

In [30]:
class WebNLGDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.prefix = "translate from Graph to Text: "

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # preprocess the input graph
        try:
            triples = item['modified_triple_sets']['mtriple_set']
            input_text = self.prefix
            for outer_list in triples: 
                for triple in outer_list:
                    triple_txt = triple.split("|")
                    input_text += " <H> " + triple_txt[0] + " <R> " + triple_txt[1] + " <T> " + triple_txt[2]
        except (KeyError, IndexError):
            print("1")
            print(item['modified_triple_sets']['mtriple_set'])
            print(item['modified_triple_sets']['mtriple_set'][0])
            print(triples)
            input_text = self.prefix
        # preprocess the target text
        try:
            target_text = item['lex']['text'][0]
        except (KeyError, IndexError):
            print("2")
            print(item)
            #print(item['modified_triple_sets']['mtriple_set'])
            target_text = ""
        #print(item)
        #print(input_text)
        # encode the inputs and targets using the tokenizer
        input_ids = tokenizer.encode(input_text, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
        target_ids = tokenizer.encode(target_text, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
        return input_ids.squeeze(0), target_ids.squeeze(0)


In [31]:
MAX_INPUT_LENGTH = 128
MAX_TARGET_LENGTH = 128
tokenizer.model_max_length = MAX_INPUT_LENGTH
model.module.config.max_length = MAX_TARGET_LENGTH

# set up the device (GPU or CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

DataParallel(
  (module): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=384, bias=False)
                (k): Linear(in_features=512, out_features=384, bias=False)
                (v): Linear(in_features=512, out_features=384, bias=False)
                (o): Linear(in_features=384, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 6)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseGatedActDense(
                (wi_0): Linear(in_features=512, out_features=1024, bias=False)
                (wi_1): Linear(

In [32]:
# load the WebNLG dataset
dataset = load_dataset('web_nlg', 'release_v3.0_en')['train']
dataset_val = load_dataset('web_nlg', 'release_v3.0_en')['dev']

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [33]:
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn
import torch
import numpy as np

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf

    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), 'checkpoint.pt')
        self.val_loss_min = val_loss


# Adaptive pretraining

For STA, we fine-tuned the PLMs on a small amount of labeled data from the target task using a maximum likelihood estimation (MLE) objective. This involves training the model to maximize the likelihood of generating the correct output given the input graph and labeled data. This process helps to further adapt the PLM to the specific requirements of the target task and improve its performance on that task.

In [34]:
import random

pretrain_batch_size = 60

pretrain_texts = []
for sample in dataset:
    try:
        text = sample['lex']['text'][0]
        pretrain_texts.append(text)
    except (KeyError, IndexError):
        continue

tokenized_inputs = tokenizer(pretrain_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
input_ids = tokenized_inputs['input_ids']
attention_mask = tokenized_inputs['attention_mask']

pretrain_data = torch.utils.data.TensorDataset(input_ids, attention_mask)

pretrain_loader = torch.utils.data.DataLoader(pretrain_data, batch_size=pretrain_batch_size, shuffle=True)

pretrain_optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
pretrain_criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

pretrain_epochs = 20  # Set the number of pre-training epochs
masking_prob = 0.15  # Probability of masking a token


# Prepare validation data
val_texts = []
for sample in dataset_val:
    try:
        text = sample['lex']['text'][0]
        val_texts.append(text)
    except (KeyError, IndexError):
        continue

tokenized_inputs_val = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
input_ids_val = tokenized_inputs_val['input_ids']
attention_mask_val = tokenized_inputs_val['attention_mask']

val_data = TensorDataset(input_ids_val, attention_mask_val)

val_loader = DataLoader(val_data, batch_size=pretrain_batch_size, shuffle=True)

early_stopping = EarlyStopping(patience=2, verbose=True)

if tokenizer.mask_token is None:
    # Manually set a mask token if not already defined
    tokenizer.add_tokens("[MASK]")
    tokenizer.mask_token = "[MASK]"
    tokenizer.mask_token_id = tokenizer.convert_tokens_to_ids("[MASK]")

for epoch in range(pretrain_epochs):
    running_loss = 0.0
    for inputs, attention_mask in pretrain_loader:
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)
        batch_size, seq_length = inputs.shape
        
        # Create a mask for randomly selected tokens
        mask = torch.rand(inputs.shape) < masking_prob
        
        # Randomly replace selected tokens with [MASK] token
        masked_inputs = inputs.clone()
        masked_inputs[mask] = tokenizer.mask_token_id
        
        pretrain_optimizer.zero_grad()
        outputs = model(input_ids=masked_inputs, attention_mask=attention_mask, decoder_input_ids=inputs)
        
        # Compute the loss only for the masked tokens
        masked_logits = outputs.logits[mask]
        masked_labels = inputs[mask]
        loss = pretrain_criterion(masked_logits.view(-1, masked_logits.size(-1)), masked_labels.view(-1))
        
        loss.backward()
        pretrain_optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(pretrain_data)
    print(f"Pretrain Epoch {epoch+1}/{pretrain_epochs} - loss: {epoch_loss:.4f}")
    
    # Validation
    model.eval()
    val_running_loss = 0.0
    for val_inputs, val_attention_mask in val_loader:
        val_inputs = val_inputs.to(device)
        val_attention_mask = val_attention_mask.to(device)
        batch_size, seq_length = val_inputs.shape

        mask = torch.rand(val_inputs.shape) < masking_prob
        masked_inputs = val_inputs.clone()
        masked_inputs[mask] = tokenizer.mask_token_id

        with torch.no_grad():
            outputs = model(input_ids=masked_inputs, attention_mask=val_attention_mask, decoder_input_ids=val_inputs)
            masked_logits = outputs.logits[mask]
            masked_labels = val_inputs[mask]
            val_loss = pretrain_criterion(masked_logits.view(-1, masked_logits.size(-1)), masked_labels.view(-1))

        val_running_loss += val_loss.item() * val_inputs.size(0)

    epoch_val_loss = val_running_loss / len(val_data)
    print(f"Val Epoch {epoch+1}/{pretrain_epochs} - loss: {epoch_val_loss:.4f}")

    early_stopping(epoch_val_loss, model)

    if early_stopping.early_stop:
        print("Early stopping")
        break


Pretrain Epoch 1/20 - loss: 2.1126
Val Epoch 1/20 - loss: 0.3252
Validation loss decreased (inf --> 0.325190).  Saving model ...
Pretrain Epoch 2/20 - loss: 0.1685
Val Epoch 2/20 - loss: 0.0884
Validation loss decreased (0.325190 --> 0.088390).  Saving model ...
Pretrain Epoch 3/20 - loss: 0.0598
Val Epoch 3/20 - loss: 0.0425
Validation loss decreased (0.088390 --> 0.042550).  Saving model ...
Pretrain Epoch 4/20 - loss: 0.0378
Val Epoch 4/20 - loss: 0.0394
Validation loss decreased (0.042550 --> 0.039399).  Saving model ...
Pretrain Epoch 5/20 - loss: 0.0241
Val Epoch 5/20 - loss: 0.0207
Validation loss decreased (0.039399 --> 0.020696).  Saving model ...
Pretrain Epoch 6/20 - loss: 0.0174
Val Epoch 6/20 - loss: 0.0175
Validation loss decreased (0.020696 --> 0.017505).  Saving model ...
Pretrain Epoch 7/20 - loss: 0.0139
Val Epoch 7/20 - loss: 0.0126
Validation loss decreased (0.017505 --> 0.012612).  Saving model ...
Pretrain Epoch 8/20 - loss: 0.0111
Val Epoch 8/20 - loss: 0.0082
Va

In [35]:
model.load_state_dict(torch.load('/kaggle/working/checkpoint.pt'))

<All keys matched successfully>

For LMA, we first fine-tuned the PLMs on a small amount of task-specific data using a masked language modeling objective. This involves randomly masking some tokens in the input sequence and training the model to predict the masked tokens based on the context provided by the unmasked tokens. This process helps to adapt the PLM to the specific characteristics of the target task and improve its performance on that task.

# Finetuning

In [36]:
# set up the data loader
train_data = WebNLGDataset(dataset)
batch_size = 32 #16
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

val_data = load_dataset('web_nlg', 'release_v3.0_en')['dev']
val_data = WebNLGDataset(val_data)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)

  0%|          | 0/3 [00:00<?, ?it/s]

In [37]:
# set up the optimizer and the loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) #3e-5
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
early_stopping = EarlyStopping(patience=2, verbose=True)


In [38]:
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, labels=targets)
        loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), targets.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_data)
    print(f"Train loss: {epoch_loss:.4f}")
    
    model.eval()
    running_val_loss = 0.0
    with torch.no_grad():
        for val_inputs, val_targets in val_loader:
            val_inputs = val_inputs.to(device)
            val_targets = val_targets.to(device)
            val_outputs = model(val_inputs, labels=val_targets)
            val_loss = criterion(val_outputs.logits.view(-1, val_outputs.logits.size(-1)), val_targets.view(-1))
            running_val_loss += val_loss.item() * val_inputs.size(0)
    epoch_val_loss = running_val_loss / len(val_data)
    print(f"Val loss: {epoch_val_loss:.4f}")
    
    early_stopping(epoch_val_loss, model)
    
    if early_stopping.early_stop:
        print("Early stopping")
        break


Train loss: 1.1287
Val loss: 0.7849
Validation loss decreased (inf --> 0.784888).  Saving model ...
Train loss: 0.9147
Val loss: 0.7324
Validation loss decreased (0.784888 --> 0.732428).  Saving model ...
Train loss: 0.8472
Val loss: 0.7179
Validation loss decreased (0.732428 --> 0.717920).  Saving model ...
Train loss: 0.8017
Val loss: 0.6881
Validation loss decreased (0.717920 --> 0.688066).  Saving model ...
Train loss: 0.7659
Val loss: 0.6740
Validation loss decreased (0.688066 --> 0.673992).  Saving model ...
Train loss: 0.7373
Val loss: 0.6628
Validation loss decreased (0.673992 --> 0.662761).  Saving model ...
Train loss: 0.7107
Val loss: 0.6637
EarlyStopping counter: 1 out of 2
Train loss: 0.6907
Val loss: 0.6525
Validation loss decreased (0.662761 --> 0.652536).  Saving model ...
Train loss: 0.6719
Val loss: 0.6430
Validation loss decreased (0.652536 --> 0.643005).  Saving model ...
Train loss: 0.6532
Val loss: 0.6424
Validation loss decreased (0.643005 --> 0.642430).  Saving 

In [39]:
model.load_state_dict(torch.load('/kaggle/working/checkpoint.pt'))

<All keys matched successfully>

In [40]:
# Save the entire model
torch.save(model, 'model_T5_flan_small_2020_v4')
print("Model saved successfully.")

Model saved successfully.


In [7]:
# Load the model
#model = torch.load('/kaggle/input/models/model_T5_flan_small_multi')

# Print a confirmation message
print("Model loaded successfully.")

Model loaded successfully.


## are we accounting for the multiple texts targets in the bleu? it doesn't look like it

In [41]:
!pip install sacrebleu
batch_size=32

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [42]:
from sacrebleu import corpus_bleu
from random import sample
from tqdm import tqdm


# load the WebNLG validation dataset
validation_dataset = load_dataset('web_nlg', 'release_v3.0_en')['test']
validation_dataset = [sample for sample in validation_dataset if sample['lex']['text']] # filter out samples with empty targets 
# Select a subset of the validation dataset
#subset_size = 10  # Choose the desired subset size
#validation_subset = sample(list(validation_dataset), subset_size)
validation_data = WebNLGDataset(validation_dataset)

# set up the validation data loader
validation_loader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)

# switch model to evaluation mode
model.eval()

# generate predictions for the validation dataset
predictions = []
references = []
with torch.no_grad():
    for inputs, targets in tqdm(validation_loader, desc='Validation Progress', leave=False):
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model.module.generate(inputs, max_length=MAX_TARGET_LENGTH, num_beams=4)
        # convert token IDs to strings
        predicted_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        target_texts = tokenizer.batch_decode(targets, skip_special_tokens=True)
        # append predicted and target texts for BLEU evaluation
        predictions.extend(predicted_texts)
        references.extend(target_texts)


  0%|          | 0/3 [00:00<?, ?it/s]

                                                                      

In [43]:
# Writing predictions to a .txt file
with open("predictions", "w") as f:
    for prediction in predictions:
        f.write(prediction + "\n")

In [44]:
# calculate BLEU scores
#bleu = corpus_bleu(predictions, [references])

multiple_references = []
for i in range(len(validation_dataset)):
    multiple_references.append(validation_dataset[i]['lex']['text'])
    
bleu = corpus_bleu(predictions, references)
bleu_multiple = corpus_bleu(predictions, multiple_references)

print(f"BLEU score: {bleu.score}")
print(f"BLEU score with multiple references: {bleu_multiple.score}")

BLEU score: 0.1354638113124636
BLEU score with multiple references: 94.149097734812


In [22]:
# Getting the maximum length of the sublists in multiple_references
max_length = max(len(sublist) for sublist in multiple_references)

# Writing multiple_references to separate .txt files
for i in range(max_length):
    with open(f"references{i}", "w") as f:
        for ref_list in multiple_references:
            # Writing the ith element if it exists, otherwise an empty line
            if i < len(ref_list):
                f.write(ref_list[i] + "\n")
            else:
                f.write("\n")

In [18]:
# calculate BLEU scores
#bleu = corpus_bleu(predictions, [references])

multiple_references = []
for i in range(len(validation_dataset)):
    multiple_references.append(validation_dataset[i]['lex']['text'])
    
# First, determine the maximum length of sublists
max_len = max(len(refs) for refs in multiple_references)

# Then pad all sublists to that length
padded_references = [refs * (max_len // len(refs)) + refs[:max_len % len(refs)] for refs in multiple_references]
    
bleu = corpus_bleu(predictions, references)
bleu_multiple = corpus_bleu(predictions, padded_references)

print(f"BLEU score: {bleu.score}")
print(f"BLEU score with padded references: {bleu_multiple.score}")

BLEU score: 0.16214501382472107
BLEU score with padded references: 80.7204465338761


In [19]:
from datasets import load_metric

metric = load_metric('sacrebleu')

# First, determine the maximum length of sublists
max_len = max(len(refs) for refs in multiple_references)

# Then pad all sublists to that length
padded_references = [refs * (max_len // len(refs)) + refs[:max_len % len(refs)] for refs in multiple_references]

# Now 'padded_references' is a list of lists, where each sublist has the same length.
# We can now compute the SacreBLEU score.

# Note the change in the compute line
score = metric.compute(predictions=predictions, references = padded_references)

print(f"SacreBLEU score: {score['score']}")

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

SacreBLEU score: 31.99615265661011


In [20]:
from sacrebleu import corpus_chrf
# Calculate CHR F++ scores
chrf = corpus_chrf(predictions, [references])
chrf_multiple = corpus_chrf(predictions, multiple_references)
print(f"CHR F++ score: {chrf.score}")
print(f"CHR F++ score with multiple references: {chrf_multiple.score}")

CHR F++ score: 54.00287217727219
CHR F++ score with multiple references: 77.50627157244912


In [21]:
from sacrebleu import corpus_chrf
# Calculate CHR F++ scores
chrf = corpus_chrf(predictions, [references])
chrf_multiple = corpus_chrf(predictions, padded_references)
print(f"CHR F++ score: {chrf.score}")
print(f"CHR F++ score with multiple references: {chrf_multiple.score}")

CHR F++ score: 54.00287217727219
CHR F++ score with multiple references: 70.48380146771281


In [23]:
!pip install bert_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13
[0m

In [45]:
### from datasets import load_metric
import numpy as np


metric = load_metric('bertscore')

assert len(predictions) == len(references), "The number of predictions and references should be the same."

# Compute the score
score = metric.compute(predictions=predictions, references=references, lang='en')

print(f"BERTScore: {np.mean(score['precision'])}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: 0.9123497803803925


In [46]:
i=5
print(validation_dataset[i]['modified_triple_sets']['mtriple_set'])
print('----')
print(predictions[i])
print('----')
print(multiple_references[i])

[['Bootleg_Series_Volume_1:_The_Quine_Tapes | precededBy | Squeeze_(The_Velvet_Underground_album)', 'Squeeze_(The_Velvet_Underground_album) | followedBy | 1969:_The_Velvet_Underground_Live']]
----
The book "BoodlegSeries Volume 1: The Quiet Underground" was preceded by "Squeeze", which was followed by "1969: The Velvet Underground Live" and "Squeeze".
----
['The album 1969: The Velvet Underground Live is preceded by the Velvet Underground album Squeeze, which was followed by The Quine Tapes.', 'The Velvet Underground album Bootleg Series Volume 1: The Quine Tapes was preceded by the album Squeeze, which was followed by the live album 1969: The Velvet Underground Live.', 'The Bootleg Series Volume I: The Quine Tapes is preceded by the Velvet Underground album Squeeze which was itself followed by the album 1969: The Velvet Underground Live.']


In [47]:
i=10
print(validation_dataset[i]['modified_triple_sets']['mtriple_set'])
print('----')
print(predictions[i])
print('----')
print(multiple_references[i])

[['Piotr_Hallmann | birthDate | 1987-08-25']]
----
Piotr Hallmann was born on the 25th of August 1987 and was born on the 25th of August 1987.
----
['Piotr Hallmann was born on August 25, 1987.', "Piotr Hallmann's birthday is August 25th 1987.", 'Piotr Hallmann was born on the 25th of August 1987.']


In [48]:
i=50
print(validation_dataset[i]['modified_triple_sets']['mtriple_set'])
print('----')
print(predictions[i])
print('----')
print(multiple_references[i])

[['Alan_Shepard | birthDate | "1923-11-18"', 'Alan_Shepard | deathPlace | California', 'Alan_Shepard | birthPlace | New_Hampshire', 'Alan_Shepard | mission | Apollo_14']]
----
Alan Shepard was born in New Hampshire on November 18th, 1923. He was a crew member of Apollo 14 and died in California. He was born in New Hampshire on November 18th, 1923.
----
['Alan Shepard was a crew member of Apollo 14 who was born November 18th, 1923, in New Hampshire and died in California.', 'Alan Shepard was born in New Hampshire on November 18, 1923. He was a crew member of Apollo 14, and died later on in California.', 'Alan Shepard was born on Nov 18, 1923 in New Hampshire, was a member of the Apollo 14 crew and died in California.']


In [49]:
i=0
print(validation_dataset[i]['modified_triple_sets']['mtriple_set'])
print('----')
print(predictions[i])
print('----')
print(multiple_references[i])

[['Estádio_Municipal_Coaracy_da_Mata_Fonseca | location | Arapiraca', 'Agremiação_Sportiva_Arapiraquense | league | Campeonato_Brasileiro_Série_C', 'Campeonato_Brasileiro_Série_C | country | Brazil', 'Agremiação_Sportiva_Arapiraquense | nickname | "\'\'Alvinegro"', 'Agremiação_Sportiva_Arapiraquense | ground | Estádio_Municipal_Coaracy_da_Mata_Fonseca']]
----
Estádio Municipal Coaracy da Mata Fonseca is located in Arapiraca, Brazil. The club play in the Campeonato Brasileiro Série C league which is based in Brazil.
----
['Estádio Municipal Coaracy da Mata Fonseca is the name of the ground of Agremiação Sportiva Arapiraquense in Arapiraca. Agremiação Sportiva Arapiraquense, nicknamed "Alvinegro", lay in the Campeonato Brasileiro Série C league from Brazil.', 'Estádio Municipal Coaracy da Mata Fonseca is the name of the ground of Agremiação Sportiva Arapiraquense in Arapiraca. Alvinegro, the nickname of Agremiação Sportiva Arapiraquense, play in the Campeonato Brasileiro Série C league f

In [50]:
i=70
print(validation_dataset[i]['modified_triple_sets']['mtriple_set'])
print('----')
print(predictions[i])
print('----')
print(multiple_references[i])

[['Pontiac_Rageous | assembly | Michigan', 'Pontiac_Rageous | assembly | Detroit', 'Pontiac_Rageous | productionEndYear | 1997']]
----
Pontiac Ridgeous was assembled in Michigan in the year 1997 and finished its production in the year 1997. It is located in Detroit and was involved in the production of the Pontiac Rigous.
----
['The Pontiac Rageous assembled in Michigan with assembly line in Detroit was last produced in 1997.', 'Ending its production in 1997, the Pontiac Rageous was assembled in Detroit, Michigan.', 'Ending in 1997, the Pontiac Rageous was assembled in Detroit, Michigan.']


In [51]:
i=130
print(validation_dataset[i]['modified_triple_sets']['mtriple_set'])
print('----')
print(predictions[i])
print('----')
print(multiple_references[i])

[['McVeagh_of_the_South_Seas | director | Harry_Carey_(actor_born_1878)', 'McVeagh_of_the_South_Seas | writer | Harry_Carey_(actor_born_1878)', 'McVeagh_of_the_South_Seas | producer | The_Progressive_Motion_Picture_Company']]
----
The producer of McVeagh of the South Seas is The Progressive Movement Pictures Company. McVeagh of the South Seas was created by Harry Reid, who was born in 1888. McVeagh of the South Seas was created by The Progressive Movement Pictures Company.
----
['"McVeagh of the South Seas" was written and directed by Harry Carey born in 1878 and produced by Progressive Motion Picture Company.', 'Harry Carey, an actor born in 1878, was the director and writer for the film McVeagh of the South Seas which was produced by the Progressive Motion Picture Company.', 'Harry Carey, an actor and director, was born in 1878. He was the writer of the movie McVeagh of the South Seas, which was produced by the Progressive Motion Picture Company. As well as writing the film script, C

In [52]:
i=1861
print(validation_dataset[i]['modified_triple_sets']['mtriple_set'])
print('----')
print(predictions[i])
print('----')
print(multiple_references[i])

[['Akeem_Ayers | currentteam | "Los Angeles Rams"', 'Akeem_Ayers | debutTeam | Tennessee_Titans']]
----
Akeem Ayers made his debut for the Tennessee Titans and played for the Los Angeles Rams. He has also played for the Los Angeles Rams and the Los Angeles Rams.
----
['Akeem Ayers made his debut for the Tennessee Titans and currently plays for the Los Angeles Rams.']


In [53]:
i=1860
print(validation_dataset[i]['modified_triple_sets']['mtriple_set'])
print('----')
print(predictions[i])
print('----')
print(multiple_references[i])

[['Turn_Me_On_(album) | runtime | 35.1', 'Turn_Me_On_(album) | artist | The_Honeymoon_Killers_(American_band)', 'Turn_Me_On_(album) | genre | Punk_blues', 'Turn_Me_On_(album) | producer | The_Honeymoon_Killers_(American_band)', 'Turn_Me_On_(album) | genre | Noise_rock', 'Turn_Me_On_(album) | precededBy | Let_It_Breed']]
----
The rapper, The Honeymoon Millers, is a performer of the Punk Blues. It was produced by The Honeymoon Millers and has a run time of 35.1 minutes.
----
['Turn Me On by the Honeymoon Killers is a punk blues album in the noise rock genre. The run time is 35.1 minutes and was preceded by the Let it Breed album.']


In [54]:
i=-10
print(validation_dataset[i]['modified_triple_sets']['mtriple_set'])
print('----')
print(predictions[i])
print('----')
print(multiple_references[i])

[['McVeagh_of_the_South_Seas | imdbId | 0004319', 'McVeagh_of_the_South_Seas | director | Cyril_Bruce', 'McVeagh_of_the_South_Seas | director | Harry_Carey_(actor_born_1878)', 'McVeagh_of_the_South_Seas | starring | Harry_Carey_(actor_born_1878)', 'McVeagh_of_the_South_Seas | writer | Harry_Carey_(actor_born_1878)', 'McVeagh_of_the_South_Seas | producer | The_Progressive_Motion_Picture_Company']]
----
Cyril Bruce and Harry Carey stars in McVeagh of the South Seas. McVeagh of the South Seas was created by Cyril Bruce and was starred by Harry Carey. McVeagh of the South Seas was created by Cyril Bruce and was starred by Cyril Bruce.
----
['Actor Harry Carey, born in 1878, and Cyril Bruce directed McVeagh of the South Seas which was registered in IMDb with the ID 0004319. Harry Carey also wrote and acted in the movie which was produced by the Progressive Motion Picture Company.']


In [55]:
i=-9
print(validation_dataset[i]['modified_triple_sets']['mtriple_set'])
print('----')
print(predictions[i])
print('----')
print(multiple_references[i])

[['Mermaid_(Train_song) | recordLabel | Columbia_Records', 'Mermaid_(Train_song) | recordLabel | Sony_Music_Entertainment', "Mermaid_(Train_song) | precededBy | This'll_Be_My_Year", 'Mermaid_(Train_song) | runtime | 3.16', 'Mermaid_(Train_song) | genre | Reggae']]
----
Mermaid is a musical sequel to This'll Be My Year and was recorded by Columbia Records. It has a run time of 3.16 and was preceded by This'll Be My Year.
----
['The Train song Mermaid is a 3.16 minute song in the reggae genre which is on Columbia Records. It had been released under the Sony Music Entertainment label and was preceded by "This\'ll Be my Year."']


In [56]:
i=-5
print(validation_dataset[i]['modified_triple_sets']['mtriple_set'])
print('----')
print(predictions[i])
print('----')
print(multiple_references[i])

[['Trane | type | Subsidiary', 'Trane | foundingDate | 1913-01-01', 'Trane | location | Ireland', 'Trane | foundationPlace | La_Crosse,_Wisconsin', 'Trane | numberOfEmployees | 29000', 'Trane | product | HVAC', 'Trane | industry | Building_materials']]
----
Trane was founded in La Crosse, Wisconsin on January 1, 1913. Trane is a subsidiary company in the building materials industry. Trane was founded on January 1, 1913 in La Crosse, Wisconsin. Trane is a subsidiary company in the building materials industry. Trane is a subsidiary company in the building materials industry.
----
['Trane is a subsidiary company that was founded in La Crosse, Wisconsin on 1913-01-01 but later moved to Ireland. Their 29,000 employees produce building materials, prominent among which are HVAC products.']


In [15]:
from transformers import T5Tokenizer
from datasets import load_dataset

# Define the tokenizer
#tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Load the WebNLG dataset
dataset = load_dataset('web_nlg', 'release_v3.0_en')['test']
dataset = [sample for sample in dataset if sample['lex']['text']]

# Create an instance of WebNLGDataset
webnlg_dataset = WebNLGDataset(dataset)

# Define the index of the example you want to test
example_index = 70

# Get the input and target texts for the example at the specified index
input_text, target_text = webnlg_dataset[example_index]

# Decode the input and target texts using the tokenizer
decoded_input_text = tokenizer.decode(input_text, skip_special_tokens=True)
decoded_target_text = tokenizer.decode(target_text, skip_special_tokens=True)

# Print the preprocessed input and target texts
print("Input Text:", decoded_input_text)
print("Target Text:", decoded_target_text)


  0%|          | 0/3 [00:00<?, ?it/s]

{'category': 'MeanOfTransportation', 'size': 1, 'eid': 'Id71', 'original_triple_sets': {'otriple_set': [['Alfa_Romeo_164 | relatedMeanOfTransportation | Lancia_Thema'], ['Alfa_Romeo_164 | related | Lancia_Thema']]}, 'modified_triple_sets': {'mtriple_set': [['Alfa_Romeo_164 | relatedMeanOfTransportation | Lancia_Thema']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good'], 'lid': ['Id1', 'Id2'], 'text': ['Alfa Romeo 164 and Lancia Thema are related types of transportation.', 'The related transport to the Alfa Romeo 164 is the Lancia Thema.'], 'lang': ['', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
translate from Graph to Text:  <H> Alfa_Romeo_164  <R>  relatedMeanOfTransportation  <T>  Lancia_Thema <H> Alfa_Romeo_164  <R>  related  <T>  Lancia_Thema
Input Text: translate from Graph to Text: Alfa_Romeo_164 relatedMeanOfTransportation Lancia_Thema Alfa_Romeo_164 related Lancia_Thema
Target Text: Alfa Romeo 164 and Lancia Thema are

## seeing how many empty targets there are in the testing set

In [18]:
dataset = load_dataset('web_nlg', 'release_v3.0_en')['test']
count_empty_text = 0
for sample in dataset:
    if not sample['lex']['text']:
        count_empty_text += 1

print(f"Number of samples with empty 'lex' 'text' field: {count_empty_text}")


  0%|          | 0/3 [00:00<?, ?it/s]

Number of samples with empty 'lex' 'text' field: 1862


In [19]:
total_samples = len(dataset)
print(f"Total number of samples in the test dataset: {total_samples}")

Total number of samples in the test dataset: 4615
