In [1]:

!pip install datasets

!pip install transformers 
!pip install sentencepiece

!pip install sacrebleu


[0mCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.7.0 sacrebleu-2.3.1
[0m

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from torch.nn.parallel import DataParallel

# define the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model = DataParallel(model)




OSError: Can't load tokenizer for 'facebook/bart-base'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'facebook/bart-base' is the correct path to a directory containing all relevant files for a T5Tokenizer tokenizer.

In [29]:
new_tokens = ['<H>', '<R>', '<T>']
new_tokens_vocab = {}
new_tokens_vocab['additional_special_tokens'] = []
for idx, t in enumerate(new_tokens):
    new_tokens_vocab['additional_special_tokens'].append(t)
num_added_toks = tokenizer.add_special_tokens(new_tokens_vocab)

tokenizer.add_tokens("[MASK]")
tokenizer.mask_token = "[MASK]"
tokenizer.mask_token_id = tokenizer.convert_tokens_to_ids("[MASK]")

In [31]:
class WebNLGDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.prefix = "translate from Graph to Text: "

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # preprocess the input graph
        try:
            triples = item['original_triple_sets']['otriple_set']
            input_text = self.prefix
            for outer_list in triples: 
                for triple in outer_list:
                    triple_txt = triple.split("|")
                    input_text += " <H> " + triple_txt[0] + " <R> " + triple_txt[1] + " <T> " + triple_txt[2]
        except (KeyError, IndexError):
            print("1")
            print(item['original_triple_sets']['otriple_set'])
            print(item['original_triple_sets']['otriple_set'][0])
            print(triples)
            input_text = self.prefix
        # preprocess the target text
        try:
            target_text = item['lex']['text'][0]
        except (KeyError, IndexError):
            print("2")
            print(item)
            #print(item['original_triple_sets']['otriple_set'])
            target_text = ""
        #print(item)
        #print(input_text)
        # encode the inputs and targets using the tokenizer
        input_ids = tokenizer.encode(input_text, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
        target_ids = tokenizer.encode(target_text, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
        return input_ids.squeeze(0), target_ids.squeeze(0)


In [32]:
MAX_INPUT_LENGTH = 128
MAX_TARGET_LENGTH = 128
tokenizer.model_max_length = MAX_INPUT_LENGTH
model.module.config.max_length = MAX_TARGET_LENGTH

# set up the device (GPU or CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

DataParallel(
  (module): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_featur

In [33]:
# load the WebNLG dataset
dataset = load_dataset('web_nlg', 'webnlg_challenge_2017')['train']

  0%|          | 0/3 [00:00<?, ?it/s]

# Adaptive pretraining

For STA, we fine-tuned the PLMs on a small amount of labeled data from the target task using a maximum likelihood estimation (MLE) objective. This involves training the model to maximize the likelihood of generating the correct output given the input graph and labeled data. This process helps to further adapt the PLM to the specific requirements of the target task and improve its performance on that task.

In [35]:
import random

pretrain_texts = []
for sample in dataset:
    try:
        text = sample['lex']['text'][0]
        pretrain_texts.append(text)
    except (KeyError, IndexError):
        continue

tokenized_inputs = tokenizer(pretrain_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
input_ids = tokenized_inputs['input_ids']
attention_mask = tokenized_inputs['attention_mask']

pretrain_data = torch.utils.data.TensorDataset(input_ids, attention_mask)

pretrain_loader = torch.utils.data.DataLoader(pretrain_data, batch_size=int(60), shuffle=True)

pretrain_optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
pretrain_criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

pretrain_epochs = 2  # Set the number of pre-training epochs
masking_prob = 0.15  # Probability of masking a token

if tokenizer.mask_token is None:
    # Manually set a mask token if not already defined
    tokenizer.add_tokens("[MASK]")
    tokenizer.mask_token = "[MASK]"
    tokenizer.mask_token_id = tokenizer.convert_tokens_to_ids("[MASK]")

for epoch in range(pretrain_epochs):
    running_loss = 0.0
    for inputs, attention_mask in pretrain_loader:
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)
        batch_size, seq_length = inputs.shape
        
        # Create a mask for randomly selected tokens
        mask = torch.rand(inputs.shape) < masking_prob
        
        # Randomly replace selected tokens with [MASK] token
        masked_inputs = inputs.clone()
        masked_inputs[mask] = tokenizer.mask_token_id
        
        pretrain_optimizer.zero_grad()
        outputs = model(input_ids=masked_inputs, attention_mask=attention_mask, decoder_input_ids=inputs)
        
        # Compute the loss only for the masked tokens
        masked_logits = outputs.logits[mask]
        masked_labels = inputs[mask]
        loss = pretrain_criterion(masked_logits.view(-1, masked_logits.size(-1)), masked_labels.view(-1))
        
        loss.backward()
        pretrain_optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(pretrain_data)
    print(f"Pretrain Epoch {epoch+1}/{pretrain_epochs} - loss: {epoch_loss:.4f}")


Pretrain Epoch 1/2 - loss: 2.0749
Pretrain Epoch 2/2 - loss: 0.2933


For LMA, we first fine-tuned the PLMs on a small amount of task-specific data using a masked language modeling objective. This involves randomly masking some tokens in the input sequence and training the model to predict the masked tokens based on the context provided by the unmasked tokens. This process helps to adapt the PLM to the specific characteristics of the target task and improve its performance on that task.

# Finetuning

In [36]:
# set up the data loader
train_data = WebNLGDataset(dataset)
batch_size = 32 #16
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

In [37]:
# set up the optimizer and the loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) #3e-5
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)


In [38]:
# fine-tune the model
num_epochs = 2
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, labels=targets)
        loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), targets.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_data)
    print(f"Epoch {epoch+1}/{num_epochs} - loss: {epoch_loss:.4f}")




Epoch 1/2 - loss: 1.7436
Epoch 2/2 - loss: 1.1279


In [46]:
# Save the entire model
torch.save(model, 'model_with_CCE_masked_pretraining_multipe_triples_43_parallel')
print("Model saved successfully.")

Model saved successfully.


In [40]:
# Load the model
#model = torch.load('/kaggle/input/models/model_with_CCE_masked_pretraining_multipe_triples_71')

# Print a confirmation message
print("Model loaded successfully.")

Model loaded successfully.


## are we accounting for the multiple texts targets in the bleu? it doesn't look like it

In [41]:
!pip install sacrebleu

[0m

In [42]:
batch_size=32

In [43]:
from sacrebleu import corpus_bleu
from random import sample
from tqdm import tqdm


# load the WebNLG validation dataset
validation_dataset = load_dataset('web_nlg', 'webnlg_challenge_2017')['test']
validation_dataset = [sample for sample in validation_dataset if sample['lex']['text']] # filter out samples with empty targets 
# Select a subset of the validation dataset
#subset_size = 10  # Choose the desired subset size
#validation_subset = sample(list(validation_dataset), subset_size)
validation_data = WebNLGDataset(validation_dataset)

# set up the validation data loader
validation_loader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)

# switch model to evaluation mode
model.eval()

# generate predictions for the validation dataset
predictions = []
references = []
with torch.no_grad():
    for inputs, targets in tqdm(validation_loader, desc='Validation Progress', leave=False):
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model.module.generate(inputs, max_length=MAX_TARGET_LENGTH, num_beams=4)
        # convert token IDs to strings
        predicted_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        target_texts = tokenizer.batch_decode(targets, skip_special_tokens=True)
        # append predicted and target texts for BLEU evaluation
        predictions.extend(predicted_texts)
        references.extend(target_texts)


  0%|          | 0/3 [00:00<?, ?it/s]

                                                                    

In [44]:
# calculate BLEU scores
#bleu = corpus_bleu(predictions, [references])

multiple_references = []
for i in range(len(validation_dataset)):
    multiple_references.append(validation_dataset[i]['lex']['text'])
    
bleu = corpus_bleu(predictions, references)
bleu_multiple = corpus_bleu(predictions, multiple_references)

print(f"BLEU score: {bleu.score}")
print(f"BLEU score with multiple references: {bleu_multiple.score}")

BLEU score: 0.4971592134485973
BLEU score with multiple references: 43.47208719449915


In [45]:
from sacrebleu import corpus_chrf 
bleu_multiple = corpus_bleu(predictions, multiple_references)
print(f"BLEU score with multiple references: {bleu_multiple.score}")
chrf = corpus_chrf(predictions, multiple_references)
print(chrf.score)

BLEU score with multiple references: 43.47208719449915
64.80979690907658


In [97]:
import sacrebleu

# Prepare reference and hypothesis sentences
reference = [
  ['The Guiana Space Centre has its headquarters at Kourou in French Guiana.', "The Guiana Space Centre's headquarters are located in Kourou, French Guiana.", 'The headquarters of the Guiana Space Centre is in Kourou, French Guiana.']]
hypotheses = [
  'The Guiana Space Centre has its headquarters in Kourou, in the French Republic of Guyana. The Guiana Space Centre has its headquarter in Kourou, in the French Republic of Guyana.'
]

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(hypotheses, reference)
print(bleu.score)

# Calculate CHR-F score
chrf = sacrebleu.corpus_chrf(hypotheses, reference)
print(chrf.score)




18.96550847075289
62.20788243603901


In [83]:
import sacrebleu

# Prepare reference and hypothesis sentences
reference = [
    ['The cat is on the mat.', 'There is a cat on the mat.'],
    ['I love eating pizza.', 'Pizza is my favorite food.'],
    ['This is the hypothesis sentence']
]
hypotheses = [
    'The cat is sitting on the mat.',
    'I enjoy eating pizza.',
    'This is the hypothesis sentence.'
]

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(hypotheses, reference)
print(bleu.score)

# Calculate CHR-F score
chrf = sacrebleu.corpus_chrf(hypotheses, reference)
print(chrf.score)

import sacrebleu

# Prepare reference and hypothesis sentences
reference = [
     ['This is the hypothesis sentence'],
    ['The cat is on the mat.', 'There is a cat on the mat.'],
     ['I love eating pizza.', 'Pizza is my favorite food.']
]
hypotheses = [
    'The cat is sitting on the mat.',
    'I enjoy eating pizza.',
    'This is the hypothesis sentence.'
]

# Calculate BLEU score
bleu = sacrebleu.corpus_bleu(hypotheses, reference)
print(bleu.score)

# Calculate CHR-F score
chrf = sacrebleu.corpus_chrf(hypotheses, reference)
print(chrf.score)


50.000000000000014
74.02630292838671
50.000000000000014
74.02630292838671


In [84]:
from sacrebleu import corpus_chrf
# Calculate CHR F++ scores
chrf = corpus_chrf(predictions, [references])
chrf_multiple = corpus_chrf(predictions, multiple_references)
print(f"CHR F++ score: {chrf.score}")
print(f"CHR F++ score with multiple references: {chrf_multiple.score}")

CHR F++ score: 57.01406827265573
CHR F++ score with multiple references: 87.37417180259725


In [21]:
i=5
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Politician', 'size': 1, 'eid': 'Id6', 'original_triple_sets': {'otriple_set': [['Abdul_Taib_Mahmud | successor | Sulaiman_Abdul_Rahman_Taib']]}, 'modified_triple_sets': {'mtriple_set': [['Abdul_Taib_Mahmud | successor | Sulaiman_Abdul_Rahman_Taib']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good', 'good'], 'lid': ['Id1', 'Id2', 'Id3'], 'text': ["Abdul Taib Mahmud's successor was Sulaiman Abdul Rahman Taib.", 'Abdul Taib Mahmud was succeded by Sulaiman Abdul Rahman Taib.', 'The sucessor to Abdul Taib Mahmud was Sulaiman Abdul Rahman Taib.'], 'lang': ['', '', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
[['Abdul_Taib_Mahmud | successor | Sulaiman_Abdul_Rahman_Taib']]
Sulaiman Abdul Rahman Taib is the successor of Abdul Taib Mohammed.
["Abdul Taib Mahmud's successor was Sulaiman Abdul Rahman Taib.", 'Abdul Taib Mahmud was succeded by Sulaiman Abdul Rahman Taib.', 'The sucessor to Abdul Taib Mahmud was Sulaiman Abdul

In [22]:
i=10
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Politician', 'size': 1, 'eid': 'Id11', 'original_triple_sets': {'otriple_set': [['Abner_W._Sibal | deathPlace | Alexandria,_Virginia']]}, 'modified_triple_sets': {'mtriple_set': [['Abner_W._Sibal | deathPlace | Alexandria,_Virginia']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good', 'good'], 'lid': ['Id1', 'Id2', 'Id3'], 'text': ['Abner W Sibal died in Alexandria, Virginia.', 'Abner W. Sibal died in Alexandria, Virginia.', 'Abner W Sibal died in Alexandria, Virginia.'], 'lang': ['', '', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
[['Abner_W._Sibal | deathPlace | Alexandria,_Virginia']]
Abner W. Sibal died in Alexandria, Virginia.
['Abner W Sibal died in Alexandria, Virginia.', 'Abner W. Sibal died in Alexandria, Virginia.', 'Abner W Sibal died in Alexandria, Virginia.']


In [23]:
i=50
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Politician', 'size': 1, 'eid': 'Id51', 'original_triple_sets': {'otriple_set': [['United_States_Army | battles | Spanish–American_War'], ['United_States_Army | battle | Spanish–American_War']]}, 'modified_triple_sets': {'mtriple_set': [['United_States_Army | battles | Spanish–American_War']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good'], 'lid': ['Id1'], 'text': ['The United States Army was involved in battles in the Spanish-American War.'], 'lang': ['']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
[['United_States_Army | battles | Spanish–American_War'], ['United_States_Army | battle | Spanish–American_War']]
The Spanish–American War is a battle of the United States Armee.
['The United States Army was involved in battles in the Spanish-American War.']


In [30]:
i=0
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Politician', 'size': 1, 'eid': 'Id1', 'original_triple_sets': {'otriple_set': [['Aaron_S._Daggett | award | Purple_Heart']]}, 'modified_triple_sets': {'mtriple_set': [['Aaron_S._Daggett | award | Purple_Heart']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good'], 'lid': ['Id1', 'Id2'], 'text': ['Aaron S Daggett was awarded the Purple Heart.', 'Aaron S. Daggett was awarded the Purple Heart.'], 'lang': ['', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
[['Aaron_S._Daggett | award | Purple_Heart']]
Aaron S. Daggett won the Purple Heart award.
['Aaron S Daggett was awarded the Purple Heart.', 'Aaron S. Daggett was awarded the Purple Heart.']


### known error, sometimes the prompt leaks into the output 

In [32]:
i=70
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'MeanOfTransportation', 'size': 1, 'eid': 'Id71', 'original_triple_sets': {'otriple_set': [['Alfa_Romeo_164 | relatedMeanOfTransportation | Lancia_Thema'], ['Alfa_Romeo_164 | related | Lancia_Thema']]}, 'modified_triple_sets': {'mtriple_set': [['Alfa_Romeo_164 | relatedMeanOfTransportation | Lancia_Thema']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good'], 'lid': ['Id1', 'Id2'], 'text': ['Alfa Romeo 164 and Lancia Thema are related types of transportation.', 'The related transport to the Alfa Romeo 164 is the Lancia Thema.'], 'lang': ['', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
[['Alfa_Romeo_164 | relatedMeanOfTransportation | Lancia_Thema'], ['Alfa_Romeo_164 | related | Lancia_Thema']]
Alfa Romeo 164 is related to Lancia Thema.
['Alfa Romeo 164 and Lancia Thema are related types of transportation.', 'The related transport to the Alfa Romeo 164 is the Lancia Thema.']


In [22]:
i=130
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Athlete', 'size': 1, 'eid': 'Id131', 'original_triple_sets': {'otriple_set': [['Aleksandr_Prudnikov | team | FC_Amkar_Perm'], ['Aleksandr_Prudnikov | clubs | FC_Amkar_Perm']]}, 'modified_triple_sets': {'mtriple_set': [['Aleksandr_Prudnikov | club | FC_Amkar_Perm']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good'], 'lid': ['Id1', 'Id2'], 'text': ['Aleksandr Prudnikov plays for FC Amkar Perm.', 'Aleksandr Prudnikov plays for the FC Amkar Perm football club.'], 'lang': ['', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
[['Aleksandr_Prudnikov | team | FC_Amkar_Perm'], ['Aleksandr_Prudnikov | clubs | FC_Amkar_Perm']]
Aleksandr Prudnikov's club is FC Amkar Perm.
['Aleksandr Prudnikov plays for FC Amkar Perm.', 'Aleksandr Prudnikov plays for the FC Amkar Perm football club.']


In [23]:
i=1861
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Astronaut', 'size': 7, 'eid': 'Id971', 'original_triple_sets': {'otriple_set': [['William_Anders | dateOfRet | "1969-09-01"^^xsd:date', 'William_Anders | selection | 1963', 'William_Anders | timeInSpace | "8820.0"^^<http://dbpedia.org/datatype/minute>', 'William_Anders | birthDate | "1933-10-17"^^xsd:date', 'William_Anders | occupation | Fighter_pilot', 'William_Anders | birthPlace | British_Hong_Kong', 'William_Anders | mission | Apollo_8']]}, 'modified_triple_sets': {'mtriple_set': [['William_Anders | dateOfRetirement | "1969-09-01"', 'William_Anders | was selected by NASA | 1963', 'William_Anders | timeInSpace | "8820.0"(minutes)', 'William_Anders | birthDate | "1933-10-17"', 'William_Anders | occupation | Fighter_pilot', 'William_Anders | birthPlace | British_Hong_Kong', 'William_Anders | was a crew member of | Apollo_8']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good', 'good'], 'lid': ['Id1', 'Id2', 'Id3'], 'text': ['Test pilot William Anders was

In [33]:
i=1860
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Astronaut', 'size': 7, 'eid': 'Id970', 'original_triple_sets': {'otriple_set': [['William_Anders | dateOfRet | "1969-09-01"^^xsd:date', 'William_Anders | mission | Apollo_8', 'William_Anders | nationality | United_States', 'William_Anders | birthPlace | British_Hong_Kong', 'Apollo_8 | crew2Up | Buzz_Aldrin', 'Apollo_8 | crewMembers | Frank_Borman', 'Apollo_8 | operator | NASA']]}, 'modified_triple_sets': {'mtriple_set': [['William_Anders | dateOfRetirement | "1969-09-01"', 'William_Anders | was a crew member of | Apollo_8', 'William_Anders | nationality | United_States', 'William_Anders | birthPlace | British_Hong_Kong', 'Apollo_8 | backup pilot | Buzz_Aldrin', 'Apollo_8 | crewMembers | Frank_Borman', 'Apollo_8 | operator | NASA']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good', 'good'], 'lid': ['Id1', 'Id2', 'Id3'], 'text': ["William Anders was born in British Hong Kong and is a U.S Citizen. William was a member of the Apollo 8 crew (along with Frank

## there is a problem with empty target samples in the test set, we still need to check multiple triples!

In [15]:
from transformers import T5Tokenizer
from datasets import load_dataset

# Define the tokenizer
#tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Load the WebNLG dataset
dataset = load_dataset('web_nlg', 'webnlg_challenge_2017')['test']
dataset = [sample for sample in dataset if sample['lex']['text']]

# Create an instance of WebNLGDataset
webnlg_dataset = WebNLGDataset(dataset)

# Define the index of the example you want to test
example_index = 70

# Get the input and target texts for the example at the specified index
input_text, target_text = webnlg_dataset[example_index]

# Decode the input and target texts using the tokenizer
decoded_input_text = tokenizer.decode(input_text, skip_special_tokens=True)
decoded_target_text = tokenizer.decode(target_text, skip_special_tokens=True)

# Print the preprocessed input and target texts
print("Input Text:", decoded_input_text)
print("Target Text:", decoded_target_text)


  0%|          | 0/3 [00:00<?, ?it/s]

{'category': 'MeanOfTransportation', 'size': 1, 'eid': 'Id71', 'original_triple_sets': {'otriple_set': [['Alfa_Romeo_164 | relatedMeanOfTransportation | Lancia_Thema'], ['Alfa_Romeo_164 | related | Lancia_Thema']]}, 'modified_triple_sets': {'mtriple_set': [['Alfa_Romeo_164 | relatedMeanOfTransportation | Lancia_Thema']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good'], 'lid': ['Id1', 'Id2'], 'text': ['Alfa Romeo 164 and Lancia Thema are related types of transportation.', 'The related transport to the Alfa Romeo 164 is the Lancia Thema.'], 'lang': ['', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
translate from Graph to Text:  <H> Alfa_Romeo_164  <R>  relatedMeanOfTransportation  <T>  Lancia_Thema <H> Alfa_Romeo_164  <R>  related  <T>  Lancia_Thema
Input Text: translate from Graph to Text: Alfa_Romeo_164 relatedMeanOfTransportation Lancia_Thema Alfa_Romeo_164 related Lancia_Thema
Target Text: Alfa Romeo 164 and Lancia Thema are

## seeing how many empty targets there are in the testing set

In [18]:
dataset = load_dataset('web_nlg', 'webnlg_challenge_2017')['test']
count_empty_text = 0
for sample in dataset:
    if not sample['lex']['text']:
        count_empty_text += 1

print(f"Number of samples with empty 'lex' 'text' field: {count_empty_text}")


  0%|          | 0/3 [00:00<?, ?it/s]

Number of samples with empty 'lex' 'text' field: 1862


In [19]:
total_samples = len(dataset)
print(f"Total number of samples in the test dataset: {total_samples}")

Total number of samples in the test dataset: 4615
