In [1]:
'''
!pip install datasets

!pip install transformers 
!pip install sentencepiece

!pip install sacrebleu
'''

'\n!pip install datasets\n\n!pip install transformers \n!pip install sentencepiece\n\n!pip install sacrebleu\n'

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from torch.nn.parallel import DataParallel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
model = DataParallel(model)


Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
new_tokens = ['<H>', '<R>', '<T>']
new_tokens_vocab = {}
new_tokens_vocab['additional_special_tokens'] = []
for idx, t in enumerate(new_tokens):
    new_tokens_vocab['additional_special_tokens'].append(t)
num_added_toks = tokenizer.add_special_tokens(new_tokens_vocab)

tokenizer.add_tokens("[MASK]")
tokenizer.mask_token = "[MASK]"
tokenizer.mask_token_id = tokenizer.convert_tokens_to_ids("[MASK]")

In [None]:
class WebNLGDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.prefix = "translate from Graph to Text: "

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # preprocess the input graph
        try:
            triples = item['original_triple_sets']['otriple_set']
            input_text = self.prefix
            for outer_list in triples: 
                for triple in outer_list:
                    triple_txt = triple.split("|")
                    input_text += " <H> " + triple_txt[0] + " <R> " + triple_txt[1] + " <T> " + triple_txt[2]
        except (KeyError, IndexError):
            print("1")
            print(item['original_triple_sets']['otriple_set'])
            print(item['original_triple_sets']['otriple_set'][0])
            print(triples)
            input_text = self.prefix
        # preprocess the target text
        try:
            target_text = item['lex']['text'][0]
        except (KeyError, IndexError):
            print("2")
            print(item)
            #print(item['original_triple_sets']['otriple_set'])
            target_text = ""
        #print(item)
        #print(input_text)
        # encode the inputs and targets using the tokenizer
        input_ids = tokenizer.encode(input_text, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
        target_ids = tokenizer.encode(target_text, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
        return input_ids.squeeze(0), target_ids.squeeze(0)


In [None]:
MAX_INPUT_LENGTH = 128
MAX_TARGET_LENGTH = 128
tokenizer.model_max_length = MAX_INPUT_LENGTH
model.module.config.max_length = MAX_TARGET_LENGTH

# set up the device (GPU or CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

In [2]:
# load the WebNLG dataset
dataset = load_dataset('web_nlg', 'webnlg_challenge_2017')['train']

Downloading builder script:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading and preparing dataset web_nlg/webnlg_challenge_2017 (download: 24.32 MiB, generated: 8.99 MiB, post-processed: Unknown size, total: 33.31 MiB) to /root/.cache/huggingface/datasets/web_nlg/webnlg_challenge_2017/0.0.0/28ffb892f7f42450dd9558684aa43bcaf44b1b3bf0d77cb8d73534646af88dda...


Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/6940 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4615 [00:00<?, ? examples/s]

Dataset web_nlg downloaded and prepared to /root/.cache/huggingface/datasets/web_nlg/webnlg_challenge_2017/0.0.0/28ffb892f7f42450dd9558684aa43bcaf44b1b3bf0d77cb8d73534646af88dda. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
dataset[0]['lex']['text']

['The Aarhus is the airport of Aarhus, Denmark.',
 'Aarhus Airport serves the city of Aarhus, Denmark.']

In [9]:
for i in range(len(dataset)):
    if dataset[i]['category'] == "Politician":
        print(dataset[i]['lex']['text'])
        print(' ')

# Adaptive pretraining

For STA, we fine-tuned the PLMs on a small amount of labeled data from the target task using a maximum likelihood estimation (MLE) objective. This involves training the model to maximize the likelihood of generating the correct output given the input graph and labeled data. This process helps to further adapt the PLM to the specific requirements of the target task and improve its performance on that task.

In [7]:
import random

pretrain_texts = []
for sample in dataset:
    try:
        text = sample['lex']['text'][0]
        pretrain_texts.append(text)
    except (KeyError, IndexError):
        continue

tokenized_inputs = tokenizer(pretrain_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
input_ids = tokenized_inputs['input_ids']
attention_mask = tokenized_inputs['attention_mask']

pretrain_data = torch.utils.data.TensorDataset(input_ids, attention_mask)

pretrain_loader = torch.utils.data.DataLoader(pretrain_data, batch_size=int(60), shuffle=True)

pretrain_optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
pretrain_criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

pretrain_epochs = 2  # Set the number of pre-training epochs
masking_prob = 0.15  # Probability of masking a token

if tokenizer.mask_token is None:
    # Manually set a mask token if not already defined
    tokenizer.add_tokens("[MASK]")
    tokenizer.mask_token = "[MASK]"
    tokenizer.mask_token_id = tokenizer.convert_tokens_to_ids("[MASK]")

for epoch in range(pretrain_epochs):
    running_loss = 0.0
    for inputs, attention_mask in pretrain_loader:
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)
        batch_size, seq_length = inputs.shape
        
        # Create a mask for randomly selected tokens
        mask = torch.rand(inputs.shape) < masking_prob
        
        # Randomly replace selected tokens with [MASK] token
        masked_inputs = inputs.clone()
        masked_inputs[mask] = tokenizer.mask_token_id
        
        pretrain_optimizer.zero_grad()
        outputs = model(input_ids=masked_inputs, attention_mask=attention_mask, decoder_input_ids=inputs)
        
        # Compute the loss only for the masked tokens
        masked_logits = outputs.logits[mask]
        masked_labels = inputs[mask]
        loss = pretrain_criterion(masked_logits.view(-1, masked_logits.size(-1)), masked_labels.view(-1))
        
        loss.backward()
        pretrain_optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(pretrain_data)
    print(f"Pretrain Epoch {epoch+1}/{pretrain_epochs} - loss: {epoch_loss:.4f}")


Pretrain Epoch 1/2 - loss: 3.2586
Pretrain Epoch 2/2 - loss: 0.6002


For LMA, we first fine-tuned the PLMs on a small amount of task-specific data using a masked language modeling objective. This involves randomly masking some tokens in the input sequence and training the model to predict the masked tokens based on the context provided by the unmasked tokens. This process helps to adapt the PLM to the specific characteristics of the target task and improve its performance on that task.

# Finetuning

In [8]:
# set up the data loader
train_data = WebNLGDataset(dataset)
batch_size = 32 #16
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

In [9]:
# set up the optimizer and the loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) #3e-5
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)


In [10]:
# fine-tune the model
num_epochs = 2
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, labels=targets)
        loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), targets.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_data)
    print(f"Epoch {epoch+1}/{num_epochs} - loss: {epoch_loss:.4f}")




Epoch 1/2 - loss: 0.8788
Epoch 2/2 - loss: 0.6847


In [16]:
# Save the entire model
torch.save(model, 'model_T5_flan_small_64')
print("Model saved successfully.")

Model saved successfully.


In [40]:
# Load the model
#model = torch.load('/kaggle/input/models/model_with_CCE_masked_pretraining_multipe_triples_71')

# Print a confirmation message
print("Model loaded successfully.")

Model loaded successfully.


## are we accounting for the multiple texts targets in the bleu? it doesn't look like it

In [12]:
!pip install sacrebleu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [13]:
batch_size=32

In [14]:
from sacrebleu import corpus_bleu
from random import sample
from tqdm import tqdm


# load the WebNLG validation dataset
validation_dataset = load_dataset('web_nlg', 'webnlg_challenge_2017')['test']
validation_dataset = [sample for sample in validation_dataset if sample['lex']['text']] # filter out samples with empty targets 
# Select a subset of the validation dataset
#subset_size = 10  # Choose the desired subset size
#validation_subset = sample(list(validation_dataset), subset_size)
validation_data = WebNLGDataset(validation_dataset)

# set up the validation data loader
validation_loader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)

# switch model to evaluation mode
model.eval()

# generate predictions for the validation dataset
predictions = []
references = []
with torch.no_grad():
    for inputs, targets in tqdm(validation_loader, desc='Validation Progress', leave=False):
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model.module.generate(inputs, max_length=MAX_TARGET_LENGTH, num_beams=4)
        # convert token IDs to strings
        predicted_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        target_texts = tokenizer.batch_decode(targets, skip_special_tokens=True)
        # append predicted and target texts for BLEU evaluation
        predictions.extend(predicted_texts)
        references.extend(target_texts)


  0%|          | 0/3 [00:00<?, ?it/s]

                                                                    

In [15]:
# calculate BLEU scores
#bleu = corpus_bleu(predictions, [references])

multiple_references = []
for i in range(len(validation_dataset)):
    multiple_references.append(validation_dataset[i]['lex']['text'])
    
bleu = corpus_bleu(predictions, references)
bleu_multiple = corpus_bleu(predictions, multiple_references)

print(f"BLEU score: {bleu.score}")
print(f"BLEU score with multiple references: {bleu_multiple.score}")

BLEU score: 0.4864375668364918
BLEU score with multiple references: 63.894310424627285


In [26]:
from sacrebleu import corpus_chrf
# Calculate CHR F++ scores
chrf = corpus_chrf(predictions, [references])
chrf_multiple = corpus_chrf(predictions, multiple_references)
print(f"CHR F++ score: {chrf.score}")
print(f"CHR F++ score with multiple references: {chrf_multiple.score}")

CHR F++ score: 61.33271855251572
CHR F++ score with multiple references: 62.0921446150697


In [18]:
i=5
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Politician', 'size': 1, 'eid': 'Id6', 'original_triple_sets': {'otriple_set': [['Abdul_Taib_Mahmud | successor | Sulaiman_Abdul_Rahman_Taib']]}, 'modified_triple_sets': {'mtriple_set': [['Abdul_Taib_Mahmud | successor | Sulaiman_Abdul_Rahman_Taib']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good', 'good'], 'lid': ['Id1', 'Id2', 'Id3'], 'text': ["Abdul Taib Mahmud's successor was Sulaiman Abdul Rahman Taib.", 'Abdul Taib Mahmud was succeded by Sulaiman Abdul Rahman Taib.', 'The sucessor to Abdul Taib Mahmud was Sulaiman Abdul Rahman Taib.'], 'lang': ['', '', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
[['Abdul_Taib_Mahmud | successor | Sulaiman_Abdul_Rahman_Taib']]
Abdul Taib Mohammed was succeeded by Sulaiman Abdul Rasherman Taib.
["Abdul Taib Mahmud's successor was Sulaiman Abdul Rahman Taib.", 'Abdul Taib Mahmud was succeded by Sulaiman Abdul Rahman Taib.', 'The sucessor to Abdul Taib Mahmud was Sulaiman Abdul

In [19]:
i=10
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Politician', 'size': 1, 'eid': 'Id11', 'original_triple_sets': {'otriple_set': [['Abner_W._Sibal | deathPlace | Alexandria,_Virginia']]}, 'modified_triple_sets': {'mtriple_set': [['Abner_W._Sibal | deathPlace | Alexandria,_Virginia']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good', 'good'], 'lid': ['Id1', 'Id2', 'Id3'], 'text': ['Abner W Sibal died in Alexandria, Virginia.', 'Abner W. Sibal died in Alexandria, Virginia.', 'Abner W Sibal died in Alexandria, Virginia.'], 'lang': ['', '', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
[['Abner_W._Sibal | deathPlace | Alexandria,_Virginia']]
Abner W. Sibal died in Alexandria, Virginia.
['Abner W Sibal died in Alexandria, Virginia.', 'Abner W. Sibal died in Alexandria, Virginia.', 'Abner W Sibal died in Alexandria, Virginia.']


In [20]:
i=50
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Politician', 'size': 1, 'eid': 'Id51', 'original_triple_sets': {'otriple_set': [['United_States_Army | battles | Spanish–American_War'], ['United_States_Army | battle | Spanish–American_War']]}, 'modified_triple_sets': {'mtriple_set': [['United_States_Army | battles | Spanish–American_War']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good'], 'lid': ['Id1'], 'text': ['The United States Army was involved in battles in the Spanish-American War.'], 'lang': ['']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
[['United_States_Army | battles | Spanish–American_War'], ['United_States_Army | battle | Spanish–American_War']]
The United States Army fought in the Spanish-American War.
['The United States Army was involved in battles in the Spanish-American War.']


In [21]:
i=0
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Politician', 'size': 1, 'eid': 'Id1', 'original_triple_sets': {'otriple_set': [['Aaron_S._Daggett | award | Purple_Heart']]}, 'modified_triple_sets': {'mtriple_set': [['Aaron_S._Daggett | award | Purple_Heart']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good'], 'lid': ['Id1', 'Id2'], 'text': ['Aaron S Daggett was awarded the Purple Heart.', 'Aaron S. Daggett was awarded the Purple Heart.'], 'lang': ['', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
[['Aaron_S._Daggett | award | Purple_Heart']]
Aaron S. Daggett won the Purple Heart.
['Aaron S Daggett was awarded the Purple Heart.', 'Aaron S. Daggett was awarded the Purple Heart.']


### known error, sometimes the prompt leaks into the output 

In [22]:
i=70
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'MeanOfTransportation', 'size': 1, 'eid': 'Id71', 'original_triple_sets': {'otriple_set': [['Alfa_Romeo_164 | relatedMeanOfTransportation | Lancia_Thema'], ['Alfa_Romeo_164 | related | Lancia_Thema']]}, 'modified_triple_sets': {'mtriple_set': [['Alfa_Romeo_164 | relatedMeanOfTransportation | Lancia_Thema']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good'], 'lid': ['Id1', 'Id2'], 'text': ['Alfa Romeo 164 and Lancia Thema are related types of transportation.', 'The related transport to the Alfa Romeo 164 is the Lancia Thema.'], 'lang': ['', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
[['Alfa_Romeo_164 | relatedMeanOfTransportation | Lancia_Thema'], ['Alfa_Romeo_164 | related | Lancia_Thema']]
The Alfa Romeo 164 is related to the Lancia Thema.
['Alfa Romeo 164 and Lancia Thema are related types of transportation.', 'The related transport to the Alfa Romeo 164 is the Lancia Thema.']


In [23]:
i=130
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Athlete', 'size': 1, 'eid': 'Id131', 'original_triple_sets': {'otriple_set': [['Aleksandr_Prudnikov | team | FC_Amkar_Perm'], ['Aleksandr_Prudnikov | clubs | FC_Amkar_Perm']]}, 'modified_triple_sets': {'mtriple_set': [['Aleksandr_Prudnikov | club | FC_Amkar_Perm']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good'], 'lid': ['Id1', 'Id2'], 'text': ['Aleksandr Prudnikov plays for FC Amkar Perm.', 'Aleksandr Prudnikov plays for the FC Amkar Perm football club.'], 'lang': ['', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
[['Aleksandr_Prudnikov | team | FC_Amkar_Perm'], ['Aleksandr_Prudnikov | clubs | FC_Amkar_Perm']]
Aleksandr Prudnikov plays for FC Amkar Perm.
['Aleksandr Prudnikov plays for FC Amkar Perm.', 'Aleksandr Prudnikov plays for the FC Amkar Perm football club.']


In [24]:
i=1861
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Astronaut', 'size': 7, 'eid': 'Id971', 'original_triple_sets': {'otriple_set': [['William_Anders | dateOfRet | "1969-09-01"^^xsd:date', 'William_Anders | selection | 1963', 'William_Anders | timeInSpace | "8820.0"^^<http://dbpedia.org/datatype/minute>', 'William_Anders | birthDate | "1933-10-17"^^xsd:date', 'William_Anders | occupation | Fighter_pilot', 'William_Anders | birthPlace | British_Hong_Kong', 'William_Anders | mission | Apollo_8']]}, 'modified_triple_sets': {'mtriple_set': [['William_Anders | dateOfRetirement | "1969-09-01"', 'William_Anders | was selected by NASA | 1963', 'William_Anders | timeInSpace | "8820.0"(minutes)', 'William_Anders | birthDate | "1933-10-17"', 'William_Anders | occupation | Fighter_pilot', 'William_Anders | birthPlace | British_Hong_Kong', 'William_Anders | was a crew member of | Apollo_8']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good', 'good'], 'lid': ['Id1', 'Id2', 'Id3'], 'text': ['Test pilot William Anders was

In [25]:
i=1860
print(validation_dataset[i])
print(validation_dataset[i]['original_triple_sets']['otriple_set'])
print(predictions[i])
print(multiple_references[i])

{'category': 'Astronaut', 'size': 7, 'eid': 'Id970', 'original_triple_sets': {'otriple_set': [['William_Anders | dateOfRet | "1969-09-01"^^xsd:date', 'William_Anders | mission | Apollo_8', 'William_Anders | nationality | United_States', 'William_Anders | birthPlace | British_Hong_Kong', 'Apollo_8 | crew2Up | Buzz_Aldrin', 'Apollo_8 | crewMembers | Frank_Borman', 'Apollo_8 | operator | NASA']]}, 'modified_triple_sets': {'mtriple_set': [['William_Anders | dateOfRetirement | "1969-09-01"', 'William_Anders | was a crew member of | Apollo_8', 'William_Anders | nationality | United_States', 'William_Anders | birthPlace | British_Hong_Kong', 'Apollo_8 | backup pilot | Buzz_Aldrin', 'Apollo_8 | crewMembers | Frank_Borman', 'Apollo_8 | operator | NASA']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good', 'good'], 'lid': ['Id1', 'Id2', 'Id3'], 'text': ["William Anders was born in British Hong Kong and is a U.S Citizen. William was a member of the Apollo 8 crew (along with Frank

## there is a problem with empty target samples in the test set, we still need to check multiple triples!

In [15]:
from transformers import T5Tokenizer
from datasets import load_dataset

# Define the tokenizer
#tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Load the WebNLG dataset
dataset = load_dataset('web_nlg', 'webnlg_challenge_2017')['test']
dataset = [sample for sample in dataset if sample['lex']['text']]

# Create an instance of WebNLGDataset
webnlg_dataset = WebNLGDataset(dataset)

# Define the index of the example you want to test
example_index = 70

# Get the input and target texts for the example at the specified index
input_text, target_text = webnlg_dataset[example_index]

# Decode the input and target texts using the tokenizer
decoded_input_text = tokenizer.decode(input_text, skip_special_tokens=True)
decoded_target_text = tokenizer.decode(target_text, skip_special_tokens=True)

# Print the preprocessed input and target texts
print("Input Text:", decoded_input_text)
print("Target Text:", decoded_target_text)


  0%|          | 0/3 [00:00<?, ?it/s]

{'category': 'MeanOfTransportation', 'size': 1, 'eid': 'Id71', 'original_triple_sets': {'otriple_set': [['Alfa_Romeo_164 | relatedMeanOfTransportation | Lancia_Thema'], ['Alfa_Romeo_164 | related | Lancia_Thema']]}, 'modified_triple_sets': {'mtriple_set': [['Alfa_Romeo_164 | relatedMeanOfTransportation | Lancia_Thema']]}, 'shape': '', 'shape_type': '', 'lex': {'comment': ['good', 'good'], 'lid': ['Id1', 'Id2'], 'text': ['Alfa Romeo 164 and Lancia Thema are related types of transportation.', 'The related transport to the Alfa Romeo 164 is the Lancia Thema.'], 'lang': ['', '']}, 'test_category': 'testdata_unseen_with_lex', 'dbpedia_links': [], 'links': []}
translate from Graph to Text:  <H> Alfa_Romeo_164  <R>  relatedMeanOfTransportation  <T>  Lancia_Thema <H> Alfa_Romeo_164  <R>  related  <T>  Lancia_Thema
Input Text: translate from Graph to Text: Alfa_Romeo_164 relatedMeanOfTransportation Lancia_Thema Alfa_Romeo_164 related Lancia_Thema
Target Text: Alfa Romeo 164 and Lancia Thema are

## seeing how many empty targets there are in the testing set

In [18]:
dataset = load_dataset('web_nlg', 'webnlg_challenge_2017')['test']
count_empty_text = 0
for sample in dataset:
    if not sample['lex']['text']:
        count_empty_text += 1

print(f"Number of samples with empty 'lex' 'text' field: {count_empty_text}")


  0%|          | 0/3 [00:00<?, ?it/s]

Number of samples with empty 'lex' 'text' field: 1862


In [19]:
total_samples = len(dataset)
print(f"Total number of samples in the test dataset: {total_samples}")

Total number of samples in the test dataset: 4615
