In [None]:
# !rm -rf /content/sample_data
# !unzip "/kaggle/input/webnlg/webnlg-dataset-master-release_v3.0.zip"
!mkdir /kaggle/working/data
!cp -r /kaggle/input/webnlg3/webnlg-dataset-master-release_v3.0/release_v3.0/en/* /kaggle/working/data
!pip install sentencepiece
!pip install transformers
!pip install deep_translator

In [None]:
import os
from xml.etree import ElementTree as ET
from collections import Counter
from functools import reduce
import operator
from torch.utils.data import IterableDataset,DataLoader
from tqdm import tqdm
import random
import re
import numpy as np
import random
from deep_translator import GoogleTranslator
import pandas as pd

In [None]:
def get_triple_and_text(filepath='/kaggle/working/data/en/train/7triples/Astronaut.xml'):
    '''
    Function that reads an xml file and generates triple text pairs.
    Arguments
      -- filepath:str
         path to the xml file
    Returns
    A generator that yields a tuple of tripleset and corresponding reference texts.
    '''
    triples, texts = [], []
    #parsing the xml file 
    for _, elem in ET.iterparse(filepath, events=("end",)):      
        if elem.tag.endswith("entry"):
            # At the end of every entry tag return triple set and texts
            # collected so far and clear accumulators
            yield triples, texts
            triples, texts = [], []
            triple, text = None, None
        elif elem.tag.endswith("lex"):
            # Each tag ending with lex contains reference texts for that entry
            text = elem.text
            texts.append(text)
        elif elem.tag.endswith("mtriple"):
            # Each tag ending with mtriple contains one triple for that entry
            triple = elem.text
            triples.append(triple)
        else:
            #Ignore all other tags
            pass
        elem.clear()

In [None]:
class dataset(IterableDataset):
    '''
    Dataset class inherited from pytorch iterabledataset.
    It works with generator instead of providing completed data set. 
    '''
    def __init__(self,sampler):
        '''
         Arguments
         -- sampler: generator object
            A generator that yields one sample at a time.
        '''
        self.sampler = sampler
    def __iter__(self):
        return self.sampler()

def get_data_loader(root,datasplit,batch_size=4):
    '''
    This Function generates a data loader.
    Arguments
         -- root: str
            Root directory of the dataset
         -- datasplit: str
            Control which data split to use. 
            It can take values "train","dev or "test"
        -- batch_size: int
            Batch of the data loader.
    Returns
         -- Data Loader object
    
    '''
    def get_data_stream_generator(root,datasplit):
        '''
        Generates one clean seq2seq sample
        Arguments
         -- root: str
            Root directory of the dataset
         -- datasplit: str
            Control which data split to use. 
            It can take values "train","dev or "test"
        Returns
         -- Generator that yields one sample on each call 
        '''
        # Get all file names containing data
        files = [path+'/'+file for path,subdir,files in os.walk(root) if datasplit in path for file in files]
        # Use generator get the triples in all files.
        triples,texts = list(zip(*[list(zip(*list(get_triple_and_text(i)))) for i in tqdm(files)]))
        triples, texts = list(reduce(operator.concat,triples)), list(reduce(operator.concat,texts))

    
        def linearize(triple):
            '''
            Converts a set of triples into a sequence.
            Arguments
                -- triple: list[str]
                   List of triples in a RDF
            Returns
                A string containing information from triples
            '''
            return 'T '+' T '.join(['<S> '+each.replace('|','<R>',1).replace('|','<O>',1) for each in triple])

        def get_sample():
            '''
            Utility function that shuffles the data.
            '''
            indexes=list(range(len(triples)))
            random.shuffle(indexes)
            for l in [(linearize(triples[i]),k) for i in indexes for k in texts[i]]:yield l
        return get_sample

    sampler = get_data_stream_generator(root,datasplit)
    data = dataset(sampler)
    loader = DataLoader(data,batch_size=batch_size)
    return loader

# Get Data Loader for training and validation
train_loader = get_data_loader('/kaggle/working/data','train',16)
val_loader = get_data_loader('/kaggle/working/data','dev',16)

In [None]:
import torch

# Clear GPU 
torch.cuda.empty_cache()
try:
    import gc
    model=None
    gc.collect()
except:
    pass
!nvidia-smi

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch import optim

# Loading T5Model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
#Adding Special tokens
tokenizer.add_tokens(['<s>', '<R>','<O>'])
model.resize_token_embeddings(len(tokenizer))

#Preparing model for training
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.Adagrad(model.parameters(),lr =1e-3)

In [None]:
def validate(model,valloader):
    '''
    Validation Loop
    Aguments
        -- model: model object
        -- valloader: Validation Data Loader
    '''
    model.eval()
    losses=[]
    #Evaluation Loop
    for data in valloader:
        input_string,target_string = data
        encoding = tokenizer(input_string,
                          padding='longest',
                          max_length=75,
                          truncation=True,
                          return_tensors="pt")
        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

        # encode the targets
        target_encoding = tokenizer(target_string,
                                    padding='longest',
                                    max_length=100,
                                    truncation=True)
        labels = target_encoding.input_ids
        # replace padding token id's of the labels by -100
        labels = [
                  [(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels
        ]
        labels = torch.tensor(labels)

        # forward pass
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        losses.append(loss.tolist())
    print('Evaluation loss ',np.array(losses).mean())

In [None]:
for epoch in tqdm(range(30)):
    losses=[]
    with torch.no_grad():
        validate(model,val_loader)
    for batch,data in enumerate(train_loader):
        input_string,target_string = data
        optimizer.zero_grad()
        encoding = tokenizer(input_string,
                          padding='longest',
                          max_length=75,
                          truncation=True,
                          return_tensors="pt")
        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

        # encode the targets
        target_encoding = tokenizer(target_string,
                                    padding='longest',
                                    max_length=100,
                                    truncation=True)
        labels = target_encoding.input_ids
        # replace padding token id's of the labels by -100
        # Token -100  is ignored by pytorch when calculating loss
        labels = [
                  [(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels
        ]
        labels = torch.tensor(labels)

        # forward pass
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        model.train()
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss

        # Backward pass
        loss.backward()
        # Gradient Update
        optimizer.step()

        losses.append(loss.tolist())
        if batch%500==499:
            print('Training Loss ',np.array(losses).mean(),' at batch ',batch,end=' | ')
            with torch.no_grad():
                validate(model,val_loader)

In [None]:
#Saving Model and tokenizer
model.save_pretrained('kaggle/working/finetuned-t5-30-model')
tokenizer.save_pretrained('kaggle/working/finetuned-t5-30-model')

In [None]:
def get_ref_targets(root,datasplit):
        '''
        Function that returns a generator  object that returns one sample at a time
        Contains multiple reference texts for one triple set.
        Arguments
            -- root:str
               Root directory of the data set
            -- datasplit: str
               DataSplit to use
        Returns
            Data generator object
        '''
        files = [path+'/'+file for path,subdir,files in os.walk(root) if datasplit in path for file in files]
        triples,texts = list(zip(*[list(zip(*list(get_triple_and_text(i)))) for i in tqdm(files)]))
        triples, texts = list(reduce(operator.concat,triples)), list(reduce(operator.concat,texts))

    
        def linearize(x):
            '''
            Converts a set of triples into a sequence.
            Arguments
                -- triple: list[str]
                   List of triples in a RDF
            Returns
                A string containing information from triples
            '''
            return 'T '+' T '.join(['<S> '+each.replace('|','<R>',1).replace('|','<O>',1) for each in x])

        def get_sample():
            '''Utility function that shuffles data.'''
            indexes=list(range(len(triples)))
            random.shuffle(indexes)
            for l in [(linearize(triples[i]),texts[i]) for i in indexes]:yield l
        return get_sample
def get_text(inp):
    '''
    Takes a linearized tripleset as input and generates Natural language text.
    Arguments
        -- inp: str
           Input Sequence
    Returns
        Model Generated text
    '''
    with torch.no_grad():
        # Tokenize input
        test_input_ids = tokenizer(inp, return_tensors='pt').input_ids
        # Send the data to gpu
        test_input_ids = test_input_ids.to(device)
        # call models generate method
        test_outputs = model.generate(test_input_ids,max_length=100)
        # Convert model output into human readable text
        test_outputs = test_outputs.to(torch.device('cpu'))
        return tokenizer.decode(test_outputs[0], skip_special_tokens=True)
test_streamer = get_ref_targets('/kaggle/working/data','dev')


In [14]:
y_pred=[]
y_true=[]
y_pred_text=[]
y_true_text=[]
inps =[]
#Inference Loop for all data samples
for data in tqdm(test_streamer()):
    inp,ref = data
    inps.append(inp)
    y=get_text(inp)
    y_pred_text.append(y)
    y_true_text.append(ref)
    y_pred.append([tokenizer.tokenize(i) for i in [y]])
    y_true.append([tokenizer.tokenize(i) for i in ref])

    

1667it [08:57,  3.10it/s]


In [18]:
# Saving Predictions
df = pd.DataFrame({'inps':inps,'y_pred_text':y_pred_text,'y_true_text':y_true_text,'y_pred':y_pred,'y_true':y_true})
df

Unnamed: 0,inps,y_pred_text,y_true_text,y_pred,y_true
0,T <S> Alan_Frew <R> associatedBand/associatedM...,Alan Frew is associated with Glass Tiger.,"[Alan Frew plays with Glass Tiger., Alan Frew ...","[[▁Alan, ▁Fre, w, ▁is, ▁associated, ▁with, ▁Gl...","[[▁Alan, ▁Fre, w, ▁plays, ▁with, ▁Glass, ▁Tige..."
1,T <S> Bakewell_pudding <R> region <O> Derbyshi...,Bakewell tart is a variation of Bakewell tart ...,[From the Derbyshire Dales region (in Derbyshi...,"[[▁Bake, well, ▁tart, ▁is, ▁, a, ▁variation, ▁...","[[▁From, ▁the, ▁Derby, shire, ▁Da, les, ▁regio..."
2,T <S> Ardmore_Airport_(New_Zealand) <R> 2ndRun...,The 2nd runway at Ardmore Airport (New Zealand...,[New Zealand's Ardmore Airport's second runway...,"[[▁The, ▁2, nd, ▁runway, ▁at, ▁Ard, more, ▁Air...","[[▁New, ▁Zealand, ', s, ▁Ard, more, ▁Airport, ..."
3,T <S> Canada <R> anthem <O> O_Canada T <S> Can...,The anthem of Canada is O Canada and the leade...,[Aaron Boogaard was born in Canada whose natio...,"[[▁The, ▁, ant, hem, ▁of, ▁Canada, ▁is, ▁O, ▁C...","[[▁Aaron, ▁Bo, oga, ard, ▁was, ▁born, ▁in, ▁Ca..."
4,T <S> Appleton_International_Airport <R> locat...,Appleton International Airport is located in G...,[The city of Appleton is served by Appleton In...,"[[▁App, leton, ▁International, ▁Airport, ▁is, ...","[[▁The, ▁city, ▁of, ▁App, leton, ▁is, ▁served,..."
...,...,...,...,...,...
1662,T <S> Antares_(rocket) <R> manufacturer <O> Yu...,"The Antares rocket, which was launched from th...",[The Antares rocket is manufactured at the Yuz...,"[[▁The, ▁Ant, are, s, ▁rocket, ,, ▁which, ▁was...","[[▁The, ▁Ant, are, s, ▁rocket, ▁is, ▁manufactu..."
1663,T <S> ACM_Transactions_on_Information_Systems ...,ACM Transactions on Information Systems (abbre...,"[ACM Transactions on Information Systems, or A...","[[▁A, CM, ▁Transaction, s, ▁on, ▁Information, ...","[[▁A, CM, ▁Transaction, s, ▁on, ▁Information, ..."
1664,T <S> Angola_International_Airport <R> locatio...,Angola International Airport is located in col...,[Angola International airport is located in Ic...,"[[▁An, gol, a, ▁International, ▁Airport, ▁is, ...","[[▁An, gol, a, ▁International, ▁airport, ▁is, ..."
1665,T <S> Ace_Wilder <R> genre <O> Hip_hop_music T...,Ace Wilder is a hip hop musician and his music...,"[Ace Wilder is an exponent of hip hop music, a...","[[▁Ace, ▁Wild, er, ▁is, ▁, a, ▁hip, ▁hop, ▁mus...","[[▁Ace, ▁Wild, er, ▁is, ▁an, ▁ex, ponent, ▁of,..."


In [19]:
df.to_csv('/kaggle/working/results.csv',index=False)

<a href="./kaggle/working/finetuned-t5-30-model/pytorch_model.bin"> Download File </a>

In [15]:
#Calculating Bleu Score
from torchtext.data.metrics import bleu_score
bleu_score([i[0] for i in y_pred], y_true)

0.5892375707626343