# Upload dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets transformers[sentencepiece]
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
import pandas as pd
import numpy as np
import matplotlib as plt
import re
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration
from torch.utils.data import random_split, RandomSampler
from nltk.translate.bleu_score import sentence_bleu
import seaborn as sns
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Loading pre-trained T5-base Tokenizer of T5-base Model
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model.to(device);

new_tokens = ['<H>', '<R>', '<T>', '<TITLE>']
new_tokens_vocab = {}
new_tokens_vocab['additional_special_tokens'] = []
for idx, t in enumerate(new_tokens):
  new_tokens_vocab['additional_special_tokens'].append(t)
num_added_toks = tokenizer.add_special_tokens(new_tokens_vocab)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
class CustomDataset(Dataset):
    def __init__(self, source_arr, target_arr, tokenizer):
        self.target_arr = target_arr
        self.source_arr = source_arr
        self.tokenizer = tokenizer 
        
        def ArrayLength():
            target_arr_length = len(self.target_arr)
            if(target_arr_length == len(self.source_arr)):
                return target_arr_length
            else:
                raise Exception("Array Lengths not Equal!!!")
                
        self.arr_len = ArrayLength()
        
    def __len__(self):
        return self.arr_len
    
    def __getitem__(self, index):
        target = self.target_arr[index]
        source = self.source_arr[index]
        
        input_ = source + "translate from Graph to Text: "
        output_ = target 
        
        #Encoding our inputs
        inputs = self.tokenizer.encode_plus(input_, pad_to_max_length=True,return_attention_mask=True, max_length=250)
        #Encoding our outputs
        outputs = self.tokenizer.encode_plus(output_, pad_to_max_length=True,return_attention_mask=True, max_length=250)
        
        input_ids = inputs['input_ids']
        input_attention_masks = inputs['attention_mask']
        
        output_ids = outputs['input_ids']
        output_attention_masks = outputs['attention_mask']
        
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'input_mask': torch.tensor(input_attention_masks, dtype=torch.long),
            'output_ids': torch.tensor(output_ids, dtype=torch.long),
            'output_mask': torch.tensor(output_attention_masks, dtype=torch.long)
        }   

In [9]:
%cd /content/drive/MyDrive/AGENDA/agenda_preprocess/processed
with open("training-src.txt", "r") as file:
  train_source = file.readlines()
with open("training-tgt.txt", "r") as file:
  train_target = file.readlines()
with open("test-src.txt", "r") as file:
  test_source = file.readlines()
with open("test-tgt.txt", "r") as file:
  test_target = file.readlines()
with open("dev-src.txt", "r") as file:
  val_source = file.readlines()
with open("dev-tgt.txt", "r") as file:
  val_target = file.readlines()

train_df = pd.DataFrame(list(zip(train_source, train_target)),
                        columns=['source', 'target'])
val_df = pd.DataFrame(list(zip(val_source, val_target)),
                        columns=['source', 'target'])
test_df = pd.DataFrame(list(zip(test_source, test_target)),
                        columns=['source', 'target'])

train_dataset = CustomDataset(train_df['source'], train_df['target'], tokenizer)
val_dataset = CustomDataset(val_df['source'], val_df['target'], tokenizer)
test_dataset = CustomDataset(test_df['source'], test_df['target'], tokenizer)

print("Train dataset size: ", len(train_dataset))
print("Validation dataset size: ", len(val_dataset))
print("Test dataset size: ", len(test_dataset))

batch_size = 8

train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler = RandomSampler(val_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler = RandomSampler(test_dataset), batch_size=batch_size)

/content/drive/MyDrive/AGENDA/agenda_preprocess/processed
Train dataset size:  38720
Validation dataset size:  1000
Test dataset size:  1000


# Fine-tuning

In [None]:
# ToDo: Add early stopping on the validation loss on validation set?

LEARNING_RATE = 2e-5

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

epochs = 2
steps2report = 100

def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

for epoch in tqdm(range(0,epochs)):
    model.train()
    for step, data in (train_dataloader):
        optimizer.zero_grad()
        
        # Getting input parameters
        x_input_ids = data['input_ids'].to(device)
        x_masks = data['input_mask'].to(device)

        # Getting output parameters
        y_expl_ids = data['output_ids'].to(device)

        # Feeding values into our model
        outputs = model(input_ids              = x_input_ids, 
                        attention_mask         = x_masks,
                        labels                 = y_expl_ids)
        loss = outputs[0] 
        
        if step%steps2report==0:
            print("EPOCH: ", epoch, " loss.item: ", loss.item())
        
        loss.backward()
        optimizer.step() 

In [None]:
# Save the entire model
%cd '/content'
torch.save(model, 't5_AGENDA_no_pretraining_with_prefix')
print("Model saved successfully.")

# download the model
from google.colab import files
! zip t5_AGENDA_no_pretraining_with_prefix
files.download('t5_AGENDA_no_pretraining_with_prefix.zip')

# Inference

In [87]:
target_arr = test_df['target']
source_arr = test_df['source']

batch_size = 16
test_dataset = CustomDataset(source_arr, target_arr, tokenizer)
test_dataloader = DataLoader(test_dataset, sampler = RandomSampler(test_dataset), batch_size=batch_size)

In [39]:
source_arr[0]

'<TITLE> hierarchical semantic classification : word sense disambiguation with world knowledge . <H> learning architecture <R> USED-FOR <T> lexical semantic classification problems\n'

In [88]:
# so slow, maybe use larger batch size for the test loader?

import torch
from tqdm import tqdm
model.eval()

model_inputArr = []
model_outputArr = []
target_outputArr = []

with torch.no_grad():
    for step, data in tqdm(enumerate(test_dataloader)):
        input_ids = data["input_ids"].to(device)
        input_masks = data["input_mask"].to(device)
        target_ids = data["output_ids"].to(device)
        target_masks = data["output_mask"].to(device)
        output = model.generate(input_ids = input_ids, attention_mask = input_masks, max_length=200,do_sample=False )
        # convert token IDs to strings
        predicted_texts = tokenizer.batch_decode(output, skip_special_tokens=True)
        target_texts = tokenizer.batch_decode(target_ids, skip_special_tokens=True)
        input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
        # for input_vector, output_vector in zip(input_ids, output):
        #     model_inputArr.append(tokenizer.decode(input_vector, skip_special_tokens=False))
        #    model_outputArr.append(tokenizer.decode(output_vector, skip_special_tokens=True))
        model_inputArr.extend(input_texts)
        target_outputArr.extend(target_texts)
        model_outputArr.extend(predicted_texts)
        
# target_outputArr = [item for sublist in target_outputArr for item in sublist]

0it [00:00, ?it/s]


OutOfMemoryError: ignored

In [76]:
data

{'input_ids': tensor([[32103,   572,   167,  ...,     0,     0,     0],
         [32103,   415,   485,  ...,     0,     0,     0],
         [32103,  2363, 27980,  ...,     0,     0,     0],
         ...,
         [32103,  2625,    18,  ...,     0,     0,     0],
         [32103,     3, 13275,  ...,     0,     0,     0],
         [32103,  5002,    13,  ...,     0,     0,     0]]),
 'input_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'output_ids': tensor([[   62, 14650,     8,  ...,     0,     0,     0],
         [16826,  2945,  1707,  ..., 16783,    30,     1],
         [   16,    48,  1040,  ...,     0,     0,     0],
         ...,
         [   16,    48,  1040,  ...,     0,     0,     0],
         [   48,  1040,  6621,  ...,     0,     0,     0],
         [ 6504, 13440,    65,  ...,     0,     0,    

In [73]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [74]:
!pip install sacrebleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.7.0 sacrebleu-2.3.1


In [75]:
from sacrebleu import corpus_bleu
from random import sample
from tqdm import tqdm

# switch model to evaluation mode
model.eval()

# generate predictions for the validation dataset
predictions = []
references = []
with torch.no_grad():
    for inputs, targets in tqdm(test_dataloader, desc='Validation Progress', leave=False):
        input_ids = data["input_ids"].to(device)
        input_masks = data["input_mask"].to(device)
        explanation_ids = data["output_ids"].to(device)
        explanation_masks = data["output_mask"].to(device)
        outputs = model.generate(inputs, max_length=200, num_beams=4, do_sample=False)
        # convert token IDs to strings
        predicted_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        target_texts = tokenizer.batch_decode(targets, skip_special_tokens=True)
        # append predicted and target texts for BLEU evaluation
        predictions.extend(predicted_texts)
        references.extend(target_texts)

# calculate BLEU scores
#bleu = corpus_bleu(predictions, [references])

multiple_references = []
for i in range(len(validation_dataset)):
    multiple_references.append(validation_dataset[i]['lex']['text'])
bleu = corpus_bleu(predictions, references)
bleu_multiple = corpus_bleu(predictions, multiple_references)

print(f"BLEU score: {bleu.score}")
print(f"BLEU score with multiple references: {bleu_multiple.score}")




ValueError: ignored

/content
Model saved successfully.


In [None]:
! pip install sacrebleu
from sacrebleu import corpus_bleu

bleu = corpus_bleu(predictions, references)

print(f"BLEU score: {bleu.score}")


In [None]:
len(target_outputArr)

In [61]:
target_outputArr = target_outputArr[0]

In [65]:
len(model_outputArr)

1000

In [66]:
len(target_outputArr)

8

In [62]:
target_outputArr[0]

'bilingual speakers are known for their ability to code-switch or mix their languages during communication. this phenomenon occurs when bilinguals substitute a word or phrase from one language with a phrase or word from another language. for code-switching speech recognition, it is essential to collect a large-scale code-switching speech database for model training. in order to ease the negative effect caused by the data sparseness problem in training code-switching speech recognizers, this study proposes a data-driven approach to phone set construction by integrating acoustic features and cross-lingual context-sensitive articulatory features into distance measure between phone units. kl-divergence and a hierarchical phone unit clustering algorithm are used in this study to cluster similar phone units to reduce the need of the training data for model construction. the experimental results show that the proposed data-driven approach outperforms other traditional phone set construction m

In [63]:
model_inputArr[0]

'<TITLE> phone set construction based on context-sensitive articulatory attributes for code-switching speech recognition. <H> data-driven approach <R> USED-FOR <T> phone set construction <H> cross-lingual context-sensitive articulatory features <R> USED-FOR <T> data-driven approach <H> data-driven approach <R> COMPARE <T> phone set construction methods <H> acoustic features <R> PART-OF <T> distance measure <H> acoustic features <R> USED-FOR <T> data-driven approach <H> large-scale code-switching speech database <R> USED-FOR <T> model training <H> acoustic features <R> CONJUNCTION <T> cross-lingual context-sensitive articulatory features translate from Graph to Text:</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

In [64]:
data_ = {
    'input' : model_inputArr,
    'model output' : model_outputArr,
    'target output' : target_outputArr
}
results_dataframe = pd.DataFrame(data = data_)

ValueError: ignored

In [49]:
results_dataframe.head()

Unnamed: 0,input,model output,target output
0,<TITLE> blind source separation in a distribut...,.Unsere Informationen Informationen.Unsere Inf...,"from an audio perspective, the present state o..."
1,<TITLE> learning to estimate human pose with d...,<extra_id_0>-<extra_id_1> <extra_id_2><extra_i...,we propose a statistical formulation for 2-d h...
2,<TITLE> localization of multiple sound sources...,Graph to Text: Graph to Text: Graph to Text: G...,accurate localization of multiple sound source...
3,<TITLE> a new class of lifting wavelet transfo...,.Unsere Graph Graph Graph Graph Graph Graph Gr...,this paper proposes a new class of lifting wav...
4,<TITLE> mining wikipedia revision histories fo...,Unsere wichtigsten wissenschaftlichen Veröffen...,a well-recognized limitation of research on su...


In [None]:
input_list = results_dataframe['input'].tolist()

In [None]:
input_list[1]

'this paper considers the problem of computing placement of points in 3 dimensional space given two uncalibrated perspective views. the main theorem shows that the placement of the points is determined only up to an arbitrary projective transformation of 3-space. given additional ground control points, however, the location of the points and the camera parameters may be determined. the method is linear and non-iterative whereas previously known methods for solving the camera calibration and placement to take proper account of both ground-control points and image correspondences are unsatisfactory in requiring either iterative methods or model restrictions. as a result of the main theorem, it is possible to determine projective invariants of 3-d geometric configurations from two perspective views. translate from Graph to Text:</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [None]:
source_arr[1]

'<TITLE> prior-free and prior-dependent regret bounds for thompson sampling . <H> distribution-free and distribution-dependent bounds <R> USED-FOR <T> non-bayesian stochastic bandit <H> reward distributions <R> FEATURE-OF <T> stochastic multi-armed bandit problem\n'

In [None]:
real_target = list(results_dataframe["target output"])
generated_target = list(results_dataframe["model output"])

In [None]:
real_target[3]

'abstraction heuristics for symbolic bidirectional search. heuristics USED-FOR T> bidi-rectional search partial and perimeter abstractions USED-FOR T> bidirectional search abstraction heuristics USED-FOR T> symbolic bidirectional search'

In [None]:
generated_target[3]

'abstraction heuristics in symbolic bidirectional search. abstraction heuristics USED-FOR T> symbolic bidirectional search abstraction heuristics USED-FOR T> symbolic bidirectional search abstraction heuristics USED-FOR T> symbolic bidirectional search abstraction heuristics USED-FOR T> symbolic bidirectional search abstract state spaces USED-FOR T> bidirectional search'

In [None]:
results_dataframe.to_csv("/content/drive/My Drive/comp_sem_group_project/generated_results.csv")