# Upload dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
%cd /content/drive/My Drive/comp_sem_group_project/agenda_preprocess/processed

/content/drive/My Drive/comp_sem_group_project/agenda_preprocess/processed


In [3]:
import pandas as pd

In [4]:
with open("training-src.txt", "r") as file:
  train_source = file.readlines()

In [5]:
file.close()

In [6]:
with open("training-tgt.txt", "r") as file:
  train_target = file.readlines()

In [7]:
file.close()

In [8]:
with open("test-src.txt", "r") as file:
  test_source = file.readlines()

In [9]:
file.close()

In [10]:
with open("test-tgt.txt", "r") as file:
  test_target = file.readlines()

In [11]:
file.close()

In [12]:
with open("dev-src.txt", "r") as file:
  val_source = file.readlines()

In [13]:
file.close()

In [14]:
with open("dev-tgt.txt", "r") as file:
  val_target = file.readlines()

In [15]:
file.close()

# Reformat the dataset

In [16]:
train_df = pd.DataFrame(list(zip(train_source, train_target)),
                        columns=['source', 'target'])

In [17]:
train_df.head()

Unnamed: 0,source,target
0,<TITLE> constrained minimization technique for...,this paper describes the constrained minimizat...
1,"<TITLE> signfinder : using color to detect , l...","we describe an approach to detecting , locatin..."
2,"<TITLE> joint estimation of motion , structure...",we present a novel variational method for the ...
3,<TITLE> hierarchical language identification b...,due to the limitation of single-level classifi...
4,<TITLE> differentially private m-estimators . ...,this paper studies privacy preserving m-estima...


In [18]:
val_df = pd.DataFrame(list(zip(val_source, val_target)),
                        columns=['source', 'target'])

test_df = pd.DataFrame(list(zip(test_source, test_target)),
                        columns=['source', 'target'])

In [39]:
test_df.head()

Unnamed: 0,source,target
0,<TITLE> hierarchical semantic classification :...,we present a learning architecture for lexical...
1,<TITLE> prior-free and prior-dependent regret ...,we consider the stochastic multi-armed bandit ...
2,<TITLE> speech processing and retrieval in a p...,the paper presents a new application of automa...
3,<TITLE> estimating speech recognition error ra...,we address the problem of estimating the word ...
4,<TITLE> product of power spectrum and group de...,mel-frequency cepstral coefficients -lrb- mel-...


# Import stuff

In [19]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
import pandas as pd
import numpy as np
import matplotlib as plt
import re
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration
from torch.utils.data import random_split, RandomSampler
from nltk.translate.bleu_score import sentence_bleu
import seaborn as sns
from torch.utils.data import Dataset, DataLoader
import torch

In [21]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [22]:
source_arr = train_df['source']
target_arr = train_df['target']

In [23]:
class CustomDataset(Dataset):
    def __init__(self, source_arr, target_arr, tokenizer):
        self.target_arr = target_arr
        self.source_arr = source_arr
        self.tokenizer = tokenizer 
        
        def ArrayLength():
            target_arr_length = len(self.target_arr)
            if(target_arr_length == len(self.source_arr)):
                return target_arr_length
            else:
                raise Exception("Array Lengths not Equal!!!")
                
        self.arr_len = ArrayLength()
        
    def __len__(self):
        return self.arr_len
    
    def __getitem__(self, index):
        target = self.target_arr[index]
        source = self.source_arr[index]
        
        input_ = source + "translate from Graph to Text: "
        output_ = target 
        
        #Encoding our inputs
        inputs = self.tokenizer.encode_plus(input_, pad_to_max_length=True,return_attention_mask=True, max_length=250)
        #Encoding our outputs
        outputs = self.tokenizer.encode_plus(output_, pad_to_max_length=True,return_attention_mask=True, max_length=250)
        
        input_ids = inputs['input_ids']
        input_attention_masks = inputs['attention_mask']
        
        output_ids = outputs['input_ids']
        output_attention_masks = outputs['attention_mask']
        
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'input_mask': torch.tensor(input_attention_masks, dtype=torch.long),
            'output_ids': torch.tensor(output_ids, dtype=torch.long),
            'output_mask': torch.tensor(output_attention_masks, dtype=torch.long)
        }   

In [24]:
#!pip install datsets transformers[sentencepiece]
#!pip install sentencepiece

In [25]:
# Loading pre-trained T5-base Tokenizer of T5-base Model

tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model.to(device);

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
tokenizer

In [27]:
new_tokens = ['<H>', '<R>', '<TITLE>']
new_tokens_vocab = {}
new_tokens_vocab['additional_special_tokens'] = []
for idx, t in enumerate(new_tokens):
  new_tokens_vocab['additional_special_tokens'].append(t)
num_added_toks = tokenizer.add_special_tokens(new_tokens_vocab)

In [28]:
tokenizer

T5Tokenizer(name_or_path='t5-base', vocab_size=32100, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<H>', '<R>', '<TITLE>']}, clean_up_tokenization_spaces=True)

# Fine-tuning

In [29]:
dataset = CustomDataset(target_arr, source_arr, tokenizer)

trainset_ratio = 0.8

data_len = len(dataset)
print("Total Data Size: ", data_len)

training_data_size = int(data_len*trainset_ratio)
print("Training Data Size: ", training_data_size)

val_data_size = data_len - training_data_size
print("Validation Data Size: ", val_data_size)

train_dataset, validation_dataset = random_split(dataset, [training_data_size, val_data_size])

batch_size = 8
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size=batch_size)

if(trainset_ratio != 1.0):
    validation_dataloader = DataLoader(validation_dataset, sampler = RandomSampler(validation_dataset), batch_size=batch_size)

Total Data Size:  38720
Training Data Size:  30976
Validation Data Size:  7744


In [30]:
LEARNING_RATE = 2e-5

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

epochs = 2
steps2report = 20

def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

for epoch in range(0,epochs):
    model.train()
    for step, data in enumerate(train_dataloader):
        optimizer.zero_grad()
        
        # Getting input parameters
        x_input_ids = data['input_ids'].to(device)
        x_masks = data['input_mask'].to(device)

        # Getting output parameters
        y_expl_ids = data['output_ids'].to(device)

        # Feeding values into our model
        outputs = model(input_ids              = x_input_ids, 
                        attention_mask         = x_masks,
                        labels                 = y_expl_ids)
        loss = outputs[0] 
        
        if step%steps2report==0:
            print("EPOCH: ", epoch, " loss.item: ", loss.item())
        
        loss.backward()
        optimizer.step() 

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


EPOCH:  0  loss.item:  10.057912826538086
EPOCH:  0  loss.item:  5.44541072845459
EPOCH:  0  loss.item:  2.550626516342163
EPOCH:  0  loss.item:  3.321807622909546
EPOCH:  0  loss.item:  1.8460559844970703
EPOCH:  0  loss.item:  1.7833049297332764
EPOCH:  0  loss.item:  1.4063384532928467
EPOCH:  0  loss.item:  1.5923938751220703
EPOCH:  0  loss.item:  1.9783505201339722
EPOCH:  0  loss.item:  1.467055320739746
EPOCH:  0  loss.item:  1.1101552248001099
EPOCH:  0  loss.item:  1.0779809951782227
EPOCH:  0  loss.item:  1.4565198421478271
EPOCH:  0  loss.item:  1.1933716535568237
EPOCH:  0  loss.item:  1.003564715385437
EPOCH:  0  loss.item:  0.9285816550254822
EPOCH:  0  loss.item:  0.9034581184387207
EPOCH:  0  loss.item:  1.003572940826416
EPOCH:  0  loss.item:  0.8155319094657898
EPOCH:  0  loss.item:  1.0507314205169678
EPOCH:  0  loss.item:  0.9162723422050476
EPOCH:  0  loss.item:  1.085796594619751
EPOCH:  0  loss.item:  0.9185289740562439
EPOCH:  0  loss.item:  0.7196663618087769


KeyboardInterrupt: ignored

# Inference

In [31]:
target_arr = test_df['target']
source_arr = test_df['source']

batch_size = 8
test_dataset = CustomDataset(target_arr, source_arr, tokenizer)
test_dataloader = DataLoader(test_dataset, sampler = RandomSampler(test_dataset), batch_size=batch_size)

In [45]:
source_arr[0]

'<TITLE> hierarchical semantic classification : word sense disambiguation with world knowledge . <H> learning architecture <R> USED-FOR <T> lexical semantic classification problems\n'

In [40]:
import torch
model.eval()

model_inputArr = []
model_outputArr = []
target_outputArr = []

with torch.no_grad():
    for step, data in enumerate(test_dataloader):
        input_ids = data["input_ids"].to(device)
        input_masks = data["input_mask"].to(device)
        explanation_ids = data["output_ids"].to(device)
        explanation_masks = data["output_mask"].to(device)
        output = model.generate(input_ids = input_ids, attention_mask = input_masks, max_length=200,do_sample=False )
        
        for input_vector, output_vector in zip(input_ids, output):
            model_inputArr.append(tokenizer.decode(input_vector, skip_special_tokens=False))
            model_outputArr.append(tokenizer.decode(output_vector, skip_special_tokens=True))
        target_outputArr.append(tokenizer.batch_decode(explanation_ids, skip_special_tokens=True))    
        
target_outputArr = [item for sublist in target_outputArr for item in sublist]



In [41]:
data_ = {
    'input' : model_inputArr,
    'model output' : model_outputArr,
    'target output' : target_outputArr
}
results_dataframe = pd.DataFrame(data = data_)

In [42]:
results_dataframe.head()

Unnamed: 0,input,model output,target output
0,action recognition in videos is a challenging ...,hybrid video classification using unsupervised...,sympathy for the details : dense trajectories ...
1,this paper considers the problem of computing ...,a method for computing placement of points in ...,stereo from uncalibrated cameras. iterative me...
2,automated musical accompaniment of human perfo...,a markov model for musical accompaniment. sequ...,modeling form for on-line following of musical...
3,symbolic bidirectional uniform-cost search is ...,abstraction heuristics in symbolic bidirection...,abstraction heuristics for symbolic bidirectio...
4,we derive a recursive general-radix pruned coo...,a recursive general-radix pruned cooley-tukey ...,generating high performance pruned fft impleme...


In [43]:
input_list = results_dataframe['input'].tolist()

In [49]:
input_list[1]

'this paper considers the problem of computing placement of points in 3 dimensional space given two uncalibrated perspective views. the main theorem shows that the placement of the points is determined only up to an arbitrary projective transformation of 3-space. given additional ground control points, however, the location of the points and the camera parameters may be determined. the method is linear and non-iterative whereas previously known methods for solving the camera calibration and placement to take proper account of both ground-control points and image correspondences are unsatisfactory in requiring either iterative methods or model restrictions. as a result of the main theorem, it is possible to determine projective invariants of 3-d geometric configurations from two perspective views. translate from Graph to Text:</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [38]:
source_arr[1]

'<TITLE> prior-free and prior-dependent regret bounds for thompson sampling . <H> distribution-free and distribution-dependent bounds <R> USED-FOR <T> non-bayesian stochastic bandit <H> reward distributions <R> FEATURE-OF <T> stochastic multi-armed bandit problem\n'

In [50]:
real_target = list(results_dataframe["target output"])
generated_target = list(results_dataframe["model output"])

In [55]:
real_target[3]

'abstraction heuristics for symbolic bidirectional search. heuristics USED-FOR T> bidi-rectional search partial and perimeter abstractions USED-FOR T> bidirectional search abstraction heuristics USED-FOR T> symbolic bidirectional search'

In [56]:
generated_target[3]

'abstraction heuristics in symbolic bidirectional search. abstraction heuristics USED-FOR T> symbolic bidirectional search abstraction heuristics USED-FOR T> symbolic bidirectional search abstraction heuristics USED-FOR T> symbolic bidirectional search abstraction heuristics USED-FOR T> symbolic bidirectional search abstract state spaces USED-FOR T> bidirectional search'

In [62]:
results_dataframe.to_csv("/content/drive/My Drive/comp_sem_group_project/generated_results.csv")