# Importing necessary libraries and selecting the model to be used

In [None]:
import os

import torch
from peft import get_peft_model, PrefixTuningConfig, TaskType
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup

# Turn off the tokenizers library's own parallelism for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Prefer the GPU, but fall back to the CPU so that cells which only need a
# device handle (e.g. inference) do not crash on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Base checkpoint and the tokenizer that goes with it.
model_name_or_path = "google/flan-t5-large"
tokenizer_name_or_path = "google/flan-t5-large"


# Importing Dataset

In [2]:
import pandas as pd

# Load the raw CSV and keep only the first row seen for each distinct query.
df = pd.read_csv("/kaggle/input/financial-context-dataset/Financial_Context_Dataset.csv").drop_duplicates(
    subset='Query', keep='first'
)

## Shuffling Dataset rows

In [3]:
# Shuffle every row with a fixed seed so the row order (and everything derived
# from it further down) is reproducible across kernel restarts, then renumber
# the index from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

## Keeping only relevant columns for training

In [4]:
# Keep only the model input and target columns. The explicit copy makes `df` an
# independent frame, so later assignments into it avoid pandas'
# SettingWithCopyWarning.
df = df[['Query', 'Required Data']].copy()

In [5]:
# Display the frame to check the two remaining columns.
df

Unnamed: 0,Query,Required Data
0,What are the top 4 companies who have earned t...,Companies: General Companies\nInformation: Ret...
1,"Among Communication Services companies, which ...",Companies: Communication Services Companies\nI...
2,How does Food Products companies and unum grou...,Companies: Food Products companies; Unum Group...
3,What transformations are apparent in the finan...,Companies: Omnicom Group\nInformation: Financi...
4,What are the gains due to reversal of prior-ye...,Companies: Semiconductors & Semiconductor Equi...
...,...,...
52810,"Over the recent 19 days, how does the Stockhol...",Companies: Cadence Design Systems\nInformation...
52811,What are the Software & Services companies who...,Companies: Electronic Arts; Lockheed Martin Co...
52812,What are the 10 companies in the fast-food ind...,Companies: Application Software companies; pot...
52813,Did Charles River's roae experience small impr...,Companies: Charles River Laboratories\nInforma...


## Appending the task to be performed on the input as a prefix

In [6]:
Dataset_length = len(df.axes[0])

# Instruction prepended to every query so the model is told which fields to extract.
query_prefix = 'Analyse the following query and provide the names of the companies, the relevant information and their related concepts, and the dates for which the information is required:\nQuery: '

# Vectorised string concatenation: one pass over the column instead of a
# per-row `df.loc[i, ...]` assignment, and it does not depend on the index
# being exactly 0..n-1.
# NOTE: re-running this cell prepends the prefix a second time.
df['Query'] = query_prefix + df['Query'] + '\n'

In [7]:
# Display the frame again to check that the instruction prefix was applied.
df

Unnamed: 0,Query,Required Data
0,Analyse the following query and provide the na...,Companies: General Companies\nInformation: Ret...
1,Analyse the following query and provide the na...,Companies: Communication Services Companies\nI...
2,Analyse the following query and provide the na...,Companies: Food Products companies; Unum Group...
3,Analyse the following query and provide the na...,Companies: Omnicom Group\nInformation: Financi...
4,Analyse the following query and provide the na...,Companies: Semiconductors & Semiconductor Equi...
...,...,...
52810,Analyse the following query and provide the na...,Companies: Cadence Design Systems\nInformation...
52811,Analyse the following query and provide the na...,Companies: Electronic Arts; Lockheed Martin Co...
52812,Analyse the following query and provide the na...,Companies: Application Software companies; pot...
52813,Analyse the following query and provide the na...,Companies: Charles River Laboratories\nInforma...


# Preparing Dataset for Fine Tuning

In [8]:
from datasets import Dataset

In [9]:
# Convert the pandas frame into a Hugging Face `datasets.Dataset`.
dataset = Dataset.from_pandas(df)

In [10]:
# Show the dataset's feature names and row count.
dataset

Dataset({
    features: ['Query', 'Required Data'],
    num_rows: 52815
})

## Splitting Dataset into Training and Validation sets

In [11]:
# Hold out 20% of the rows for validation. The fixed seed keeps the split
# identical across runs, so evaluation numbers from different runs are comparable.
train_dataset, validation_dataset = dataset.train_test_split(test_size=0.2, seed=42).values()

In [12]:
from datasets.dataset_dict import DatasetDict
# Re-bind `dataset` to a DatasetDict whose held-out split is keyed "validation".
dataset = DatasetDict(
    {
        "train": train_dataset,
        "validation": validation_dataset,
    }
)

In [13]:
# Show the per-split feature names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['Query', 'Required Data'],
        num_rows: 42252
    })
    validation: Dataset({
        features: ['Query', 'Required Data'],
        num_rows: 10563
    })
})

In [14]:
# Spot-check a single training example (index 1578 appears to be an arbitrary choice).
dataset["train"][1578]

{'Query': "Analyse the following query and provide the names of the companies, the relevant information and their related concepts, and the dates for which the information is required:\nQuery: How has the variation of the service revenue, Debt Service Coverage Ratio, Accounts Payable Turnover Ratio, quick ratio of tyson foods's competitors been in the past 6 weeks and how does it compare to P&G?\n",
 'Required Data': 'Companies: Tyson Foods Peers; Procter & Gamble\nInformation: Service Revenue (Related Concepts: Total Revenue); Debt Service Coverage Ratio (Related Concepts: Earnings Before Interest, Taxes, Depreciation, and Amortization (EBITDA), Total Debt Service); Accounts Payable Turnover Ratio (Related Concepts: Total Supplier Purchases, Average Accounts Payable); Quick Ratio (Related Concepts: Cash, Cash Equivalents, Marketable Securities, Accounts Receivable, Current Liabilities)\nDates: in the past 6 weeks'}

## Tokenizing the dataset

In [15]:
# Load the tokenizer from the dedicated `tokenizer_name_or_path` setting (it was
# defined next to `model_name_or_path` but never used), so changing the
# tokenizer checkpoint in the config cell actually takes effect.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

# Token budgets: inputs and targets are padded/truncated to these lengths.
max_input_length = 256
max_output_length = 512

# Column names of the model input text and the target text.
text_column = "Query"
label_column = "Required Data"

def preprocess_function(examples):
    """Tokenize a batch of examples into fixed-length model inputs and labels.

    Reads the `text_column` and `label_column` entries of `examples`, pads or
    truncates them to `max_input_length` and `max_output_length` tokens
    respectively, and returns the tokenized inputs with an added "labels"
    entry in which every padding token id has been replaced by -100.
    """
    model_inputs = tokenizer(
        examples[text_column],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    label_ids = tokenizer(
        examples[label_column],
        max_length=max_output_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]

    # -100 is the default `ignore_index` of PyTorch's cross-entropy loss, so
    # padded label positions are masked out with it.
    padding_positions = label_ids == tokenizer.pad_token_id
    label_ids[padding_positions] = -100

    model_inputs["labels"] = label_ids
    return model_inputs

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [16]:
# The raw text columns are dropped after tokenization so that only the
# tokenizer outputs and the labels remain in each split.
raw_column_names = dataset["train"].column_names

# Tokenize both splits in batches, bypassing any cached result.
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=raw_column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset (num_proc=1):   0%|          | 0/42252 [00:00<?, ? examples/s]

Running tokenizer on dataset (num_proc=1):   0%|          | 0/10563 [00:00<?, ? examples/s]

In [17]:
# Show the tokenized splits: feature names and row counts.
processed_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 42252
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10563
    })
})

## Preparing the DataLoader

In [18]:
batch_size = 2
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]

# Reshuffle the training examples at the start of every epoch; without
# `shuffle=True` each epoch would replay the batches in exactly the same order.
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
# Evaluation only aggregates a loss over the whole split, so its order stays fixed.
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

## Preparing the PEFT Config of the model for Prefix tuning setup

In [19]:
# Prefix-tuning configuration for a sequence-to-sequence model with 100 virtual tokens.
peft_config = PrefixTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    num_virtual_tokens=100,
)

# Load the base checkpoint and wrap it with the PEFT adapter in one step,
# then report how many parameters are trainable.
model = get_peft_model(AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path), peft_config)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 4,915,200 || all params: 788,065,280 || trainable%: 0.6237


## Specifying Training Parameters

In [20]:
import torch

# Optimisation hyper-parameters.
lr = 2e-2
num_epochs = 5

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# Linear schedule with no warm-up; the total step count is
# (batches per epoch) x (number of epochs).
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * num_epochs,
)

# Start Training

In [None]:
# Train for `num_epochs` epochs; after each epoch, evaluate on the validation
# loader and print mean loss and perplexity for both splits.
model.to(device)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        # Move every tensor of the batch to the training device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Accumulate a detached copy so the running sum does not keep the graph alive.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        # The scheduler is stepped once per batch, matching its
        # `num_training_steps = len(train_dataloader) * num_epochs`.
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- evaluation pass ----
    model.eval()
    eval_loss = 0
    # NOTE(review): `eval_preds` is filled below but not read in any cell visible
    # here — confirm it is needed, since decoding every batch adds work per epoch.
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Decode the highest-scoring token at each position of the forward-pass
        # logits (this is not the output of `generate`).
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Mean per-batch loss for each split; perplexity is exp(mean loss).
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    # NOTE(review): no checkpoint is saved in the cells visible here — confirm
    # the trained adapter is persisted elsewhere.
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 21126/21126 [4:56:28<00:00,  1.19it/s]  
100%|██████████| 5282/5282 [42:57<00:00,  2.05it/s]


epoch=0: train_ppl=tensor(1.1835, device='cuda:0') train_epoch_loss=tensor(0.1685, device='cuda:0') eval_ppl=tensor(1.0367, device='cuda:0') eval_epoch_loss=tensor(0.0361, device='cuda:0')


100%|██████████| 21126/21126 [4:56:16<00:00,  1.19it/s]  
100%|██████████| 5282/5282 [42:56<00:00,  2.05it/s]


epoch=1: train_ppl=tensor(1.0928, device='cuda:0') train_epoch_loss=tensor(0.0887, device='cuda:0') eval_ppl=tensor(1.0287, device='cuda:0') eval_epoch_loss=tensor(0.0283, device='cuda:0')


100%|██████████| 21126/21126 [4:56:19<00:00,  1.19it/s]  
100%|██████████| 5282/5282 [42:56<00:00,  2.05it/s]


epoch=2: train_ppl=tensor(1.0726, device='cuda:0') train_epoch_loss=tensor(0.0701, device='cuda:0') eval_ppl=tensor(1.0173, device='cuda:0') eval_epoch_loss=tensor(0.0172, device='cuda:0')


100%|██████████| 21126/21126 [4:56:47<00:00,  1.19it/s]  
100%|██████████| 5282/5282 [42:56<00:00,  2.05it/s]


epoch=3: train_ppl=tensor(1.0550, device='cuda:0') train_epoch_loss=tensor(0.0535, device='cuda:0') eval_ppl=tensor(1.0118, device='cuda:0') eval_epoch_loss=tensor(0.0118, device='cuda:0')


100%|██████████| 21126/21126 [4:56:36<00:00,  1.19it/s]  
100%|██████████| 5282/5282 [42:56<00:00,  2.05it/s]

epoch=4: train_ppl=tensor(1.0382, device='cuda:0') train_epoch_loss=tensor(0.0375, device='cuda:0') eval_ppl=tensor(1.0076, device='cuda:0') eval_epoch_loss=tensor(0.0076, device='cuda:0')





## Testing model output on a sample query

In [None]:
query = 'What was the total debt of Amazon in 2021?'

# Build the prompt from the same `query_prefix` that was prepended to the
# training queries, so the inference prompt cannot drift from the training
# format. It is named `prompt` rather than `input` so the builtin `input()`
# is not shadowed for the rest of the session.
prompt = query_prefix + query + '\n'

# The leading space is kept from the original cell.
inputs = tokenizer(" " + prompt, return_tensors="pt")

In [None]:
# Generate a completion for the tokenized sample query and print the decoded text.
model.to(device)

# Move the tokenized prompt onto the same device as the model.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Gradients are not needed for generation.
with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=512)

print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))


['Companies: Amazon.com Inc. Information: Debt (Related Concepts: Current Liability) Dates: in 2021']
