In [1]:
!git clone -b MLOps https://github.com/Gift-py/ai_hackathon/


fatal: destination path 'ai_hackathon' already exists and is not an empty directory.


In [2]:
%cd ai_hackathon/

/content/ai_hackathon


In [3]:
import os
import pandas as pd


In [4]:
qaps = pd.read_csv('./qaps_html.csv')
id_url = pd.read_csv('./id_url_html.csv')

In [5]:
id_url.iloc[:,3].max()


2026606

In [6]:
summaries = pd.read_csv('./summaries_html.csv')


In [7]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
from datasets import Dataset, DatasetDict
import os
import json
import csv
import pandas as pd

qa_dataset = []
for index, row in qaps.iterrows():
    context = summaries[summaries['document_id'] == row['document_id']]['summary'].values[0]
    question = row['question']
    answer = [row['answer1'], row['answer2']]
    qa_dataset.append(
        {
            'context': context,
            'question': question,
            'answer': answer
        }
    )

In [9]:
qa_dataset[1]

{'context': " At Madeline Hall, an old mansion-house near Southampton belonging to the wealthy de Versely family, lives an elderly spinster Miss Delmar, the aunt of the earl de Versely and Captain Delmar. Miss Delmar invites Arabella Mason, the daughter of a deceased, well-liked steward to stay with her as a lower-class guest in the house. Captain Delmar is known to visit his aunt at Madeline Hall frequently, accompanied by his valet Ben Keene, who is also a private marine. Captain Delmar eventually suggests that Ben should propose to Arabella, and the two marry in secret, to the frustration of Miss Delmar and Arabella's mother. The captain is able to smooth over the situation with his aunt, even after it is discovered that Arabella was six months pregnant at the time of the marriage. She later gives birth to a boy, who takes the Captain's Christian name and Ben's surname--the titular Percival Keene.\nThe family moves to Chatham, after Ben is ordered back with his detachment. Arabella 

In [10]:
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

n_samples = len(qa_dataset)
train_end = int(train_ratio * n_samples)
val_end = train_end + int(val_ratio * n_samples)
train_data = Dataset.from_list(qa_dataset[:train_end])
val_data = Dataset.from_list(qa_dataset[train_end:val_end])
test_data = Dataset.from_list(qa_dataset[val_end:])


In [11]:
data_dict = DatasetDict({
    'train': train_data,
    'validation': val_data,
    'test': test_data
})

In [12]:
!pip install transformers sentencepiece


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
from transformers import T5Tokenizer
import torch
 

tokenizer = T5Tokenizer.from_pretrained('t5-base')

def tokenize_example(example):
    context, question, answers = example['context'], example['question'], example['answer']   
    input_str = f'context: {context} </s>'
    answer = ' | '.join(answers)    
    target_str = f'question: {question} answer: {answer} </s>'
    inputs = tokenizer(input_str, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
    target = tokenizer(target_str, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    input_ids = torch.tensor(inputs['input_ids'][0])
    attention_mask = torch.tensor(inputs['attention_mask'][0])
    target_input_ids = torch.tensor(target['input_ids'][0])
    target_attention_mask = torch.tensor(target['attention_mask'][0])
    
    return {'input_ids': input_ids, 'attention_mask':attention_mask, 'decoder_input_ids':target_input_ids, 'decoder_attention_mask':target_attention_mask}



# def tokenized_to_tensor(tokenized_data):
#     input_ids = torch.tensor(tokenized_data['input_ids'])
#     attention_mask = torch.tensor(tokenized_data['attention_mask'])
#     decoder_input_ids = torch.tensor(tokenized_data['target_input_ids'])
#     decoder_attention_mask = torch.tensor(tokenized_data['target_attention_mask'])

#     dataset = TensorDataset(input_ids, attention_mask, decoder_input_ids, decoder_attention_mask)

#     return dataset

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [14]:
tokenized_dataset = data_dict.map(tokenize_example)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'])

Map:   0%|          | 0/18774 [00:00<?, ? examples/s]

  input_ids = torch.tensor(inputs['input_ids'][0])
  attention_mask = torch.tensor(inputs['attention_mask'][0])
  target_input_ids = torch.tensor(target['input_ids'][0])
  target_attention_mask = torch.tensor(target['attention_mask'][0])


Map:   0%|          | 0/2346 [00:00<?, ? examples/s]

Map:   0%|          | 0/2348 [00:00<?, ? examples/s]

In [15]:
len(tokenized_dataset['train'][0]['input_ids'])

512

In [16]:
def collate_fn(data):
    input_ids = []
    attention_mask = []
    target_ids = []
    target_attention_mask = []

    for item in data:
        input_ids.append(item["input_ids"])
        attention_mask.append(item["attention_mask"])
        target_ids.append(item["decoder_input_ids"])
        target_attention_mask.append(item["decoder_attention_mask"])


    return {
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_mask),
        "decoder_input_ids": torch.stack(target_ids),
        "decoder_attention_mask": torch.stack(target_attention_mask),
    }

In [17]:
from torch.utils.data import DataLoader, Dataset

batch_size = 4

class CustomDataset(Dataset):
    def __init__(self, dataloader):
        self.dataloader = dataloader

    def __len__(self):
        return len(self.dataloader)

    def __getitem__(self, index):
        data = next(iter(self.dataloader))
        return data['input_ids'], data['attention_mask'], data['decoder_input_ids'], data['decoder_attention_mask']

train_dataloader = DataLoader(tokenized_dataset["train"], batch_size=batch_size, num_workers=4, pin_memory=True, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(tokenized_dataset["validation"], batch_size=batch_size, num_workers=4, pin_memory=True, collate_fn=collate_fn)


train_dataset = CustomDataset(train_dataloader)
val_dataset = CustomDataset(val_dataloader)




In [18]:
len(train_dataset)

2347

In [19]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

#load the model
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model.to('cuda')

#def training args
training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs = 3,
    learning_rate = 2e-4,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    warmup_steps = 500,
    weight_decay = 0.01,
    logging_dir = './logs',
    logging_steps = 50,
    load_best_model_at_end = True,
    metric_for_best_model = 'rouge1',
    greater_is_better = 'True',
    gradient_accumulation_steps = 4,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    save_total_limit = 10,
    eval_steps = 10,
    save_steps = 10,
)




In [20]:
class CustomTrainer(Trainer):
  def compute_loss(self, model, inputs, return_outputs=False):
    labels = inputs['decoder_input_ids']
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    decoder_attention_mask = inputs['decoder_attention_mask']

    outputs = model(
        input_ids,
        attention_mask = attention_mask,
        decoder_input_ids = labels[:, :-1],
        decoder_attention_mask = decoder_attention_mask[:, :-1],
        use_cache=False
    )

    logits = outputs.logits
    labels = labels[:, :-1]
    loss_fn = torch.nn.CrossEntropyLoss()
    loss = loss_fn(logits.reshape(-1, logits.shape[-1]), labels.reshape(-1))

    inputs['decoder_input_ids'] = labels
    inputs['labels'] = labels
    inputs['logits'] = logits

    return (loss, inputs) if return_outputs else loss

In [21]:
def trainer_data_collator(batch):

    # input_ids = [item[0] for item in batch]
    # input_ids = torch.cat(input_ids, dim=0)
    # input_ids = input_ids.reshape(len(batch), -1)
    input_ids = batch[0][0]

    # attention_mask = [item[1] for item in batch]
    # attention_mask = torch.cat(attention_mask, dim=0)
    # attention_mask = attention_mask.reshape(len(batch), -1)
    attention_mask = batch[0][1]

    # decoder_input_ids = [item[2] for item in batch]
    # decoder_input_ids = torch.cat(decoder_input_ids, dim=0)
    # decoder_input_ids = decoder_input_ids.reshape(len(batch), -1)
    decoder_input_ids = batch[0][2]

    # decoder_attention_mask = [item[3] for item in batch]
    # decoder_attention_mask = torch.cat(decoder_attention_mask, dim=0)
    # decoder_attention_mask = decoder_attention_mask.reshape(len(batch), -1)
    decoder_attention_mask = batch[0][3]

    return {
        'input_ids': input_ids.long(),
        'attention_mask': attention_mask.long(),
        'decoder_input_ids': decoder_input_ids.long(),
        'decoder_attention_mask': decoder_attention_mask.long(),
    }
    

In [22]:
#define the trainer
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = trainer_data_collator,
)


In [26]:
for i in range(100):
    import gc
    gc.collect()
    torch.cuda.empty_cache()

In [24]:
trainer.train()



OutOfMemoryError: ignored

In [None]:
trainer.save_model(output_dir='./model')

In [None]:
for i in range(100):
    import gc
    gc.collect()
    torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)