## Question Answering Task

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

## get the t5 pretrained model from huggingface

tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small", device_map={"":0})
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", device_map={"":0})

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
## Create a dataloader that will divide the 'sentences' in the dataset into question and answer pairs

def load_data(data):
    df = pd.read_csv(data)
    input_list, target_list = [], []
    for i in range(df.shape[0]):
        ## question
        input_temp = ('Type:' + df.iloc[i,0] + ' Series:' + str(df.iloc[i,1]) + ' Color:' +  df.iloc[i,2] + ' Buff:' +  
                  str(df.iloc[i,3]) + ' Material:' + df.iloc[i,4] + ' Force:' +  str(df.iloc[i,5]) + ' Range:' + str(df.iloc[i,6])) + ' Attachment:' + str(df.iloc[i,7])
        
        ## answer
        target_temp = ('Element:' + df.iloc[i,8] + ' Series_Comp:' + str(df.iloc[i,9]) + ' Force_Comp:' + 
                    str(df.iloc[i,10]) + ' Range_Comp:' + str(df.iloc[i,11]) + ' Source:' + df.iloc[i,12])

        input_list.append(input_temp)
        target_list.append(target_temp)

    return {
        'input_text': input_list,
        'target_text': target_list
    }

## load the data
data = load_data('data.csv')
dataset = Dataset.from_dict(data)

In [3]:
## Create a tokenizer for the sentence input

def preprocess(examples):
    inputs = ['generate_caps: ' + text for text in examples["input_text"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['target_text'], max_length=128, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']

    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

Map: 100%|██████████| 1447/1447 [00:00<00:00, 5763.31 examples/s]


In [4]:
## Set up the training arguments

training_args = TrainingArguments(
            output_dir='./results/exp',
            num_train_epochs=15,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.001,
            logging_dir='./logs',
)

In [5]:
## Initialize the trainer

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,

)

## Start training
trainer.args._n_gpu = 1
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,2.4815
1000,0.0335
1500,0.027
2000,0.0244
2500,0.0231


TrainOutput(global_step=2715, training_loss=0.4786847386790464, metrics={'train_runtime': 94.685, 'train_samples_per_second': 229.234, 'train_steps_per_second': 28.674, 'total_flos': 734398450237440.0, 'train_loss': 0.4786847386790464, 'epoch': 15.0})

In [8]:
## OPTIONAL: Save the model weights

model.save_pretrained('./weights/exp/t5_qa_model')
tokenizer.save_pretrained('./weights/exp/t5_qa_tokenizer')

('./weights/exp/t5_qa_tokenizer/tokenizer_config.json',
 './weights/exp/t5_qa_tokenizer/special_tokens_map.json',
 './weights/exp/t5_qa_tokenizer/spiece.model',
 './weights/exp/t5_qa_tokenizer/added_tokens.json')

In [None]:
## OPTIONAL: Load weights 

tokenizer = T5Tokenizer.from_pretrained('./weights/exp/t5_qa_tokenizer', device_map={"":0})
model = T5ForConditionalGeneration.from_pretrained('./weights/exp/t5_qa_model', device_map={"":0})

In [6]:
## Inference function

def generate_pairs(input_text,temperature=1):
    input_ids= tokenizer("generate_caps: " + input_text, return_tensors='pt').input_ids

    input_ids = input_ids.to('cuda:0')
    outputs = model.generate(input_ids, max_length=128, temperature=temperature, do_sample=True)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [7]:
## Test Model

input_text = 'Type:Dagger Series:Mach Color:White Buff:x3 Material:Iron Force:10 Range:20 Attachment:1'

for temp in [0.5, 1, 1.5]:
    output = generate_pairs(input_text, temperature=temp) 
    print('Temp: ', temp)
    print('Item Input: ', input_text)
    print('Generated Complement: ', output)
    print('\n')

Temp:  0.5
Item Input:  Type:Dagger Series:Mach Color:White Buff:x3 Material:Iron Force:10 Range:20 Attachment:1
Generated Complement:  Element:Bolt Series_Comp:Mach Force_Comp:20 Range_Comp:20 Source:Traditional


Temp:  1
Item Input:  Type:Dagger Series:Mach Color:White Buff:x3 Material:Iron Force:10 Range:20 Attachment:1
Generated Complement:  Element:Fire Series_Comp:Mach Force_Comp:20 Range_Comp:20 Source:Steampunk


Temp:  1.5
Item Input:  Type:Dagger Series:Mach Color:White Buff:x3 Material:Iron Force:10 Range:20 Attachment:1
Generated Complement:  Element:Bolt Series_Comp:Mach Force_Comp:20 Range_Comamp:20 Source:Steampunk


