## Combination: Question Answering and Sentence Completion Task

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
import pandas as pd
import random

tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small", device_map={"":0})
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", device_map={"":0})

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Custom dataset class that tokenizes the data
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data_addr_item, data_addr_comp, tokenizer, max_input_length=128, max_target_length=128, mask_prob=0.3):
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.mask_prob = mask_prob

        df_item = pd.read_csv(data_addr_item)
        input_list_item = []
        for i in range(df_item.shape[0]):
            input_temp = ('Type:' + df_item.iloc[i,0] + ' Series:' + str(df_item.iloc[i,1]) + ' Color:' +  df_item.iloc[i,2] + ' Buff:' +  
                    str(df_item.iloc[i,3]) + ' Material:' + df_item.iloc[i,4] + ' Force:' +  str(df_item.iloc[i,5]) + ' Range:' + str(df_item.iloc[i,6])) + ' Attachment:' + str(df_item.iloc[i,7])

            input_list_item.append(input_temp)


        df_comp = pd.read_csv(data_addr_comp)
        input_list_comp, target_list_comp = [], []
        for i in range(df_comp.shape[0]):
            input_temp = ('Type:' + df_comp.iloc[i,0] + ' Series:' + str(df_comp.iloc[i,1]) + ' Color:' +  df_comp.iloc[i,2] + ' Buff:' +  
                    str(df_comp.iloc[i,3]) + ' Material:' + df_comp.iloc[i,4] + ' Force:' +  str(df_comp.iloc[i,5]) + ' Range:' + str(df_comp.iloc[i,6])) + ' Attachment:' + str(df_comp.iloc[i,7])
            target_temp = ('Type:' + df_comp.iloc[i,8] + ' Series_Comp:' + str(df_comp.iloc[i,9]) + ' Force_Comp:' + 
                        str(df_comp.iloc[i,10]) + ' Range_Comp:' + str(df_comp.iloc[i,11]) + ' Source:' + df_comp.iloc[i,12])

            input_list_comp.append(input_temp)
            target_list_comp.append(target_temp)

        self.dataset = {
            'input_item': input_list_item,
            'input_comp': input_list_comp,
            'target_comp': target_list_comp,
            }

    def __len__(self):
        return len(self.dataset['input_item']) + len(self.dataset['input_comp'])

    def __getitem__(self, idx):
        ## determine if the dataset will come from task1 or task2 through the idx
        ## if the idx is more than the len of self.dataset['input_item'] then it will go to task2
        
        if idx < len(self.dataset['input_item']):
            ## Task 1

            input_text = self.dataset['input_item'][idx]
            target_text = self.dataset['input_item'][idx]

            if random.random() < self.mask_prob:
                input_text = self.mask_input(input_text)

            ## add the task prefix

            ## complete item
            input_text = 'complete_item: ' + input_text

            inputs = self.tokenizer(
                input_text, 
                max_length=self.max_input_length, 
                truncation=True, 
                padding="max_length",
                return_tensors="pt"
            )

            # Tokenize inputs and targets
            targets = self.tokenizer(
                target_text, 
                max_length=self.max_target_length, 
                truncation=True, 
                padding="max_length",
                return_tensors="pt"
            )

        else:
            ## Task 2
            
            ## adjust the index to go back to zero
            new_idx = idx - len(self.dataset['input_item'])

            input_text = self.dataset['input_comp'][new_idx]
            target_text = self.dataset['target_comp'][new_idx]

            ## generate corresponding element 
            input_text = 'generate_element: ' + input_text

            inputs = self.tokenizer(
                input_text, 
                max_length=self.max_input_length, 
                truncation=True, 
                padding="max_length",
                return_tensors="pt"
            )

            # Tokenize inputs and targets
            targets = self.tokenizer(
                target_text, 
                max_length=self.max_target_length, 
                truncation=True, 
                padding="max_length",
                return_tensors="pt"
            )



        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze()
        }
    
    def mask_input(self, input_text, mask_no=4):

        parsed_input = input_text.split(' ')
        parsed_input = [i.split(':') for i in parsed_input]

        ## randomly choose from the list
        idx_choices = list(range(len(parsed_input)))

        ## mask the value
        for i in range(mask_no):
            idx = idx_choices.pop(random.choice(range(len(idx_choices))))
            parsed_input[idx][1] = '<extra_id_' + str(i) + '>'

        ## return to the input format
        new_input = ''
        for i in range(len(parsed_input)):
            new_input = new_input + parsed_input[i][0] + ':' + parsed_input[i][1]
            if i < len(parsed_input)-1:
                new_input = new_input + ' '

        return new_input

# Define the custom data collator that applies dynamic augmentation
class DataCollator(DataCollatorForSeq2Seq):
    def __init__(self, tokenizer, model):
        super().__init__(tokenizer, model)

    
    def __call__(self, features):
        # Convert list of dicts to dict of tensors
        batch = super().__call__(features)
        
        return batch

# Prepare the custom dataset
dataset = CustomDataset('data-item-only.csv', 'data.csv', tokenizer, 128, 128, mask_prob=.5)

# Prepare the augmenting data collator
data_collator = DataCollator(tokenizer, model)

In [None]:
## training arguments

training_args = TrainingArguments(
            output_dir='./results/exp',
            num_train_epochs=10,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            learning_rate= 5e-7
)

In [4]:
## init trainer

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        tokenizer=tokenizer

)

trainer.args._n_gpu = 1
trainer.train()

  trainer = Trainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,0.03
1000,0.0284
1500,0.029
2000,0.0308


TrainOutput(global_step=2140, training_loss=0.029755637356054004, metrics={'train_runtime': 92.1601, 'train_samples_per_second': 185.764, 'train_steps_per_second': 23.22, 'total_flos': 579262910300160.0, 'train_loss': 0.029755637356054004, 'epoch': 10.0})

In [5]:
## save the model weights

model.save_pretrained('./weights/exp/t5_qa_sc_model')
tokenizer.save_pretrained('./weights/exp/t5_qa_sc_tokenizer')

('./weights/exp/t5_qa_sc_tokenizer/tokenizer_config.json',
 './weights/exp/t5_qa_sc_tokenizer/special_tokens_map.json',
 './weights/exp/t5_qa_sc_tokenizer/spiece.model',
 './weights/exp/t5_qa_sc_tokenizer/added_tokens.json')

In [6]:
## OPTIONAL: Load weights 

tokenizer = T5Tokenizer.from_pretrained('./weights/exp/t5_qa_sc_tokenizer', device_map={"":0})
model = T5ForConditionalGeneration.from_pretrained('./weights/exp/t5_qa_sc_model', device_map={"":0})

In [7]:
## inference function

def generate_pairs(input_text, task_prefix, temperature=1):
    input_ids= tokenizer(task_prefix + input_text, return_tensors='pt').input_ids

    input_ids = input_ids.to('cuda:0')
    outputs = model.generate(input_ids, max_length=128, temperature=temperature, do_sample=True)#, num_beams=5)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
## Task 1
## Complete the item information by replacing the placeholder <extra_id_#> with an existing value

input_text = 'Type:<extra_id_0> Series:Mach Color:<extra_id_1> Buff:x2 Material:<extra_id_2> Force:10 Range:40 Attachment:1'

task_prefix = "complete_item: "
output_item = []
for i, temp in enumerate([0.5, 0.6, .7]):
    output_item.append(generate_pairs(input_text, task_prefix, temperature=temp)) 
    print('Temp: ', temp)
    print('Input: ', input_text)
    print('Generated Completion: ', output_item[i])
    print('\n')

Temp:  0.5
Input:  Type:<extra_id_0> Series:Mach Color:<extra_id_4> Buff:x2 Material:<extra_id_6> Force:10 Range:40 Attachment:1
Generated Completion:  Type:Longsword Series:Mach Color:White Buff:x2 Material:Steel Force:10 Range:40 Attachment:1


Temp:  0.6
Input:  Type:<extra_id_0> Series:Mach Color:<extra_id_4> Buff:x2 Material:<extra_id_6> Force:10 Range:40 Attachment:1
Generated Completion:  Type:Dagger Series:Mach Color:White Buff:x2 Material:Steel Force:10 Range:40 Attachment:1


Temp:  0.7
Input:  Type:<extra_id_0> Series:Mach Color:<extra_id_4> Buff:x2 Material:<extra_id_6> Force:10 Range:40 Attachment:1
Generated Completion:  Type:Dagger Series:Mach Color:White Buff:x2 Material:Steel Force:10 Range:40 Attachment:1




In [None]:
## Task 2 
## Use the completed item to search for its corresponding element

task_prefix = "generate_element: "
output_comp = []

for j in output_item:
    input_text = j
    print('Candidate Item: ' + j)
    for i, temp in enumerate([0.5, .75, 1]):
        output_comp.append(generate_pairs(input_text, task_prefix, temperature=temp)) 
        print('Temp: ', temp)
        print('Item Input: ', input_text)
        print('Generated Complement: ', output_comp[i])
        print('\n')
    print('\n')

Candidate Item: Type:Longsword Series:Mach Color:White Buff:x2 Material:Steel Force:10 Range:40 Attachment:1
Temp:  0.5
Item Input:  Type:Longsword Series:Mach Color:White Buff:x2 Material:Steel Force:10 Range:40 Attachment:1
Generated Complement:  Type:Fire Series_Comp:Mach Force_Comp:20 Range_Comp:40 Source:Steampunk


Temp:  0.75
Item Input:  Type:Longsword Series:Mach Color:White Buff:x2 Material:Steel Force:10 Range:40 Attachment:1
Generated Complement:  Type:Electric Series_Comp:Mach Force_Comp:20 Range_Comp:40 Source:Traditional


Temp:  1
Item Input:  Type:Longsword Series:Mach Color:White Buff:x2 Material:Steel Force:10 Range:40 Attachment:1
Generated Complement:  Type:Electric Series_Comp:Mach Force_Comp:40 Range_Comp:40 Source:Traditional




Candidate Item: Type:Dagger Series:Mach Color:White Buff:x2 Material:Steel Force:10 Range:40 Attachment:1
Temp:  0.5
Item Input:  Type:Dagger Series:Mach Color:White Buff:x2 Material:Steel Force:10 Range:40 Attachment:1
Generated Comple