## Sentence Completion Task

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
import pandas as pd
import random

tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small", device_map={"":0})
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", device_map={"":0})

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Custom dataset class that tokenizes the data
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data_addr, tokenizer, max_input_length=128, max_target_length=128, mask_prob=0.3):
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.mask_prob = mask_prob

        df_frame = pd.read_csv(data_addr)
        input_list = []
        for i in range(df_frame.shape[0]):
            input_temp = ('Type:' + df_frame.iloc[i,0] + ' Series:' + str(df_frame.iloc[i,1]) + ' Color:' +  df_frame.iloc[i,2] + ' Buff:' +  
                    str(df_frame.iloc[i,3]) + ' Material:' + df_frame.iloc[i,4] + ' Force:' +  str(df_frame.iloc[i,5]) + ' Range:' + str(df_frame.iloc[i,6])) + ' Attachment:' + str(df_frame.iloc[i,7])

            input_list.append(input_temp)

        self.dataset = {
            'input': input_list,
            }

    def __len__(self):
        return len(self.dataset['input'])

    def __getitem__(self, idx):
        input_text = self.dataset['input'][idx]
        target_text = self.dataset['input'][idx]

        if random.random() < self.mask_prob:
            input_text = self.mask_input(input_text)

        inputs = self.tokenizer(
            input_text, 
            max_length=self.max_input_length, 
            truncation=True, 
            padding="max_length",
            return_tensors="pt"
        )

        # Tokenize inputs and targets
        targets = self.tokenizer(
            target_text, 
            max_length=self.max_target_length, 
            truncation=True, 
            padding="max_length",
            return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze()
        }
    
    def mask_input(self, input_text, mask_no=4):

        parsed_input = input_text.split(' ')
        parsed_input = [i.split(':') for i in parsed_input]

        ## randomly choose from the list
        idx_choices = list(range(len(parsed_input)))

        ## mask the value
        for i in range(mask_no):
            idx = idx_choices.pop(random.choice(range(len(idx_choices))))
            parsed_input[idx][1] = '<extra_id_' + str(i) + '>'

        ## return to the input format
        new_input = ''
        for i in range(len(parsed_input)):
            new_input = new_input + parsed_input[i][0] + ':' + parsed_input[i][1]
            if i < len(parsed_input)-1:
                new_input = new_input + ' '

        return new_input

# Define the custom data collator that applies dynamic augmentation
class DataCollator(DataCollatorForSeq2Seq):
    def __init__(self, tokenizer, model):
        super().__init__(tokenizer, model)

    
    def __call__(self, features):
        # Convert list of dicts to dict of tensors
        batch = super().__call__(features)
        
        return batch

# Prepare the custom dataset
dataset = CustomDataset('data-item-only.csv', tokenizer, 128, 128, mask_prob=.5)

# Prepare the augmenting data collator
data_collator = DataCollator(tokenizer, model)

In [None]:
## training arguments

training_args = TrainingArguments(
            output_dir='./results/exp',
            num_train_epochs=50,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            learning_rate= 5e-6
)

In [4]:
## init trainer

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        tokenizer=tokenizer

)

trainer.args._n_gpu = 1
trainer.train()

  trainer = Trainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,0.0288
1000,0.0271
1500,0.0275


TrainOutput(global_step=1700, training_loss=0.027844177694881664, metrics={'train_runtime': 93.469, 'train_samples_per_second': 141.758, 'train_steps_per_second': 18.188, 'total_flos': 448319717376000.0, 'train_loss': 0.027844177694881664, 'epoch': 50.0})

In [5]:
## save the model weights

model.save_pretrained('./weights/exp/t5_sc_model')
tokenizer.save_pretrained('./weights/exp/t5_sc_tokenizer')

('./weights/exp/t5_sc_tokenizer/tokenizer_config.json',
 './weights/exp/t5_sc_tokenizer/special_tokens_map.json',
 './weights/exp/t5_sc_tokenizer/spiece.model',
 './weights/exp/t5_sc_tokenizer/added_tokens.json')

In [None]:
## OPTIONAL: Load weights 

tokenizer = T5Tokenizer.from_pretrained('./weights/exp/t5_sc_tokenizer', device_map={"":0})
model = T5ForConditionalGeneration.from_pretrained('./weights/exp/t5_sc_model', device_map={"":0})

In [6]:
## inference function

def generate_completion(input_text, temperature=1):
    input_ids= tokenizer(input_text, return_tensors='pt').input_ids

    input_ids = input_ids.to('cuda:0')
    outputs = model.generate(input_ids, max_length=128, temperature=temperature, do_sample=True)#, num_beams=5)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
## Test model for missing information completion

input_text = 'Type:<extra_id_0> Series:Mach Color:<extra_id_4> Buff:x2 Material:<extra_id_6> Force:10 Range:40 Attachment:1'

output = []
for i, temp in enumerate([0.5, 1, 1.25]):
    output.append(generate_completion(input_text, temperature=temp)) 
    print('Temp: ', temp)
    print('Input: ', input_text)
    print('Generated Completion: ', output[i])
    print('\n')

Temp:  0.5
Input:  Type:<extra_id_0> Series:Mach Color:<extra_id_4> Buff:x2 Material:<extra_id_6> Force:10 Range:40 Attachment:1
Generated Completion:  Type:Dagger Series:Mach Color:White Buff:x2 Material:Iron Force:10 Range:40 Attachment:1


Temp:  1
Input:  Type:<extra_id_0> Series:Mach Color:<extra_id_4> Buff:x2 Material:<extra_id_6> Force:10 Range:40 Attachment:1
Generated Completion:  Type:Longsword Series:Mach Color:White Buff:x2 Material:Steel Force:10 Range:40 Attachment:1


Temp:  1.25
Input:  Type:<extra_id_0> Series:Mach Color:<extra_id_4> Buff:x2 Material:<extra_id_6> Force:10 Range:40 Attachment:1
Generated Completion:  Type:Spear Series:Mach Color:Black Buff:x2 Material:Mach Color:White Buff:x2 Material:Illust Force:10 Range:40 Attachment:1


