In [None]:
# Unpack the two archives the rest of the notebook reads from
# (later cells open input/train.csv, input/valid.csv and inference_data/).
# NOTE(review): `tar -xf` only reads .zip archives when `tar` is bsdtar/libarchive;
# with GNU tar this presumably fails — confirm on the target machine or use `unzip`.
!tar -xf input.zip
!tar -xf inference_data.zip

In [None]:
import torch
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

# --- Experiment configuration -------------------------------------------------
MODEL = "t5-small"            # checkpoint name handed to from_pretrained()
BATCH_SIZE = 48               # per-device batch size for training and evaluation
NUM_PROCS = 16                # worker processes used by Dataset.map
EPOCHS = 10                   # number of training epochs
OUT_DIR = "results_t5small"   # checkpoint and logging directory
MAX_LENGTH = 256              # token limit used for truncation and padding

# --- Data ---------------------------------------------------------------------
# Both CSV files are read with the generic "csv" loader and requested as the
# "train" split. `nrows` is presumably forwarded to the CSV reader to cap the
# number of rows loaded from each file — confirm against the datasets version in use.
dataset_train = load_dataset("csv", data_files="input/train.csv", split="train", nrows=20000)
dataset_valid = load_dataset("csv", data_files="input/valid.csv", split="train", nrows=5000)

# One size per line: training rows first, validation rows second.
print(len(dataset_train), len(dataset_valid), sep="\n")

In [None]:
tokenizer = T5Tokenizer.from_pretrained(MODEL)


def preprocess_function(examples):
    """Tokenize a batch of questions and their tags for seq2seq training.

    Args:
        examples: a batch (dict of column name -> list) with the columns
            'Title', 'Body' and 'Tags'. Tags are expected in the form
            '<tag1><tag2>...'.

    Returns:
        The tokenizer output for the prompts ("assign tag: <title> <body>"),
        extended with a "labels" entry holding the tokenized tag string.
        Padding positions in the labels are set to -100 so that they are
        ignored by the cross-entropy loss instead of being learned as targets.
    """
    inputs = [
        f"assign tag: {title} {body}"
        for (title, body) in zip(examples['Title'], examples['Body'])
    ]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # '<python><pandas>' -> 'python pandas': drop every '<', split on '>' and
    # discard the empty piece that follows the final '>'.
    cleaned_tag = [' '.join(''.join(tag.split('<')).split('>')[:-1]) for tag in examples['Tags']]

    # T5 uses the same vocabulary and settings for inputs and targets, so the
    # deprecated `as_target_tokenizer()` context manager is not needed here.
    labels = tokenizer(
        cleaned_tag,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Labels are padded to a fixed length; without masking, the loss would be
    # computed over the pad positions. -100 is the ignore index of the loss.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_token_id else -100) for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

In [None]:
# Tokenize both splits with identical settings: the preprocessing function
# receives whole batches and the work is spread over NUM_PROCS processes.
map_kwargs = {"batched": True, "num_proc": NUM_PROCS}
tokenized_train = dataset_train.map(preprocess_function, **map_kwargs)
tokenized_valid = dataset_valid.map(preprocess_function, **map_kwargs)

In [None]:
# Names of the parameters that stay trainable; everything else gets frozen.
# Layout of the whitelist, in order:
#   * feed-forward sub-layer (layer.2) of decoder blocks 1 through 4
#   * self-attention sub-layer (layer.0) of decoder block 5
#   * feed-forward sub-layer (layer.2) of decoder block 5
#   * the decoder's final layer norm
trainable_params = [
    *(
        f'decoder.block.{block}.layer.2.{leaf}'
        for block in range(1, 5)
        for leaf in (
            'DenseReluDense.wi.weight',
            'DenseReluDense.wo.weight',
            'layer_norm.weight',
        )
    ),
    *(
        f'decoder.block.5.layer.0.SelfAttention.{proj}.weight'
        for proj in ('q', 'k', 'v', 'o')
    ),
    'decoder.block.5.layer.0.layer_norm.weight',
    *(
        f'decoder.block.5.layer.2.{leaf}'
        for leaf in (
            'DenseReluDense.wi.weight',
            'DenseReluDense.wo.weight',
            'layer_norm.weight',
        )
    ),
    'decoder.final_layer_norm.weight',
]

In [None]:
# Start from the pretrained checkpoint named by MODEL.
model = T5ForConditionalGeneration.from_pretrained(MODEL)

# Freeze every parameter that is not whitelisted in `trainable_params`.
for name, param in model.named_parameters():
    if name in trainable_params:
        continue
    param.requires_grad = False

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device is {device}")
model.to(device)

# Report the overall parameter count and how many are still trainable.
total_params = sum(weight.numel() for weight in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)
print(f"{total_trainable_params:,} training parameters.")


In [None]:
# Training configuration. Checkpoints and TensorBoard logs both go to OUT_DIR.
# Evaluation and checkpointing run on the same 500-step schedule, which is
# what `load_best_model_at_end` relies on to pick a saved checkpoint.
training_args=TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='steps',
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    save_total_limit=5,
    report_to='tensorboard',
    learning_rate=0.0001,
    # Mixed precision needs a CUDA device. The notebook falls back to the CPU
    # when no GPU is visible, so only enable fp16 when CUDA is available
    # instead of failing at construction time on CPU-only machines.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4
)

# The Trainer trains on the tokenized training split and evaluates on the
# tokenized validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid
)

In [None]:
# Run fine-tuning with the Trainer configured above; whatever
# Trainer.train() returns is kept in `history`.
history = trainer.train()

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Reload the fine-tuned weights from a saved checkpoint; this rebinds `model`.
# NOTE(review): the step number 4000 is hard-coded — the directory only exists
# if training actually wrote (and kept) a checkpoint at that step; confirm
# before running, or point this at the checkpoint you want to evaluate.
model_path = './results_t5small/checkpoint-4000'
model = T5ForConditionalGeneration.from_pretrained(model_path)
# The tokenizer is not reloaded here: the line below is disabled, so later
# cells keep using the `tokenizer` object created earlier in the notebook.
#tokenizer = T5Tokenizer.from_pretrained('results_t5small')

In [None]:
def do_correction(text, model, tokenizer, max_length=256):
    """Generate a tag string for ``text`` with a seq2seq model.

    Args:
        text: the query text; it is prefixed with "assign tag: " before
            tokenization.
        model: a model exposing ``generate(...)`` and a ``device`` attribute.
        tokenizer: a callable tokenizer returning 'input_ids' and
            'attention_mask' tensors, and exposing ``decode(...)``.
        max_length: token limit used both for truncating the input and for
            the generated output. Defaults to 256.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    input_text = f"assign tag: {text}"
    # A single sequence needs no padding; truncation keeps it within max_length.
    encoded = tokenizer(
        input_text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True
    )
    # Inputs must live on the same device as the model's weights.
    device = model.device
    corrected_ids = model.generate(
        encoded['input_ids'].to(device),
        attention_mask=encoded['attention_mask'].to(device),
        max_length=max_length,
        num_beams=5, # beam search; `num_beams=1` without sampling is greedy decoding.
        early_stopping=True
    )
    corrected_sentence = tokenizer.decode(
        corrected_ids[0],
        skip_special_tokens=True
    )
    return corrected_sentence

In [None]:
import os
# Tag every regular file in inference_data/ and print the query with its tags.
# sorted() makes the output order reproducible (os.listdir order is arbitrary).
for file in sorted(os.listdir('inference_data/')):
    path = os.path.join('inference_data', file)
    # Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
    if not os.path.isfile(path):
        continue
    # The context manager closes the file handle even if reading fails.
    with open(path, 'r') as f:
        sentence = f.read()
    corrected_sentence = do_correction(sentence, model, tokenizer)
    print(f"QUERY: {sentence}\nTAGS: {corrected_sentence}")
    print('-'*80)