In [1]:
# Install the runtime dependencies into the notebook environment; --quiet suppresses pip output.
# NOTE(review): transformers is installed without a version pin while sentencepiece is pinned
# to 0.1.94, so a fresh run may pick up a different transformers release - consider pinning it.
!pip install transformers  --quiet
!pip install sentencepiece==0.1.94 --quiet

In [2]:
# Mount Google Drive at /content/drive. The training output directory configured
# further down (/content/drive/MyDrive) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import torch
import pandas as pd
import numpy as np
from transformers import TrainingArguments, Trainer, AutoTokenizer, T5ForConditionalGeneration, T5Config
from transformers.optimization import Adafactor, AdafactorSchedule
from torch.utils.data import Dataset

In [4]:
### Config
# Model/data settings read by the cells below.
MODEL = {
    'name': 't5-small',  # checkpoint name passed to every from_pretrained() call
    'data_link': "https://raw.githubusercontent.com/duong-sau/chatbot1212/master/Model/Data/IntentClassification/POS/learn_data.csv",  # CSV loaded with pd.read_csv
    'num_decoder_layers': 6,  # written to config.num_decoder_layers before the model is loaded
    'num_freeze': 5  # number of leading encoder blocks passed to freezeLayer()
}
# Checkpoint cadence; only save_strategy reads this (evaluation_strategy is set separately below).
strategy = 'epoch'
# Hyper-parameters handed to the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive",  # checkpoints go directly under the mounted Drive folder
    overwrite_output_dir=True,
    save_strategy=strategy,
    disable_tqdm=False,
    debug="underflow_overflow",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,  # 4 examples x 16 accumulation steps = 64 examples per optimizer step per device
    evaluation_strategy='epoch',
    #logging_steps = 16,
    #eval_steps=16,
    fp16=False,
    warmup_steps=100,
    learning_rate=1e-3,
    adam_epsilon=1e-3,  # NOTE(review): much larger than the commonly used 1e-8 Adam epsilon - confirm this is intended
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=False,
)

def getOptimizer(model):
    """Build an Adafactor optimizer over every parameter of ``model``.

    The optimizer uses a fixed learning rate of 1e-3, with relative-step
    scaling and warm-up initialisation both switched off.
    """
    optimizer = Adafactor(
        model.parameters(),
        lr=1e-3,
        relative_step=False,
        warmup_init=False,
    )
    return optimizer

def freezeLayer(model, freeze):
    """Disable gradient tracking for the first ``freeze`` encoder blocks.

    Every parameter found in ``model.base_model.encoder.block[:freeze]`` gets
    ``requires_grad = False``; all other parameters are left untouched.
    """
    frozen_blocks = model.base_model.encoder.block[:freeze]
    # Walk the selected blocks lazily and flip the flag on each weight in turn.
    weights = (weight for block in frozen_blocks for weight in block.parameters())
    for weight in weights:
        weight.requires_grad = False

def tokenConfig(tokenizer):
    """Configure ``tokenizer`` in place so that padding is added on the left.

    Raises ``AssertionError`` when ``tokenizer`` is falsy (for example ``None``).
    """
    assert tokenizer
    setattr(tokenizer, "padding_side", "left")

def train_validate_test_split(df, train_percent=.8, seed=None):
    """Randomly split ``df`` into a training part and a held-out part.

    Args:
        df: DataFrame to split. Any index is accepted; it does not have to be
            the default 0..n-1 range.
        train_percent: Fraction of the rows placed in the first returned frame.
        seed: Optional integer seed for a reproducible shuffle. When ``None``
            (the default) NumPy's global random state is used.

    Returns:
        ``(train, test)``: two DataFrames that together contain every row of
        ``df`` exactly once. Despite the function name, no separate validation
        frame is produced.
    """
    m = len(df.index)
    # Shuffle row *positions*, not index labels: the result is fed to ``.iloc``,
    # which is purely positional. Permuting ``df.index`` itself only works when
    # the index happens to be the default RangeIndex.
    if seed is None:
        perm = np.random.permutation(m)
    else:
        perm = np.random.RandomState(seed).permutation(m)
    train_end = int(train_percent * m)
    train = df.iloc[perm[:train_end]]
    test = df.iloc[perm[train_end:]]
    return train, test

In [5]:
# Fetch the training CSV (its first row is the header) and cast every column
# to str, in one chained expression.
data = pd.read_csv(MODEL['data_link'], header=0).astype(str)

In [6]:
class myDataset(Dataset):
    """Map-style dataset that tokenizes one (source, target) text pair per item.

    Args:
        tokenizer: Object exposing ``encode_plus(text, **kwargs)`` which, when
            called with ``return_tensors="pt"``, returns a mapping holding
            ``"input_ids"`` and ``"attention_mask"`` tensors with a leading
            batch dimension of size 1.
        df: DataFrame with string columns ``"source"`` and ``"target"``.
        max_len: Maximum number of tokens kept for a source text.
        target_max_len: Fixed token length every target is padded/truncated to.
    """

    def __init__(self, tokenizer, df, max_len=512, target_max_len=4):
        # The end-of-sequence marker is appended to the raw text before tokenizing.
        # NOTE(review): some tokenizers append EOS on their own - confirm this
        # does not produce a duplicated EOS token.
        self.data_column = df["source"].values + '</s>'
        self.class_column = df['target'].values + '</s>'
        self.max_len = max_len
        self.target_max_len = target_max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of (source, target) pairs."""
        return len(self.data_column)

    def __getitem__(self, index):
        """Tokenize the pair at ``index``.

        Returns:
            dict with 1-D tensors ``"input_ids"`` and ``"attention_mask"`` for
            the source text and ``"label"`` for the target token ids.
        """
        # Truncation is requested explicitly instead of relying on the
        # tokenizer's implicit fallback (which only emits a warning). A single
        # example cannot be padded to a "longest" peer, so no per-item padding
        # is requested for the source.
        tokenized_inputs = self.tokenizer.encode_plus(
            self.data_column[index], max_length=self.max_len,
            truncation=True, return_tensors="pt")
        # ``padding='max_length'`` replaces the deprecated ``pad_to_max_length=True``.
        # NOTE(review): padded label positions keep the pad token id, so they
        # presumably still count towards the loss - consider masking them.
        tokenized_targets = self.tokenizer.encode_plus(
            self.class_column[index], max_length=self.target_max_len,
            padding='max_length', truncation=True, return_tensors="pt")
        # squeeze(0) drops only the batch dimension; a bare squeeze() would
        # also collapse a one-token sequence into a 0-d tensor.
        source_ids = tokenized_inputs["input_ids"].squeeze(0)
        target_ids = tokenized_targets["input_ids"].squeeze(0)
        src_mask = tokenized_inputs["attention_mask"].squeeze(0)
        return {"input_ids": source_ids, "attention_mask": src_mask,
                "label": target_ids}


In [7]:
class StsTrainer(Trainer):
  """Trainer subclass sketching a loss that is computed through an external classifier.

  NOTE(review): this class looks unfinished. ``tok``, ``loss_function`` and
  ``targets`` are not defined anywhere in the visible notebook cells, and the
  training cell instantiates the plain ``Trainer`` rather than this class.
  """
  def compute_loss(self,model,inputs,classifier):
    # NOTE(review): the base Trainer presumably invokes compute_loss(model, inputs, ...)
    # without a ``classifier`` argument - confirm how this override is meant to be called.
    output = model.generate(inputs)  # run generation on the inputs
    text = tok.decode(output)  # relies on a global ``tok`` that the visible cells never define
    # (original note: "convert text to ids") the decoded text is handed to the classifier as-is
    classifier_output = classifier(text)
    # ``loss_function`` and ``targets`` are also expected to exist as globals.
    loss = loss_function(classifier_output, targets)
    return loss

In [8]:
# Load the pretrained tokenizer and switch it to left-side padding via tokenConfig().
tokenizer = AutoTokenizer.from_pretrained(MODEL['name'])
tokenConfig(tokenizer=tokenizer)
assert tokenizer

# Load the checkpoint's config, set the decoder depth from MODEL, then load the weights with that config.
config = T5Config.from_pretrained(MODEL['name'])
config.num_decoder_layers = MODEL['num_decoder_layers']
model = T5ForConditionalGeneration.from_pretrained(MODEL['name'], config=config)
# Set requires_grad=False on the first MODEL['num_freeze'] encoder blocks.
freezeLayer(model, MODEL['num_freeze'])

#optimizer = getOptimizer(model)
# Place the model on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


assert model

In [9]:
# Shuffle-split the loaded frame and wrap each part in a tokenizing dataset.
train_data, val_data = train_validate_test_split(data)
train_dataset = myDataset(tokenizer=tokenizer, df=train_data)
val_dataset = myDataset(tokenizer=tokenizer, df=val_data)

# Sanity check on one training example: the encoded source and the encoded
# target must both end with token id 1.
assert_data = train_dataset[121]
assert_inputs = assert_data['input_ids']
assert assert_inputs[-1] == 1
assert_label = assert_data['label']
assert assert_label[-1] == 1

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
# Display the tokenizer's repr to inspect its configuration (e.g. padding_side).
tokenizer

PreTrainedTokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_len=512, is_fast=True, padding_side='left', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_45

In [None]:
#lr_scheduler = AdafactorSchedule(optimizer)
# Fine-tune with the stock Trainer. No optimizer or scheduler is passed in (the
# Adafactor lines are commented out), so the Trainer's built-in defaults are used.
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)
# Run the training loop configured by training_args.
trainer.train()
# Persist the trained model; with no argument this presumably writes to training_args.output_dir.
trainer.save_model()


***** Running training *****
  Num examples = 27974
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 437


Epoch,Training Loss,Validation Loss


In [None]:
# NOTE(review): this assertion always fails. Presumably it is a deliberate stop marker so
# that "Run All" halts here - confirm, and consider removing it or adding an explanatory message.
assert 1 == 0

