In [10]:
import torch
!pip install transformers  --quiet
!pip install sentencepiece==0.1.94 --quiet

In [11]:
import pandas as pd
import numpy as np
from transformers import TrainingArguments, Trainer, AutoTokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset

In [12]:
### Config
MODEL = {
    'name': 't5-small',
    'data_link': "https://raw.githubusercontent.com/duong-sau/chatbot1212/master/Model/Data/IntentClassification/POS/learn_data.csv"
}
strategy = 'epoch'
training_args = TrainingArguments(
    output_dir="/content/",
    overwrite_output_dir=True,
    save_strategy=strategy,
    disable_tqdm=False,
    debug="underflow_overflow",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,
    evaluation_strategy=strategy,
    fp16=False,
    warmup_steps=100,
    learning_rate=5e-4,
    adam_epsilon=1e-8,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=False,
)

def freezeLayer(model):
    for param in model.encoder.parameters():
        param.requires_grad = False

def tokenConfig(tokenizer):
    assert tokenizer
    tokenizer.padding_side = "left"

def train_validate_test_split(df, train_percent=.8):
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    train = df.iloc[perm[:train_end]]
    test = df.iloc[perm[train_end:]]
    return train, test

In [13]:
data = pd.read_csv(MODEL['data_link'], header=0)
data = data.astype(str)

In [14]:
class myDataset(Dataset):
    def __init__(self, tokenizer, df, max_len=512):
        self.data_column = df["source"].values + '</s>'
        self.class_column = df['target'].values + '</s>'
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data_column)

    def __getitem__(self, index):
        tokenized_inputs = self.tokenizer.encode_plus(self.data_column[index], max_length=self.max_len,
                                                      padding='longest', return_tensors="pt")
        tokenized_targets = self.tokenizer.encode_plus(self.class_column[index], max_length=4, pad_to_max_length=True,
                                                       return_tensors="pt")
        source_ids = tokenized_inputs["input_ids"].squeeze()
        target_ids = tokenized_targets["input_ids"].squeeze()
        src_mask = tokenized_inputs["attention_mask"].squeeze()
        return {"input_ids": source_ids, "attention_mask": src_mask,
                "label": target_ids}


In [15]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
tokenConfig(tokenizer=tokenizer)
model = T5ForConditionalGeneration.from_pretrained("t5-small")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [16]:
train_data, val_data = train_validate_test_split(data)
train_dataset = myDataset(df = train_data, tokenizer = tokenizer)
val_dataset = myDataset(df = val_data, tokenizer = tokenizer)

assert_data = train_dataset.__getitem__(1211)
assert_inputs = assert_data['input_ids']
assert assert_inputs[-1] == 1
assert_label = assert_data['label']
assert assert_label[-1] == 1

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)
trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 27974
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 2185


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 