In [None]:
# Install the notebook's third-party dependencies through pip (shell escape).
# `transformers` is installed unpinned; `sentencepiece` is pinned to 0.1.94
# (presumably needed by the T5 tokenizer - TODO confirm the pin is still required).
# `--quiet` reduces pip's console output.
!pip install transformers  --quiet
!pip install sentencepiece==0.1.94 --quiet

In [None]:
import os
import pandas as pd
import random
from transformers import TrainingArguments, Trainer, T5Tokenizer, T5ForConditionalGeneration

import torch
from torch.utils.data import Dataset


from transformers import TrainingArguments

# Model settings; 'name' holds the same identifier string that the loading
# cell further down passes to from_pretrained.
MODEL = {'name': 't5-small'}

# One cadence shared by checkpoint saving and evaluation.
strategy = 'epoch'

training_args = TrainingArguments(
    # --- output and checkpointing ---
    output_dir="/content/",
    overwrite_output_dir=True,
    save_strategy=strategy,
    save_total_limit=1,
    load_best_model_at_end=False,
    # --- evaluation ---
    evaluation_strategy=strategy,
    per_device_eval_batch_size=4,
    # --- training schedule: 4 samples x 16 accumulation steps per optimizer update ---
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    # --- optimizer ---
    learning_rate=5e-3,
    warmup_steps=100,
    weight_decay=0.01,
    adam_epsilon=1e-8,
    # --- precision, diagnostics and progress display ---
    fp16=False,
    debug="underflow_overflow",
    disable_tqdm=False,
)


def freezeLayer(model):
    """Turn off gradient tracking for every parameter of ``model.encoder``.

    The parameters are modified in place and nothing is returned.
    """
    encoder_weights = model.encoder.parameters()
    for weight in encoder_weights:
        weight.requires_grad = False

def tokenConfig(tokenizer):
    """Set ``tokenizer.padding_side`` to ``"left"`` in place; returns nothing."""
    side = "left"
    tokenizer.padding_side = side


In [None]:
# Download the training CSV (first row is the header) and cast every column to str.
data = pd.read_csv(
    "https://raw.githubusercontent.com/duong-sau/chatbot1212/master/Model/Data/IntentClassification/POS/learn_data.csv",
    header=0,
).astype(str)



In [None]:
from torch.utils.data import Dataset


class myDataset(Dataset):
    """Map-style dataset that tokenizes (source, target) text pairs on access.

    Args:
        tokenizer: Object exposing ``encode_plus(text, max_length=..., padding=...,
            truncation=..., return_tensors="pt")`` and returning ``"input_ids"`` and
            ``"attention_mask"`` tensors with a leading batch dimension of 1.
            Assumed to be a Hugging Face tokenizer - TODO confirm for other callers.
        df: DataFrame with string columns ``"source"`` and ``"target"``.
        max_len: Maximum number of tokens kept for the source text.
        target_max_len: Fixed token length of every label tensor (targets are
            truncated or padded to it). Defaults to 4, the value that used to be
            hard-coded.

    Each item is a dict with ``"input_ids"``, ``"attention_mask"`` and ``"label"``.
    """

    # Label value the loss skips (the default ``ignore_index`` of PyTorch's
    # cross-entropy loss); used to blank out padding positions in the labels.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, df, max_len=512, target_max_len=4):
        # The end-of-sequence marker is appended to the raw text up front.
        # NOTE(review): some tokenizers add '</s>' themselves - confirm this does
        # not produce a doubled EOS with the tokenizer in use.
        self.data_column = df["source"].values + '</s>'
        self.class_column = df['target'].values + '</s>'
        self.max_len = max_len
        self.target_max_len = target_max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of (source, target) pairs."""
        return len(self.data_column)

    def __getitem__(self, index):
        """Tokenize the pair at ``index`` and return its tensors."""
        # truncation=True makes max_length effective; without it over-long
        # texts pass through at full length.
        tokenized_inputs = self.tokenizer.encode_plus(self.data_column[index], max_length=self.max_len,
                                                      padding='longest', truncation=True,
                                                      return_tensors="pt")
        # padding='max_length' replaces the deprecated pad_to_max_length=True;
        # together with truncation every label has exactly target_max_len tokens,
        # so labels can always be stacked into a batch.
        tokenized_targets = self.tokenizer.encode_plus(self.class_column[index],
                                                       max_length=self.target_max_len,
                                                       padding='max_length', truncation=True,
                                                       return_tensors="pt")
        # squeeze(0) drops only the batch dimension; a bare squeeze() would also
        # collapse a one-token sequence into a 0-d tensor.
        source_ids = tokenized_inputs["input_ids"].squeeze(0)
        target_ids = tokenized_targets["input_ids"].squeeze(0)
        src_mask = tokenized_inputs["attention_mask"].squeeze(0)

        # Padding positions must not contribute to the training loss.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            target_ids = target_ids.masked_fill(target_ids == pad_id, self.IGNORE_INDEX)

        return {"input_ids": source_ids, "attention_mask": src_mask,
                "label": target_ids}

In [None]:
import numpy as np

def train_validate_test_split(df, train_percent=.8, seed=None):
    """Randomly split ``df`` into two frames.

    Rows are shuffled and selected by position, so the split works for any
    index (non-contiguous, non-integer, or left over from earlier filtering).

    Args:
        df: DataFrame to split.
        train_percent: Fraction of rows placed in the first returned frame; the
            row count is ``int(train_percent * len(df))``.
        seed: Optional integer seed for a private random generator, giving a
            repeatable split. ``None`` (the default) draws from NumPy's global
            random state.

    Returns:
        Tuple ``(train, test)``: the first ``train_percent`` share of the
        shuffled rows and the remaining rows.
    """
    m = len(df)
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Permute positions 0..m-1, not index labels: the result feeds .iloc below.
    perm = rng.permutation(m)
    train_end = int(train_percent * m)
    train = df.iloc[perm[:train_end]]
    test = df.iloc[perm[train_end:]]
    return train, test

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
# Load the tokenizer and the model from the checkpoint name configured in MODEL,
# so both always come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL['name'])
tokenConfig(tokenizer=tokenizer)
model = T5ForConditionalGeneration.from_pretrained(MODEL['name'])
# Use the GPU when one is available; fall back to the CPU instead of raising on
# runtimes without CUDA.
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Seed NumPy's global generator so the random train/validation split below is
# identical on every run of the notebook.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

train_data, val_data = train_validate_test_split(data)
train_dataset = myDataset(df = train_data, tokenizer = tokenizer)
val_dataset = myDataset(df = val_data, tokenizer = tokenizer)

# Cell output: a one-line summary of the split sizes.
f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing'

In [None]:
# Sanity check: take the training item at position 1 and decode its input ids
# back to text; the decoded string is the cell's output.
a = train_dataset[1]
b = tokenizer.decode(a['input_ids'])
b

In [None]:
# Bundle the model, the training arguments, both datasets and the tokenizer
# into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run the training loop, then save the resulting model.
trainer.train()
trainer.save_model()

In [None]:
# Switch the model to evaluation mode and move it to the CPU; the resulting
# module is the cell's output.
model.eval().to('cpu')



In [None]:
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd
# Inference below runs on the CPU.
model.to('cpu')

test_df = pd.read_csv("https://raw.githubusercontent.com/duong-sau/chatbot1212/master/Model/Data/IntentClassification/POS/test.csv", header=0)
# The candidate sentence list is the same for every test row, so it is
# downloaded once here instead of once per loop iteration.
sentence_df = pd.read_csv("https://raw.githubusercontent.com/duong-sau/chatbot1212/master/Model/Data/IntentClassification/sentence_list.csv", header=0)
columns = ["test_id", "expected", "actual"]
task_prefix = 'stsb '
tqdm.pandas()

# Result rows are collected as plain dicts and turned into a DataFrame once at
# the end: DataFrame.append was removed in pandas 2.0 and growing a frame row by
# row is quadratic.
result_rows = []
for index, row in tqdm(test_df.iterrows(), total=len(test_df), leave=False):
    test_sentence = row["sentence"]

    # Score the test sentence against every candidate sentence.
    similarities = []
    for compare_sentences in sentence_df["sentence"]:
        T5_format_sentence = task_prefix + "sentence1: " + test_sentence + ". sentence2: " + compare_sentences
        inputs = tokenizer(T5_format_sentence, return_tensors="pt", padding=True)
        output_sequences = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'],
                                          do_sample=False)
        # Exactly one prompt was generated for, so keep the single decoded string
        # rather than the one-element list batch_decode returns.
        similarities.append(tokenizer.batch_decode(output_sequences, skip_special_tokens=True)[0])

    # Generated text that is not a number becomes NaN and is ignored by mean().
    temp_df = sentence_df.assign(similarity=pd.to_numeric(similarities, errors='coerce'))
    mean_df = temp_df.groupby(["intent_index"])["similarity"].mean().reset_index()

    if mean_df["similarity"].notna().any():
        # idxmax returns an index label, so the row is looked up with .loc.
        max_row = mean_df.loc[mean_df["similarity"].idxmax()]
        best_intent = max_row["intent_index"]
    else:
        # No usable score for this sentence: record a missing prediction
        # instead of aborting the whole evaluation run.
        best_intent = float("nan")

    # NOTE(review): 'expected' receives the model's top-scoring intent and 'actual'
    # the intent_index from test.csv - confirm this naming is intended.
    result_rows.append({'test_id': row["sentence_index"], 'expected': best_intent, 'actual': row["intent_index"]})

result_df = pd.DataFrame(result_rows, columns=columns)
# NOTE(review): mode='a' appends, so re-running this cell adds a second header
# line and a second copy of the rows to the file - confirm that is wanted.
result_df.to_csv(path_or_buf='T5Identity.csv', mode='a')


In [None]:
# Show the per-sentence results built in the previous cell as this cell's output.
result_df