Related article: https://www.ivanlai.project-ds.net/post/conditional-text-generation-by-fine-tuning-gpt-2

The preprocessing code is available in [this GitHub repository](https://github.com/ivanlai/Conditional_Text_Generation).

In [None]:
!nvidia-smi

### Install and import libraries

In [None]:
%%time
%%capture
!pip install transformers  
!pip install sentencepiece==0.1.94

In [None]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer, T5Tokenizer, T5ForConditionalGeneration

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output

print(f"PyTorch version: {torch.__version__}")

### Configurations

In [None]:
# Notebook-wide switches and hyper-parameters.
DEBUG           = False

INPUT_DIR       = 'articles'

# Mixed-precision settings; USE_APEX also selects the batch-size pair below.
USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

# Checkpoint name, e.g. {t5-small, t5-base, t5-large}.
# NOTE(review): the visible loading cell hard-codes "t5-small" instead of
# reading this constant - confirm which one should be authoritative.
MODEL           = 't5-small'

UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

# NOTE(review): these candidate lengths look inherited from a GPT-2 setup;
# confirm they are still meaningful for T5.
MAXLEN          = 768  #{768, 1024, 1280, 1600}

TRAIN_SIZE      = 0.8

# Both branches give the same effective batch size
# (TRAIN_BATCHSIZE * BATCH_UPDATE == 64); only the per-step memory differs.
if USE_APEX:
    TRAIN_BATCHSIZE = 4
    BATCH_UPDATE    = 16
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32

EPOCHS          = 4
LR              = 5e-4
EPS             = 1e-8
# A step count, so keep it an integer (previously the float 1e2).
WARMUP_STEPS    = 100

SEED            = 2020

In [None]:
def seed_everything(seed):
    """Seed every random number generator used here for reproducible runs.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only influences hash randomisation of processes started after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one; a no-op when CUDA is absent.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats the deterministic setting above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

In [None]:
# Download the training CSV (first row is the header) and cast every column
# to str in a single chained expression.
data = pd.read_csv(
    "https://raw.githubusercontent.com/duong-sau/chatbot1212/master/Model/Data/IntentClassification/POS/learn_data.csv",
    header=0,
).astype(str)

In [None]:
data['target']

In [None]:
data.iloc[1]['source']

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer

class myDataset(Dataset):
    """Map-style dataset that tokenizes (source, target) text pairs on demand.

    Args:
        tokenizer: Object exposing a Hugging Face style ``encode_plus``.
        df: DataFrame with string columns ``"source"`` and ``"target"``.
        max_len: Maximum number of tokens kept for a source text.
        target_max_len: Fixed token length of every target; shorter targets
            are padded and longer ones truncated so labels can be stacked.
    """

    def __init__(self, tokenizer, df, max_len=128, target_max_len=4):
        # The end-of-sequence marker is appended by hand to both columns.
        self.data_column = df["source"].values + '</s>'
        self.class_column = df['target'].values + '</s>'
        self.max_len = max_len
        self.target_max_len = target_max_len
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of (source, target) pairs."""
        return len(self.data_column)

    def __getitem__(self, index):
        """Tokenize the pair at ``index``.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` for the
            source text and ``label`` holding the target token ids.
        """
        # Without truncation an over-long text would exceed max_length.
        # 'longest' on a single sequence adds no padding, so padding the
        # inputs is left to whatever collates the batch.
        tokenized_inputs = self.tokenizer.encode_plus(
            self.data_column[index],
            max_length=self.max_len,
            truncation=True,
            padding='longest',
            return_tensors="pt",
        )
        # padding='max_length' replaces the deprecated pad_to_max_length=True;
        # together with truncation every label has exactly target_max_len ids.
        tokenized_targets = self.tokenizer.encode_plus(
            self.class_column[index],
            max_length=self.target_max_len,
            truncation=True,
            padding='max_length',
            return_tensors="pt",
        )
        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse a one-token sequence into a 0-d tensor.
        return {
            "input_ids": tokenized_inputs["input_ids"].squeeze(0),
            "attention_mask": tokenized_inputs["attention_mask"].squeeze(0),
            "label": tokenized_targets["input_ids"].squeeze(0),
        }

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split


def train_validate_test_split(df, train_percent=.8, seed=None):
    """Randomly split ``df`` row-wise into a training and a held-out frame.

    Despite its name the function returns two frames, not three.

    Args:
        df: DataFrame to split; any index (not only 0..n-1) is accepted.
        train_percent: Fraction of rows placed in the first frame; the row
            count is rounded down.
        seed: Optional integer seed. When given, a private generator is used,
            so the split is repeatable and the global NumPy random state is
            left untouched. When ``None`` the permutation is drawn from the
            global NumPy generator.

    Returns:
        Tuple ``(train, test)`` of DataFrames that together hold every row of
        ``df`` exactly once.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    n_rows = len(df)
    # Permute positions, not index labels: rows are selected with .iloc, which
    # would pick wrong rows (or raise) for labels of a non-default index.
    order = rng.permutation(n_rows)
    train_end = int(train_percent * n_rows)
    return df.iloc[order[:train_end]], df.iloc[order[train_end:]]


### Loading Tokenizer, Config and Model

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
# Load the pretrained "t5-small" tokenizer and model weights.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
# NOTE(review): left padding is requested here; confirm this is intended for
# an encoder-decoder model such as T5.
tokenizer.padding_side = "left"
# Bare expression: it is not the last statement of the cell, so its value is
# discarded rather than displayed.
type(tokenizer)
model = T5ForConditionalGeneration.from_pretrained("t5-small")
# Move the model to the GPU (requires CUDA to be available).
model.cuda()

In [None]:
# Split with the notebook-level TRAIN_SIZE and SEED so that the same rows end
# up in the training / validation sets on every run.
train_data, val_data = train_validate_test_split(data, train_percent=TRAIN_SIZE, seed=SEED)
train_dataset = myDataset(df=train_data, tokenizer=tokenizer)
val_dataset = myDataset(df=val_data, tokenizer=tokenizer)

f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing'

In [None]:
# Sanity check: fetch the training example at position 1 and decode its
# input ids back into text to see what the model is fed.
a = train_dataset[1]
b = tokenizer.decode(a['input_ids'])
b

In [None]:
# Fine-tune the model with the Hugging Face Trainer.
# NOTE(review): num_train_epochs is hard-coded to 5 although the configuration
# cell defines EPOCHS = 4 - confirm which value is intended.
# NOTE(review): fp16 is disabled, so fp16_opt_level (and USE_APEX, which only
# picked the batch sizes) presumably has no effect on this run - confirm.
training_args = TrainingArguments(
    output_dir="/content/",
    num_train_epochs=5,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    fp16=False,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1,
    load_best_model_at_end=False,     
)

#---------------------------------------------------#
# The validation split is passed as eval_dataset; with the "epoch" strategy
# above it is evaluated once per epoch.
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

#---------------------------------------------------#
# Run training, then write the final model to output_dir.
trainer.train()
trainer.save_model()    

In [None]:
# Put the model into evaluation mode and move it to the CPU.
model.eval()
model.to('cpu')

In [None]:
# Show the raw input token ids of the training example at position 5.
da = train_dataset[5]
da['input_ids']

In [None]:
# Smoke test: run a single "stsb"-style prompt through the model on the GPU.
model.cuda()
# NOTE(review): "compare_sentences" below is literal text inside the prompt,
# not a variable - confirm this placeholder is intended.
T5_format_sentence = 'stsb '  + "sentence1: " + "this is q sq" + ". sentence2: " + "compare_sentences"
inputs = tokenizer(T5_format_sentence, return_tensors="pt").to('cuda')
# do_sample=False: decoding without sampling.
output_sequences = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'],
                                          do_sample=False)
# skip_special_tokens is not passed here, unlike in the evaluation loop, so
# the decoded strings keep whatever special tokens were generated.
similarity = tokenizer.batch_decode(output_sequences)

In [None]:
similarity


In [None]:
output_sequences

In [None]:
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pandas as pd

# Intent-classification evaluation, run on the CPU.
# Each test sentence is scored against every candidate sentence with an
# "stsb" similarity prompt. The scores are averaged per intent, and the intent
# with the highest mean becomes the prediction.
model.to('cpu')

test_df = pd.read_csv("https://raw.githubusercontent.com/duong-sau/chatbot1212/master/Model/Data/IntentClassification/POS/test.csv", header=0)
# The candidate list is identical for every test row, so download it once
# instead of once per row.
sentence_df = pd.read_csv("https://raw.githubusercontent.com/duong-sau/chatbot1212/master/Model/Data/IntentClassification/sentence_list.csv", header=0)
columns = ["test_id", "expected", "actual"]
task_prefix = 'stsb '
tqdm.pandas()

# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
result_rows = []
# iterrows() has no length, so give tqdm the total for a usable progress bar.
for index, row in tqdm(test_df.iterrows(), total=len(test_df), leave=False):
    test_sentence = row["sentence"]
    similarities = []
    for compare_sentences in sentence_df["sentence"]:
        T5_format_sentence = task_prefix + "sentence1: " + test_sentence + ". sentence2: " + compare_sentences
        inputs = tokenizer(T5_format_sentence, return_tensors="pt", padding=True)
        output_sequences = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'],
                                          do_sample=False)
        # batch_decode returns a list with one string per sequence; a single
        # prompt yields one entry, so store that string and not the list.
        similarities.append(tokenizer.batch_decode(output_sequences, skip_special_tokens=True)[0])

    temp_df = sentence_df.copy()
    # Generated text that is not a number becomes NaN and is ignored by mean().
    temp_df['similarity'] = pd.to_numeric(similarities, errors='coerce')
    mean_df = temp_df.groupby(["intent_index"])["similarity"].mean().reset_index()
    if mean_df["similarity"].notna().any():
        # idxmax returns an index label, so look the row up with .loc.
        max_row = mean_df.loc[mean_df["similarity"].idxmax()]
        predicted_intent = max_row["intent_index"]
    else:
        # No numeric score at all for this sentence: record a missing
        # prediction instead of failing in idxmax.
        predicted_intent = None
    # NOTE(review): 'expected' stores the model's prediction and 'actual' the
    # label from test.csv; the column names are kept so existing consumers of
    # the CSV keep working - confirm the naming is intended.
    result_rows.append({'test_id': row["sentence_index"], 'expected': predicted_intent, 'actual': row["intent_index"]})

result_df = pd.DataFrame(result_rows, columns=columns)
# mode='a' appends, so re-running this cell adds another block (with header)
# to an existing T5Identity.csv.
result_df.to_csv(path_or_buf='T5Identity.csv', mode='a')


In [None]:
result_df