In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("SMSSpamCollection.txt", sep = "\t", header=None)
df.columns = ["label", "sms"]
df

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
model_name = "google-t5/t5-small"
d_map = "auto"

In [4]:
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map = d_map)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
pip install -q accelerate

In [6]:
df["sms"][5567]

'This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.'

In [7]:
def process_data(df):
    data = []
    for i in range(len(df)):
        data.append({"input_ids": tokenizer.encode("classify: " + df["sms"][i], max_length=512, return_tensors="pt", truncation=True, padding="max_length").squeeze(),
                    #   "labels":tokenizer.encode(df["label"][i], max_length=512, return_tensors="pt", truncation=True, padding="max_length").squeeze(),
                      "labels":tokenizer.encode(df["label"][i], return_tensors="pt").squeeze(),
                    #   "decoder_input_ids":tokenizer.encode(df["label"][i], max_length=512, return_tensors="pt", truncation=True, padding="max_length").squeeze()
                      })
    return data

list_data = process_data(df)

In [8]:
class T5Dataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, key):
        return self.data[key]

    def __len__(self):
        return len(self.data)

dict_data = T5Dataset(list_data)

In [9]:
pip install -q peft

In [10]:
# from datasets import Dataset, DatasetDict
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

In [11]:
T5DataCollator  = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map = d_map)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
train_size = int(0.8 * len(dict_data))
val_size = len(dict_data) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dict_data, [train_size, val_size])


peft_training_args = TrainingArguments(
    output_dir="./fine-tuned-t6",
    auto_find_batch_size=True,
    # batch_size=128,
    learning_rate=1e-2, # Higher learning rate than full fine-tuning.
    num_train_epochs=3,
    logging_steps=100,
    # per_device_train_batch_size=4,
    save_steps=10,
    # max_steps=20_0
)

loraconfig = LoraConfig(
    r=64,
    lora_alpha = 64,
    target_modules=["q", "v"],
    lora_dropout=0.3,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

peft_model = get_peft_model(model,
                            loraconfig)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=train_dataset,
    eval_dataset =val_dataset,
    data_collator=T5DataCollator,
)

peft_trainer.train()

Step,Training Loss
100,0.6639
200,0.1969
300,0.1771
400,0.1583
500,0.1908
600,0.1527
700,0.1278
800,0.1086
900,0.1107
1000,0.0885


TrainOutput(global_step=1674, training_loss=0.14858525428863123, metrics={'train_runtime': 421.5954, 'train_samples_per_second': 31.715, 'train_steps_per_second': 3.971, 'total_flos': 1906564990500864.0, 'train_loss': 0.14858525428863123, 'epoch': 3.0})

In [15]:
peft_model_path="./fine-tuned-t7/model/"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('./fine-tuned-t7/model/tokenizer_config.json',
 './fine-tuned-t7/model/special_tokens_map.json',
 './fine-tuned-t7/model/spiece.model',
 './fine-tuned-t7/model/added_tokens.json')

In [16]:
peft_model = PeftModel.from_pretrained(model,
                                       './fine-tuned-t7/model/',
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False, device_map = "auto")

prom = "This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate."
input_ids = tokenizer.encode("classify: " +prom, return_tensors="pt", padding="max_length", max_length=512, truncation=True).to("cuda")

peft_model_outputs= peft_model.generate(input_ids = input_ids)
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
peft_model_text_output

'spam'