<a href="https://colab.research.google.com/github/cld0033/Tone_It_Down/blob/main/transformers_custom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#install if not installed; hide output
!pip install datasets -q
!pip install transformers -q
!pip install torch -q

In [None]:
#import relevant libraries
import datasets
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

In [None]:
#load a tokenizer and training model
# The tokenizer and the model are both loaded from the same checkpoint name.
# `tokenizer` and `model` are notebook-level names used by later cells
# (preprocessing, Trainer construction and the final export).
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
#load a dataset
# NOTE(review): a later cell rebinds `dataset` to "uhoui/text-tone-classifier",
# so on a top-to-bottom run this toxicity dataset is replaced before any cell
# maps or trains on it.
dataset = datasets.load_dataset("s-nlp/en_paradetox_toxicity")

#tokenize dataset and prepare it for training
def preprocess_function(examples, max_input_length=128, max_target_length=8):
    """Tokenize a batch of toxicity rows for text-to-text fine-tuning.

    The original version stored the raw boolean ``toxic`` values under
    ``labels``. A seq2seq model such as T5 needs a sequence of token ids per
    example instead, so each flag is turned into a short target word and
    tokenized like the inputs.

    NOTE(review): another cell in this notebook defines a function with the
    same name for the tone-classifier dataset; whichever cell ran last is the
    one ``dataset.map`` will use.

    Args:
        examples: Batch dict with parallel lists under ``"comment"`` (input
            text) and ``"toxic"`` (truthy when the comment is toxic).
        max_input_length: Length the input ids are truncated/padded to.
        max_target_length: Length the target ids are truncated/padded to.

    Returns:
        The tokenizer output for the comments with an added ``"labels"`` entry:
        one list of target token ids per example, where padding positions are
        replaced by -100 so they are ignored by the loss.
    """
    inputs = examples["comment"]
    # Text-to-text framing: the class is expressed as a word the model generates.
    targets = ["toxic" if flag else "non-toxic" for flag in examples["toxic"]]

    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )
    target_encodings = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    # Padding must not contribute to the loss; -100 is the ignore index.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in ids]
        for ids in target_encodings["input_ids"]
    ]
    return model_inputs

In [None]:
#load a dataset
# Rebinds the notebook-level `dataset` name to the text-tone data. The summary
# printed by the next cell shows a single `train` split with the columns
# `idx`, `text` and `label`.
dataset = datasets.load_dataset("uhoui/text-tone-classifier")

README.md:   0%|          | 0.00/100 [00:00<?, ?B/s]

data-all.csv:   0%|          | 0.00/42.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/491 [00:00<?, ? examples/s]

In [None]:
# Show the DatasetDict summary, then the first five training rows as a DataFrame.
print(dataset)
first_rows = dataset["train"].take(5).to_pandas()
print("header: \n", first_rows)

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'label'],
        num_rows: 491
    })
})
header: 
    idx                                               text     label
0    0  I am absolutely thrilled with the service I re...       joy
1    1     It's frustrating when the meeting starts late!     anger
2    2  The news about the community event has left me...   sadness
3    3  Wow, I didn't expect to see my friends here to...  surprise
4    4  I'm really worried about the upcoming exams. I...     worry


In [None]:
#tokenize dataset and prepare it for training
def preprocess_function(examples, max_input_length=128, max_target_length=16):
    """Tokenize a batch of tone-classification rows for text-to-text fine-tuning.

    The original version copied the raw string class names (``"joy"``,
    ``"anger"``, ...) into ``labels``. Strings cannot be collated into a label
    tensor, and a seq2seq model such as T5 expects a sequence of token ids per
    example, so the class name is tokenized as the target text instead.

    The caller should also drop the raw ``label`` column before training (for
    example with ``remove_columns`` in ``Dataset.map``): the Trainer keeps a
    column with that name and the default collator then tries to build a
    tensor from its strings.

    Args:
        examples: Batch dict with parallel lists under ``"text"`` (input
            sentence) and ``"label"`` (class name).
        max_input_length: Length the input ids are truncated/padded to.
        max_target_length: Length the target ids are truncated/padded to.

    Returns:
        The tokenizer output for the input texts with an added ``"labels"``
        entry: one list of target token ids per example, where padding
        positions are replaced by -100 so they are ignored by the loss.
    """
    inputs = examples["text"]
    # str() keeps this working if the label column ever holds non-string values.
    targets = [str(label) for label in examples["label"]]

    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )
    target_encodings = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    # Padding must not contribute to the loss; -100 is the ignore index.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in ids]
        for ids in target_encodings["input_ids"]
    ]
    return model_inputs

In [None]:
#examine the dataset
# Print the DatasetDict summary first, then a five-row preview of the train split.
print("object information: \n", dataset)
train_preview = dataset["train"].take(5).to_pandas()
print("header: \n", train_preview)

object information: 
 DatasetDict({
    train: Dataset({
        features: ['comment', 'toxic'],
        num_rows: 26507
    })
})
header: 
                                              comment  toxic
0  ryan is as big a bum as the jerk in the white ...   True
1                             You sure are a racist!   True
2                   it is easy to spot those racist.  False
3  btw jonhson county is a real shithole in case ...   True
4  How many people does it take to change light b...  False


In [None]:
#run preprocess function on dataset
# Drop the raw columns once they have been tokenized. This matters for a string
# column named `label`: the Trainer keeps a column with that name and the default
# collator tries to build a tensor from it, which fails with
# "ValueError: too many dimensions 'str'".
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/491 [00:00<?, ? examples/s]

In [None]:
#manually created splits
# The dataset only ships a train split, so 20% of its row indices are held out
# for validation (fixed random_state keeps the split repeatable).
source_split = tokenized_dataset["train"]
num_samples = len(source_split)
train_indices, val_indices = train_test_split(
    range(num_samples),
    test_size=0.2,
    random_state=42,
)

# Materialise both subsets from the index lists
train_dataset = source_split.select(train_indices)
val_dataset = source_split.select(val_indices)

# Bundle them so later cells can address the splits by name
split_dataset = datasets.DatasetDict(
    {"train": train_dataset, "validation": val_dataset}
)

In [None]:
#fine tune model using trainer API via hugging face
training_args = TrainingArguments(
    output_dir="./results",  # checkpoints are written here
    eval_strategy="epoch",  # evaluate on the validation split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Disable experiment-tracker integrations. Without this, training stops at an
    # interactive wandb API-key prompt, so the notebook cannot run unattended.
    # Set to "wandb" to opt back in.
    report_to="none",
)

In [None]:
#verify that split happened
# Print the dataset before and after splitting so the row counts can be compared.
for heading, shown in (
    ("unsplit dataset: \n", tokenized_dataset),
    ("split dataset: \n", split_dataset),
):
    print(heading, shown)

unsplit dataset: 
 DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 491
    })
})
split dataset: 
 DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 392
    })
    validation: Dataset({
        features: ['idx', 'text', 'label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 99
    })
})


In [None]:
# Wire the model, the training configuration and the two splits into a Trainer.
# No data collator or tokenizer is passed here, so batching relies on the
# Trainer's defaults; the examples are already padded to a fixed length during
# preprocessing.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["validation"],
)

In [None]:
# Start fine-tuning with the Trainer built in the previous cell; as the last
# expression of the cell, the call's return value is displayed as the cell output.
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


ValueError: too many dimensions 'str'

In [None]:
#Export fine tune model
# Weights and tokenizer files go into the same directory.
export_dir = "./fine_tuned_model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)