In [2]:
!pip install datasets
!pip install sacrebleu
!pip install -U accelerate
!pip install -U transformers


from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, load_metric, Dataset
import torch
import numpy as np
import pandas as pd


df = pd.read_csv('ComMid2k_Shuffled_no_none.tsv', sep='\t')

# Renaming columns for easier access
df = df.rename(columns={'text': 'source_text', 'mitigated_text': 'target_text'})
raw_dataset = Dataset.from_pandas(df)

# Define your model and tokenizer
# pretrained_model = 'models/switch-transformer'
pretrained_model = 'google/switch-base-8'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# Define preprocess function
def preprocess_function(examples):
    source_inputs = tokenizer(examples['source_text'], padding='max_length', truncation=True, max_length=128)
    target_inputs = tokenizer(examples['target_text'], padding='max_length', truncation=True, max_length=128)
    return {'input_ids': source_inputs.input_ids, 'labels': target_inputs.input_ids}

# Apply preprocessing
tokenized_datasets = raw_dataset.map(preprocess_function, batched=True)
split_dataset = tokenized_datasets.train_test_split(test_size=0.2)
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model).to('cuda')  # Use appropriate device ('cuda' or 'cpu')

epochs = 5
max_input_length = 128
max_target_length = 128
learning_rate = 1e-04
source_tag = "source_text"
target_tag = "target_text"
batch_size = 16
model_name = 'google/switch-base-8'  # Updated model name


args = Seq2SeqTrainingArguments(
    output_dir="models",
    num_train_epochs=epochs,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    warmup_steps=500,
    weight_decay=0.001,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=500,
    evaluation_strategy="steps",
    save_total_limit=3,
    generation_max_length=max_target_length,
    generation_num_beams=1,
    load_best_model_at_end=True,
    metric_for_best_model="rouge-1",
    report_to="all"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0].cpu().numpy() if isinstance(preds[0], torch.Tensor) else preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result



Map:   0%|          | 0/1951 [00:00<?, ? examples/s]

In [3]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

from tqdm.notebook import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

trainer.train()

trainer.save_model("drive/MyDrive/models/switch-transformer")

# Save the fine-tuned model
# trainer.save_model("models/fine_tuned_model")

# Load the trained model for inference
model_to_generate = AutoModelForSeq2SeqLM.from_pretrained("drive/MyDrive/models/switch-transformer").to(device)
# model_to_generate = AutoModelForSeq2SeqLM.from_pretrained("models/fine_tuned_model").to(device)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_to_generate = AutoModelForSeq2SeqLM.from_pretrained("drive/MyDrive/models/switch-transformer").to('cuda')
tokenizer = AutoTokenizer.from_pretrained("drive/MyDrive/models/switch-transformer")
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to('cuda')  # Use appropriate device ('cuda' or 'cpu')

# Generate mitigated sentences from original text
input_text = "Women are stupid"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(0)

outputs = model_to_generate.generate(input_ids)
mitigated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Original Text:", input_text)
print("Mitigated Text:", mitigated_text)



Original Text: Women are stupid
Mitigated Text: Mind your approach.
