In [None]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the dataset
file_path = "t5_rephrase_dataset.csv"  # Replace with your dataset file
df = pd.read_csv(file_path)

# Fail fast with a clear message when a column used below is missing,
# instead of a bare KeyError later during preview/tokenization.
if "Input Text" not in df.columns or "Target Text" not in df.columns:
    raise ValueError("Dataset must have 'Input Text' and 'Target Text' columns.")

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split dataset into train (80%) and validation (20%).
# A fixed seed makes the split identical on every Restart & Run All,
# so losses from different runs are comparable.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

# Validate dataset before tokenization: show one raw input/target pair
print("Sample Input Text:", train_dataset[0]["Input Text"])
print("Sample Target Text:", train_dataset[0]["Target Text"])

# Tokenization function
def preprocess_function(examples):
    """Tokenize a batch of input/target pairs for seq2seq training.

    Args:
        examples: Batch mapping with "Input Text" and "Target Text" keys,
            each holding a list of values (the form passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the target token ids. Padding positions in the labels are
        set to -100 so the loss skips them.

    Relies on the module-level ``tokenizer`` being defined before the call.
    """
    # Ensure all data is converted to strings
    inputs = [str(text) for text in examples["Input Text"]]
    targets = [str(text) for text in examples["Target Text"]]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    label_ids = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids

    # Targets are padded to a fixed length; without masking, the loss would
    # also be computed on the pad tokens and be dominated by them.
    # -100 is the index ignored by the cross-entropy loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in label_ids
    ]
    return model_inputs

# Clean the data: keep only rows where both fields are real strings.
# The filter must be applied to the train/validation splits, because those
# are the objects that get tokenized and handed to the Trainer. Filtering
# only `dataset` here would have no effect on them, since the splits were
# already taken from it.
is_text_pair = lambda x: isinstance(x["Input Text"], str) and isinstance(x["Target Text"], str)
train_dataset = train_dataset.filter(is_text_pair)
val_dataset = val_dataset.filter(is_text_pair)
# Kept so that `dataset` still refers to the cleaned full data, as before.
dataset = dataset.filter(is_text_pair)

# Load T5 tokenizer and model
model_name = "t5-small"  # Can also use "t5-base" for larger models
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize datasets; the raw text columns are dropped so only the
# tokenizer outputs and labels remain.
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)


# Define training arguments
import torch  # local import: only used to detect whether a CUDA GPU is present

# Single source of truth for where checkpoints and the final model go.
output_dir = "./t5_rephrase_model"

training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",   # evaluate once per epoch
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",         # checkpoint once per epoch
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
    load_best_model_at_end=True,
    # Mixed precision is only requested when a CUDA device is available,
    # so the cell also runs on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
)

# Initialize Trainer
# NOTE(review): recent transformers releases appear to warn about the
# `tokenizer=` argument; confirm against the installed version before
# switching to its replacement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the fine-tuned model and its tokenizer side by side
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

Sample Input Text: rephrase: never give a bitch the impression that u need them
Sample Target Text: Rephrased version of: never give a bitch the impression that u need them


Filter:   0%|          | 0/20620 [00:00<?, ? examples/s]

Map:   0%|          | 0/16496 [00:00<?, ? examples/s]

Map:   0%|          | 0/4124 [00:00<?, ? examples/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0035,0.00102
2,0.0024,0.000822
3,0.0017,0.000697


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('./t5_rephrase_model/tokenizer_config.json',
 './t5_rephrase_model/special_tokens_map.json',
 './t5_rephrase_model/spiece.model',
 './t5_rephrase_model/added_tokens.json')

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Read the source CSV into a DataFrame
file_path = "/content/large_hate_speech_rephrasing_dataset.csv"  # Replace with the path to your dataset
df = pd.read_csv(file_path)

# Both columns consumed by the tokenization step must be present
if not {"Hateful Sentence", "Neutral Rephrased Sentence"}.issubset(df.columns):
    raise ValueError("Dataset must have 'Hateful Sentence' and 'Neutral Rephrased Sentence' columns.")

# Wrap the DataFrame as a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Seeded 80/20 train/validation split
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, val_dataset = train_test_split["train"], train_test_split["test"]

# Pretrained T5 checkpoint and its matching tokenizer
model_name = "t5-small"  # You can replace with "t5-base" for a larger model
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenization function
def preprocess_function(examples):
    """Tokenize a batch of hateful/neutral sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with "Hateful Sentence" and
            "Neutral Rephrased Sentence" keys, each holding a list of values
            (the form passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the prefixed inputs, with an added "labels"
        entry holding the target token ids. Padding positions in the labels
        are set to -100 so the loss skips them.

    Relies on the module-level ``tokenizer`` being defined before the call.
    """
    # Prefix every source sentence with the task prompt.
    inputs = [f"rephrase: {text}" for text in examples["Hateful Sentence"]]
    targets = list(examples["Neutral Rephrased Sentence"])

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    label_ids = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids

    # Targets are padded to a fixed length; without masking, the loss would
    # also be computed on the pad tokens and be dominated by them.
    # -100 is the index ignored by the cross-entropy loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in label_ids
    ]
    return model_inputs

# Tokenize datasets; the raw text columns are dropped so only the
# tokenizer outputs and labels remain.
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

# Define training arguments
import torch  # local import: only used to detect whether a CUDA GPU is present

training_args = TrainingArguments(
    output_dir="./t5_rephrase_model",  # intermediate checkpoints
    evaluation_strategy="epoch",       # evaluate once per epoch
    save_strategy="epoch",             # checkpoint once per epoch
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Mixed precision is only requested when a CUDA device is available,
    # so the cell also runs on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer to the final destination.
# The path is held in one variable so the completion message below always
# reports where the files were actually written.
save_dir = "/content/drive/MyDrive/Colab Notebooks/t5_rephrase_model"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"Model training completed and saved to '{save_dir}'")


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.025,0.022978
2,0.0241,0.022897
3,0.0242,0.022797


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Model training completed and saved to './t5_rephrase_model'
