In [None]:
import os

from google.colab import drive

# Mount Google Drive so the files under BASE_DIR are reachable from this runtime.
drive.mount('/content/drive')

BASE_DIR = "/content/drive/MyDrive/hate_speech"   # change if your folder name differs

# Input CSVs, both located directly inside BASE_DIR.
TRAIN_PATH, TEST_PATH = (
    os.path.join(BASE_DIR, csv_name)
    for csv_name in ("hate_speech_train.csv", "hate_speech_test.csv")
)

# Output folders inside BASE_DIR: OUT_DIR for training output, BEST_DIR for the saved model.
OUT_DIR, BEST_DIR = (
    os.path.join(BASE_DIR, dir_name)
    for dir_name in ("roberta_output", "roberta_best")
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers datasets accelerate scikit-learn



In [None]:
# Print the installed transformers version so the run can be reproduced later.
import transformers
print(transformers.__version__)

5.2.0


In [None]:
import re

URL_RE = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
HANDLE_RE = re.compile(r"@\w+")
HASHTAG_RE = re.compile(r"#(\w+)")
REPEAT_RE = re.compile(r"(.)\1{2,}")
WHITESPACE_RE = re.compile(r"\s+")

# Typographic quotes -> ASCII equivalents (left and right variants of both kinds).
_QUOTE_TABLE = str.maketrans({
    "\u2018": "'",
    "\u2019": "'",
    "\u201c": '"',
    "\u201d": '"',
})


def clean_for_transformer(t: str) -> str:
    """Normalise a raw social-media post before tokenisation.

    Steps, in order:
      1. Missing values (``None`` or a float NaN) become ``""``; anything else
         is converted with ``str``.
      2. Curly quotes are replaced by their ASCII counterparts.
      3. URLs and ``@handles`` are replaced by a space.
      4. ``#hashtag`` keeps its word but loses the ``#``.
      5. Any character repeated three or more times is reduced to two.
      6. Whitespace runs collapse to a single space and the ends are stripped.

    Args:
        t: The raw text. Non-string values are accepted and stringified.

    Returns:
        The cleaned text (possibly empty).
    """
    # NaN is the only float that differs from itself; without this check a
    # missing cell would turn into the literal text "nan".
    if t is None or (isinstance(t, float) and t != t):
        return ""

    t = str(t).translate(_QUOTE_TABLE)
    t = URL_RE.sub(" ", t)
    t = HANDLE_RE.sub(" ", t)
    t = HASHTAG_RE.sub(r"\1", t)       # #word -> word
    t = REPEAT_RE.sub(r"\1\1", t)      # soooo -> soo
    return WHITESPACE_RE.sub(" ", t).strip()

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the labelled data, keep only the text and label columns, drop rows
# where either is missing, and store the label as an integer.
# clean_for_transformer is currently not applied to the text; to enable it:
#   df["text"] = df["text"].apply(clean_for_transformer)
df = (
    pd.read_csv(TRAIN_PATH)
    .loc[:, ["text", "label"]]
    .dropna()
    .astype({"label": int})
)

# Hold out 10% for validation, stratified on the label, with a fixed seed.
# Both parts get a fresh 0..n-1 index.
df_train, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df,
        test_size=0.1,
        random_state=42,
        stratify=df["label"],
    )
)

print(df_train.shape, df_val.shape)
print("Positive rate train:", df_train["label"].mean())
print("Positive rate val:", df_val["label"].mean())

(13500, 2) (1500, 2)
Positive rate train: 0.3
Positive rate val: 0.3


In [None]:
from datasets import Dataset
from transformers import RobertaTokenizerFast

MODEL_NAME = "roberta-base"
MAX_LENGTH = 192

tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)


def tokenize(batch):
    """Tokenize the ``text`` field of a batch.

    Every example is truncated and padded to exactly MAX_LENGTH tokens.
    """
    encode_kwargs = dict(
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    return tokenizer(batch["text"], **encode_kwargs)


train_ds = Dataset.from_pandas(df_train).map(tokenize, batched=True)
val_ds = Dataset.from_pandas(df_val).map(tokenize, batched=True)

# Drop the raw text, plus the index column that from_pandas sometimes adds.
cols_to_remove = [
    name for name in ["text", "__index_level_0__"] if name in train_ds.column_names
]
train_ds, val_ds = (
    split.remove_columns(cols_to_remove) for split in (train_ds, val_ds)
)

# Rename the target column from "label" to "labels" where it is present.
train_ds, val_ds = (
    split.rename_column("label", "labels") if "label" in split.column_names else split
    for split in (train_ds, val_ds)
)

# Return torch tensors when the datasets are indexed.
train_ds.set_format("torch")
val_ds.set_format("torch")

Map:   0%|          | 0/13500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

# Pretrained checkpoint loaded with a two-class sequence-classification head.
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


def compute_metrics(eval_pred):
    """Compute F1, precision and recall for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the arg-max of the logits along axis 1. Returns a dict with the keys
    ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    scorers = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    return {name: scorer(labels, preds) for name, scorer in scorers}


training_args = TrainingArguments(
    output_dir=OUT_DIR,
    report_to="none",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=3e-5,
    weight_decay=0.01,
    # NOTE(review): fractional value; presumably read as a share of the total
    # training steps by the installed transformers version -- confirm.
    warmup_steps=0.06,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    fp16=torch.cuda.is_available(),   # mixed precision only when a GPU is present
    # --- evaluate, checkpoint and log once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # --- reload the checkpoint with the highest validation F1 at the end ---
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.318529,0.228714,0.881623,0.894737,0.868889
2,0.176859,0.176584,0.901024,0.923077,0.88
3,0.139809,0.16791,0.907818,0.955774,0.864444


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

{'eval_loss': 0.16791026294231415,
 'eval_f1': 0.9078179696616102,
 'eval_precision': 0.9557739557739557,
 'eval_recall': 0.8644444444444445,
 'eval_runtime': 3.8556,
 'eval_samples_per_second': 389.045,
 'eval_steps_per_second': 12.19,
 'epoch': 3.0}

In [None]:
import numpy as np
import torch
from sklearn.metrics import f1_score, precision_score, recall_score

# 1) Get validation logits/probs
val_pred = trainer.predict(val_ds)
val_logits = val_pred.predictions
val_probs = torch.softmax(torch.tensor(val_logits), dim=1).numpy()[:, 1]  # P(class=1)
y_val = df_val["label"].to_numpy()

# The labels come from df_val while the probabilities come from val_ds; make
# sure the two line up before comparing them element by element.
assert len(val_probs) == len(y_val), "val_ds and df_val have different lengths"

# 2) Sweep thresholds and keep the first one that reaches the highest F1.
# NOTE: the threshold is tuned on the same validation split that selected the
# best checkpoint, so the F1 printed below is an optimistic estimate.
thresholds = np.linspace(0.05, 0.95, 181)  # step ~0.005
best = {"thr": 0.5, "f1": -1, "precision": None, "recall": None}

for thr in thresholds:
    y_hat = (val_probs >= thr).astype(int)
    # zero_division=0 matches the precision/recall calls below and avoids a
    # warning when a threshold yields no positive predictions.
    f1 = f1_score(y_val, y_hat, zero_division=0)
    if f1 > best["f1"]:
        best.update(
            thr=float(thr),
            f1=float(f1),
            precision=float(precision_score(y_val, y_hat, zero_division=0)),
            recall=float(recall_score(y_val, y_hat, zero_division=0)),
        )

print("Best threshold:", best["thr"])
print("Val F1:", best["f1"], "Precision:", best["precision"], "Recall:", best["recall"])

BEST_THR = best["thr"]

NameError: name 'trainer' is not defined

In [None]:
# Sanity-check the sweep: spread of the predicted probabilities, and the share
# of validation examples flagged at BEST_THR next to the true positive share.
prob_low, prob_high = float(val_probs.min()), float(val_probs.max())
flagged = val_probs >= BEST_THR

print("Val prob range:", prob_low, prob_high)
print("Val positive rate true:", y_val.mean())
print("Val positive rate pred:", flagged.mean())

Val prob range: 0.0005072812200523913 0.9974241256713867
Val positive rate true: 0.3
Val positive rate pred: 0.268


In [None]:
# Save the trainer's current model and the tokenizer into BEST_DIR so both
# can later be reloaded from that one folder.
trainer.save_model(BEST_DIR)
tokenizer.save_pretrained(BEST_DIR)
print("Saved to:", BEST_DIR)

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Saved to: /content/drive/MyDrive/hate_speech/roberta_best


In [None]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
)

test_df = pd.read_csv(TEST_PATH)

# Fail early with a clear message if the columns used below are absent.
missing_cols = {"id", "text"} - set(test_df.columns)
if missing_cols:
    raise KeyError(f"{TEST_PATH} is missing required column(s): {sorted(missing_cols)}")

# Every test row needs a prediction, so missing text becomes an empty string
# instead of being dropped (the training data drops such rows). The tokenizer
# only accepts strings, so a NaN here would otherwise abort the whole run.
test_df["text"] = test_df["text"].fillna("").astype(str)

test_ds = Dataset.from_pandas(test_df)

# Reload the tokenizer that was saved next to the model.
tokenizer = RobertaTokenizerFast.from_pretrained(BEST_DIR)


def tokenize_test(batch):
    """Tokenize the ``text`` field, truncated and padded to MAX_LENGTH tokens."""
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )


test_ds = test_ds.map(tokenize_test, batched=True)
cols_to_remove = [c for c in ["text", "__index_level_0__"] if c in test_ds.column_names]
test_ds = test_ds.remove_columns(cols_to_remove)
test_ds.set_format("torch")

model = RobertaForSequenceClassification.from_pretrained(BEST_DIR)

# Explicit arguments instead of Trainer defaults: same evaluation batch size
# and logging setting as the training run.
pred_args = TrainingArguments(
    output_dir=OUT_DIR,
    per_device_eval_batch_size=32,
    report_to="none",
)
pred_trainer = Trainer(model=model, args=pred_args)

pred_out = pred_trainer.predict(test_ds)
logits = pred_out.predictions

# Column 1 of the softmax output is P(class=1); classify with the threshold
# selected on the validation split (BEST_THR).
probs = torch.softmax(torch.tensor(logits), dim=1).numpy()
preds = (probs[:, 1] >= BEST_THR).astype(int)

sub_path = os.path.join(BASE_DIR, "submission_roberta.csv")
pd.DataFrame({"id": test_df["id"].values, "label": preds}).to_csv(sub_path, index=False)

print("Saved:", sub_path)
print("Predicted positive rate:", preds.mean())