<a href="https://colab.research.google.com/github/guptanmol02/LLM-Hate-speech/blob/main/hate_speech_detection_LLM_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install libraries

In [None]:
!pip install -q transformers datasets shap scikit-learn pandas matplotlib streamlit torch>=1.13.0 requests


Imports and helper utilities

In [None]:
import os
import json
import re
import random
from pathlib import Path
from pprint import pprint
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample
import matplotlib.pyplot as plt
import requests
import warnings
warnings.filterwarnings("ignore")


Upload the dataset files

In [None]:
from google.colab import files
import pandas as pd, json


In [None]:
# Upload MMHS150K_GT.json through the Colab file picker, then parse it.
uploaded = files.upload()
with open("MMHS150K_GT.json", "r", encoding="utf-8") as gt_file:
    gt = json.load(gt_file)

# One record per tweet; absent fields fall back to an empty string / empty list.
rows = [
    {
        "tweet_id": str(tid),
        "tweet_text": meta.get("tweet_text", ""),
        "labels": meta.get("labels", []),
        "labels_str": meta.get("labels_str", []),
    }
    for tid, meta in gt.items()
]
df = pd.DataFrame(rows)
print("GT loaded:", df.shape)
df.head()


Saving MMHS150K_GT.json to MMHS150K_GT.json
GT loaded: (149823, 4)


Unnamed: 0,tweet_id,tweet_text,labels,labels_str
0,1114679353714016256,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,"[4, 1, 3]","[Religion, Racist, Homophobe]"
1,1063020048816660480,My horses are retarded https://t.co/HYhqc6d5WN,"[5, 5, 5]","[OtherHate, OtherHate, OtherHate]"
2,1108927368075374593,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,"[0, 0, 0]","[NotHate, NotHate, NotHate]"
3,1114558534635618305,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,"[1, 0, 0]","[Racist, NotHate, NotHate]"
4,1035252480215592966,“EVERYbody calling you Nigger now!” https://t....,"[1, 0, 1]","[Racist, NotHate, Racist]"


In [None]:
# Upload train_ids.txt, then keep every non-blank line (whitespace-stripped) as an ID.
uploaded = files.upload()
with open("train_ids.txt", "r", encoding="utf-8") as ids_file:
    train_ids = [entry for entry in map(str.strip, ids_file) if entry]
print("Train IDs:", len(train_ids))


Saving train_ids.txt to train_ids.txt
Train IDs: 134823


In [None]:
# Upload val_ids.txt, then keep every non-blank line (whitespace-stripped) as an ID.
uploaded = files.upload()
with open("val_ids.txt", "r", encoding="utf-8") as ids_file:
    val_ids = [entry for entry in map(str.strip, ids_file) if entry]
print("Val IDs:", len(val_ids))


Saving val_ids.txt to val_ids.txt
Val IDs: 5000


In [None]:
# Upload test_ids.txt, then keep every non-blank line (whitespace-stripped) as an ID.
uploaded = files.upload()
with open("test_ids.txt", "r", encoding="utf-8") as ids_file:
    test_ids = [entry for entry in map(str.strip, ids_file) if entry]
print("Test IDs:", len(test_ids))


Saving test_ids.txt to test_ids.txt
Test IDs: 10000


In [None]:
# Upload hatespeech_keywords.txt and read one keyword per non-blank line.
uploaded = files.upload()
with open("hatespeech_keywords.txt", "r", encoding="utf-8") as kw_file:
    keywords = [entry for entry in map(str.strip, kw_file) if entry]
print("Keywords loaded:", len(keywords))
# Preview the first 20 entries only.
print(keywords[:20])


Saving hatespeech_keywords.txt to hatespeech_keywords.txt
Keywords loaded: 86
['asian drive', 'feminazi', 'sjw', 'WomenAgainstFeminism', 'blameonenotall', 'islam terrorism', 'notallmen', 'victimcard', 'victim card', 'arab terror', 'gamergate', 'jsil', 'racecard', 'race card', 'refugeesnotwelcome', 'DeportallMuslims', 'banislam', 'banmuslims', 'destroyislam', 'norefugees']


Load the split files and build binary labels

In [None]:
def load_split_file(path):
    """Read a split file into a list of stripped, non-empty lines.

    Args:
        path: location of the text file (str or Path).

    Returns:
        The list of stripped non-blank lines, or None when the file does not exist.
    """
    split_path = Path(path)
    if not split_path.exists():
        return None
    with open(split_path, "r", encoding="utf-8") as handle:
        return [entry for entry in map(str.strip, handle) if entry]

DATA_DIR = "."  # directory searched for the split files

# Each of these is a list of IDs, or None when the corresponding file is missing.
train_ids, val_ids, test_ids = (
    load_split_file(Path(DATA_DIR) / split_name)
    for split_name in ("train_ids.txt", "val_ids.txt", "test_ids.txt")
)

# Create binary ground truth label: majority vote of the 3 annotators: 0 => NotHate, any other => Hate
def majority_label(lbls):
    """Return the consensus label for one tweet's annotator votes.

    Args:
        lbls: list of integer labels (0..5), one per annotator.

    Returns:
        The most frequent label as an int. When several labels are tied for
        most frequent, the largest one is returned. An empty or missing list
        yields 0.

    Note:
        np.unique returns values in ascending order, so a plain np.argmax over
        the counts would pick the *smallest* tied label. A three-way split such
        as [0, 1, 2] would then become 0 (NotHate) even though two of the three
        votes are non-zero. Taking the max among the tied labels avoids that.
    """
    if lbls is None or len(lbls) == 0:
        return 0
    vals, counts = np.unique(lbls, return_counts=True)
    tied_for_mode = vals[counts == counts.max()]
    return int(tied_for_mode.max())

# Consensus label per tweet, then its binary form (1 => hate, 0 => not hate).
df["major_label"] = df["labels"].apply(majority_label)
df["label"] = df["major_label"].ne(0).astype(int)

print("Counts by binary label:")
print(df["label"].value_counts())

have_all_splits = bool(train_ids and val_ids and test_ids)
if not have_all_splits:
    # At least one ID list is missing or empty: fall back to a stratified 80/10/10 split.
    train_df, rest_df = train_test_split(
        df, test_size=0.2, stratify=df["label"], random_state=42
    )
    val_df, test_df = train_test_split(
        rest_df, test_size=0.5, stratify=rest_df["label"], random_state=42
    )
    print("Created splits:", train_df.shape, val_df.shape, test_df.shape)
else:
    # Select the rows whose tweet_id appears in each provided ID list.
    train_df, val_df, test_df = (
        df[df["tweet_id"].isin(id_list)].reset_index(drop=True)
        for id_list in (train_ids, val_ids, test_ids)
    )
    print("Using provided splits:", train_df.shape, val_df.shape, test_df.shape)

Counts by binary label:
label
0    124003
1     25820
Name: count, dtype: int64
Using provided splits: (134823, 6) (5000, 6) (10000, 6)


Basic cleaning + exact duplicate removal + optional near-duplicates (TF-IDF cos)

In [None]:

def clean_text(s):
    """Normalise a tweet string.

    Replaces URLs, @mentions and '#' characters with a space, collapses runs of
    whitespace to a single space and trims both ends.

    Args:
        s: the raw text; anything that is not a str yields "".

    Returns:
        The cleaned string.
    """
    if not isinstance(s, str):
        return ""
    cleaned = s.strip()
    # Applied in order: URLs, mentions, hash signs, then whitespace collapsing.
    for pattern in (r"http\S+|www\.\S+", r"@\w+", r"#", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

# Add a clean_text column to every split (missing text is treated as "").
for split_frame in (train_df, val_df, test_df):
    raw_text = split_frame["tweet_text"].fillna("").astype(str)
    split_frame["clean_text"] = raw_text.apply(clean_text)

# Exact duplicates are dropped from the training split only; the first occurrence is kept.
before = len(train_df)
train_df = train_df.drop_duplicates(subset="clean_text", keep="first").reset_index(drop=True)
print("Dropped exact duplicates in train:", before - len(train_df))

# Optional near-duplicate pruning: TF-IDF vectors compared by cosine similarity.
# The comparison is pairwise over all training rows (O(N^2)), so it is off by default.
REMOVE_NEAR_DUPLICATES = False
if REMOVE_NEAR_DUPLICATES:
    from sklearn.metrics.pairwise import cosine_similarity

    thresh = 0.95
    vec = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    X = vec.fit_transform(train_df["clean_text"].values)
    cos = cosine_similarity(X)
    n = cos.shape[0]
    # A row is dropped when any earlier row is more similar to it than the threshold.
    to_drop = {
        later
        for earlier in range(n)
        for later in range(earlier + 1, n)
        if cos[earlier, later] > thresh
    }
    if to_drop:
        train_df = train_df.drop(train_df.index[list(to_drop)]).reset_index(drop=True)
        print("Removed", len(to_drop), "near-duplicates")


Dropped exact duplicates in train: 9714


Build final balanced dataset (oversample minority)

In [None]:
# Cell 7: balancing (simple oversample minority in training set)
TARGET_BALANCE = True
if TARGET_BALANCE:
    counts = train_df["label"].value_counts()
    print("Before balancing:", counts.to_dict())
    max_count = counts.max()

    # Classes smaller than the largest one are resampled with replacement up to
    # its size; the largest class is kept as it is.
    parts = []
    for lbl in counts.index:
        subset = train_df.loc[train_df["label"] == lbl]
        needs_upsampling = len(subset) < max_count
        parts.append(
            resample(subset, replace=True, n_samples=max_count, random_state=42)
            if needs_upsampling
            else subset
        )

    # Concatenate, shuffle deterministically, and renumber the rows.
    combined = pd.concat(parts)
    train_df = combined.sample(frac=1, random_state=42).reset_index(drop=True)
    print("After balancing:", train_df["label"].value_counts().to_dict())


Before balancing: {0: 105975, 1: 19134}
After balancing: {0: 105975, 1: 105975}


Download the English bad-words list and prepare masking

In [None]:
# Cell 8: get bad-words list (LDNOOBW) + local hatespeech_keywords.txt, prepare mask function
ldnoobw_en_url = "https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
try:
    r = requests.get(ldnoobw_en_url, timeout=10)
    # Fail on HTTP errors; otherwise the body of an error response would be
    # split into lines and silently treated as bad words.
    r.raise_for_status()
    # Strip first, so blank lines and "#" comment lines are detected even when indented.
    stripped_lines = (line.strip().lower() for line in r.text.splitlines())
    repo_badwords = {w for w in stripped_lines if w and not w.startswith("#")}
    print("Fetched LDNOOBW English list items:", len(repo_badwords))
except requests.RequestException as e:
    # Best effort: continue with the local keyword list only.
    print("Could not fetch LDNOOBW list:", e)
    repo_badwords = set()

# Local keyword file: one entry per non-blank line, lower-cased.
local_kw_path = Path(DATA_DIR) / "hatespeech_keywords.txt"
local_bad = set()
if local_kw_path.exists():
    with open(local_kw_path, "r", encoding="utf-8") as f:
        local_bad = {w.lower() for w in map(str.strip, f) if w}
    print("Loaded local hatespeech_keywords:", len(local_bad))
else:
    print("No local hatespeech_keywords.txt found at", local_kw_path)

# union lists
badwords = repo_badwords | local_bad
print("Total badwords used for masking:", len(badwords))

# simple token-based masker (keeps punctuation)
def mask_text(text, badwords_set, max_phrase_words=6):
    """Replace listed words and phrases in ``text`` with asterisks.

    Matching is case-insensitive on the text side: each token is lower-cased
    before the lookup, so ``badwords_set`` is expected to hold lower-case
    entries. Multi-word entries (words joined by single spaces, e.g.
    "race card") are matched against consecutive tokens that are separated by
    whitespace only. A token-by-token lookup alone can never match such
    entries, because no single token contains a space.

    Args:
        text: the string to mask.
        badwords_set: collection of lower-case words / space-separated phrases.
        max_phrase_words: longest phrase (in words) that is looked up; values
            below 1 are treated as 1, i.e. single-word matching only.

    Returns:
        ``text`` with every character of each matched token replaced by "*".
        Everything between tokens (spaces, punctuation) is left untouched.
    """
    tokens = list(re.finditer(r"\b\w+\b", text))
    lowered = [m.group(0).lower() for m in tokens]
    hit = [False] * len(tokens)
    widest = max(1, max_phrase_words)

    for start in range(len(tokens)):
        last_stop = min(start + widest, len(tokens))
        for stop in range(start + 1, last_stop + 1):
            if stop - start > 1:
                # Extending the window: the newly added word must follow the
                # previous one across whitespace only, not punctuation.
                gap = text[tokens[stop - 2].end():tokens[stop - 1].start()]
                if not gap.isspace():
                    break
            if " ".join(lowered[start:stop]) in badwords_set:
                hit[start:stop] = [True] * (stop - start)

    # Rebuild the string, copying the text between tokens verbatim.
    pieces = []
    cursor = 0
    for match, masked in zip(tokens, hit):
        pieces.append(text[cursor:match.start()])
        word = match.group(0)
        pieces.append("*" * len(word) if masked else word)
        cursor = match.end()
    pieces.append(text[cursor:])
    return "".join(pieces)

# Quick check of the masker on one sample sentence.
sample_sentence = "u r nigga , you @someone! that's crazy"
print(mask_text(sample_sentence, badwords))


Fetched LDNOOBW English list items: 403
Loaded local hatespeech_keywords: 72
Total badwords used for masking: 467
u r ***** , you @someone! that's crazy


Small helper to optionally run on a small subset (for quick experiments)

In [None]:

# Optionally shrink the training split to at most MAX_TRAIN rows.
USE_SMALL_SUBSET = True
MAX_TRAIN = 4000
if USE_SMALL_SUBSET:
    print("Using small subset for quick experiments:", MAX_TRAIN)
    n_keep = min(len(train_df), MAX_TRAIN)
    # Fixed random_state keeps the sampled subset reproducible.
    train_df = train_df.sample(n=n_keep, random_state=42).reset_index(drop=True)
    print("Train shape now:", train_df.shape)


Using small subset for quick experiments: 4000
Train shape now: (4000, 7)


Prepare HuggingFace Datasets and Tokenizer/Model (DistilBERT)

In [None]:

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Name of the pretrained checkpoint whose tokenizer is loaded on the next line.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def hf_dataset_from_df(df):
    """Convert one split DataFrame into a Hugging Face ``Dataset``.

    Only the ``tweet_id``, ``clean_text`` and ``label`` columns are kept, and
    ``clean_text`` is renamed to ``text``.

    Args:
        df: DataFrame containing at least those three columns.

    Returns:
        A ``Dataset`` with the features ``tweet_id``, ``text`` and ``label``.
    """
    selected = df[["tweet_id", "clean_text", "label"]].rename(columns={"clean_text": "text"})
    # preserve_index=False: a frame whose index was not reset (e.g. the output
    # of train_test_split) would otherwise gain an extra "__index_level_0__"
    # column in the dataset.
    return Dataset.from_pandas(selected, preserve_index=False)

# Convert each split, then group the three datasets under their split names.
train_ds, val_ds, test_ds = (
    hf_dataset_from_df(split_frame) for split_frame in (train_df, val_df, test_df)
)

dataset_dict = DatasetDict(
    {
        "train": train_ds,
        "validation": val_ds,
        "test": test_ds,
    }
)
print(dataset_dict)

def tokenize_fn(batch):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 128.
    """
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(batch["text"], **encoding_options)
# Tokenize every split in batches, then drop the columns that are not label or tokenizer output.
dataset_tokenized = (
    dataset_dict
    .map(tokenize_fn, batched=True)
    .remove_columns(["tweet_id", "text"])
)
# Have the datasets return torch tensors.
dataset_tokenized.set_format("torch")
print("Tokenized datasets:", dataset_tokenized)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'text', 'label'],
        num_rows: 4000
    })
    validation: Dataset({
        features: ['tweet_id', 'text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['tweet_id', 'text', 'label'],
        num_rows: 10000
    })
})


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenized datasets: DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 4000
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})
