# Offensive Language Detector — Training Notebook

We train and compare **three classic text classifiers** on tweets:
- **Logistic Regression**
- **Linear SVM** (probability-calibrated)
- **Complement Naive Bayes**

All models share the **same TF-IDF features (unigrams + bigrams)** so the comparison is fair. We tune a **decision threshold** on a validation set to meet **Recall ≥ 0.80** and, among thresholds that satisfy this, we pick the one with the **highest Precision** (tie-break by F1). The best model is evaluated on the test set and saved as an artifact for the FastAPI service.


In [1]:
import os, json, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import ComplementNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, roc_auc_score,
    average_precision_score, confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump

ARTIFACTS = Path('artifacts'); ARTIFACTS.mkdir(exist_ok=True)
SEED = 42
np.random.seed(SEED)

## 1) Load data and create splits

We use the **Kaggle “Hate Speech & Offensive Language”** dataset (`data/labeled_data.csv`) as the primary source.  
Labels are mapped to our binary target:
- **foul (1)** if `class ∈ {0, 1}` (hate/offensive)
- **proper (0)** if `class = 2` (neiher)


In [None]:
# --- Kaggle: Hate Speech & Offensive Language (Davidson et al.) ---
# Expect CSV with columns: tweet, class (0=hate, 1=offensive, 2=neither)
# Map to binary: foul=1 if class in {0,1}, else proper=0.

import os, re, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

ARTIFACTS = Path('artifacts'); ARTIFACTS.mkdir(exist_ok=True)
SEED = 42
np.random.seed(SEED)

def find_csv():
    """Be tolerant about where the notebook was launched from."""
    candidates = [
        Path("data/labeled_data.csv"),
        Path("../data/labeled_data.csv"),
        Path.cwd() / "data" / "labeled_data.csv",
        Path.cwd().parent / "data" / "labeled_data.csv",
        Path("labeled_data.csv"),
    ]
    for p in candidates:
        if p.exists():
            return p
    raise FileNotFoundError(f"CSV not found. Tried: {[str(p) for p in candidates]}")

def load_offensive_dataset(csv_path=None):
    csv_path = Path(csv_path) if csv_path else find_csv()
    print("Using CSV:", csv_path.resolve())

    # latin-1 is common for this file; ignore bad lines if any
    df = pd.read_csv(csv_path, encoding="latin-1", on_bad_lines="skip")

    assert 'tweet' in df.columns, f"'tweet' column missing. Columns: {list(df.columns)[:10]}"
    assert 'class' in df.columns, f"'class' column missing. Columns: {list(df.columns)[:10]}"

    df = df[['tweet','class']].dropna().copy()
    df['text'] = df['tweet'].astype(str)
    df['label'] = df['class'].apply(lambda c: 1 if c in (0,1) else 0)
    df['text'] = df['text'].str.strip()
    df = df[df['text'].str.len() > 0]

    # train/test split (we'll make validation from train)
    tr, te = train_test_split(df[['text','label']], test_size=0.20,
                              random_state=SEED, stratify=df['label'])
    return tr.reset_index(drop=True), te.reset_index(drop=True)

def basic_clean(text: str) -> str:
    t = str(text)
    t = t.replace("RT ", " ")                         # drop RT marker
    t = re.sub(r'https?://\S+|www\.\S+', '<URL>', t)  # keep URL token
    t = re.sub(r'@\w+', '<USER>', t)                  # keep mention token
    return t.strip()

# Load + clean
train_df, test_df = load_offensive_dataset()
for df_ in (train_df, test_df):
    df_['text'] = df_['text'].astype(str).map(basic_clean)

# De-duplicate AFTER cleaning
train_df = train_df[['text','label']].drop_duplicates().reset_index(drop=True)
test_df  = test_df[['text','label']].drop_duplicates().reset_index(drop=True)

# Make validation from train
train_df, val_df = train_test_split(train_df, test_size=0.20,
                                    random_state=SEED, stratify=train_df['label'])

print("Class ratio (foul=1):",
      train_df['label'].mean(), val_df['label'].mean(), test_df['label'].mean())
print("Sizes:", train_df.shape, val_df.shape, test_df.shape)


False Positives (pred foul, true proper):


Unnamed: 0,text,score
416,Up early then a bitch driving to denton omg ca...,0.996118
2780,<USER>: <USER> np big nig nig,0.995004
260,No girls no hoes just me myself and I,0.990686
4680,"I would bitch about the weather being bipolar,...",0.98078
1696,I'm jus a trill nicca mane,0.970349


False Negatives (pred proper, true foul):


Unnamed: 0,text,score
665,<USER>: Scarlett Johansson <URL>,0.003365
4440,<USER>: Trent Richardson so trash,0.011651
418,<USER> trash,0.020228
1013,<USER> monkey dong,0.025542
2771,Roy Hibbert so trash. It's unbelievable.,0.036012


Class ratio (foul=1): 0.832602757131059 0.8325711382113821 0.831783259199353
Sizes: (15741, 2) (3936, 2) (4946, 2)


## 2) Shared TF-IDF features
We use the same feature extractor for all models: unigrams + bigrams, Unicode accent stripping, and sensible frequency cutoffs.

In [3]:
tfidf = TfidfVectorizer(
    lowercase=True,
    strip_accents='unicode',
    ngram_range=(1,2),
    min_df=2, max_df=0.95
)
tfidf

## 3) Helper functions for threshold tuning and metrics

In [4]:
def choose_threshold_for_recall(y_true, scores, target_recall=0.80):
    """Pick threshold with Recall ≥ target; among those, maximize Precision (tie-break by F1)."""
    best = {'thr':0.5, 'prec':0.0, 'rec':0.0, 'f1':0.0}
    for t in np.unique(np.round(scores, 4)):
        y_hat = (scores >= t).astype(int)
        p, r, f1, _ = precision_recall_fscore_support(y_true, y_hat, average='binary', zero_division=0)
        if r >= target_recall:
            if (p > best['prec']) or (p==best['prec'] and f1>best['f1']):
                best = {'thr':float(t), 'prec':float(p), 'rec':float(r), 'f1':float(f1)}
    return best

def metrics_summary(y_true, y_hat, scores=None):
    acc = accuracy_score(y_true, y_hat)
    p_b, r_b, f1_b, _ = precision_recall_fscore_support(y_true, y_hat, average='binary', zero_division=0)
    p_m, r_m, f1_m, _ = precision_recall_fscore_support(y_true, y_hat, average='macro', zero_division=0)
    p_w, r_w, f_w, _ = precision_recall_fscore_support(y_true, y_hat, average='weighted', zero_division=0)
    out = dict(accuracy=acc,
               precision_binary=p_b, recall_binary=r_b, f1_binary=f1_b,
               precision_macro=p_m,  recall_macro=r_m, f1_macro=f1_m,
               precision_weighted=p_w, recall_weighted=r_w, f1_weighted=f_w)
    if scores is not None:
        out['roc_auc'] = roc_auc_score(y_true, scores)
        out['pr_auc']  = average_precision_score(y_true, scores)
    return out

## 4) Build the three model pipelines

In [5]:
# Logistic Regression
pipe_logreg = Pipeline([
    ('tfidf', tfidf),
    ('clf', LogisticRegression(max_iter=2000, solver='liblinear', class_weight='balanced'))
])

# Linear SVM + Platt scaling (probabilities via calibration)
svm_base = LinearSVC(class_weight='balanced')
pipe_svm = Pipeline([
    ('tfidf', tfidf),
    ('clf', CalibratedClassifierCV(svm_base, method='sigmoid', cv=3))
])

# Complement Naive Bayes
pipe_cnb = Pipeline([
    ('tfidf', tfidf),
    ('clf', ComplementNB())
])

models = {
    'logreg': pipe_logreg,
    'linear_svm': pipe_svm,
    'comp_nb': pipe_cnb
}
list(models.keys())

['logreg', 'linear_svm', 'comp_nb']

## 5) Train each model and tune its decision threshold on the validation set

In [6]:
val_scores_table = {}
for name, pipe in models.items():
    print(f"Training {name} ...")
    pipe.fit(train_df['text'], train_df['label'])
    try:
        scores_val = pipe.predict_proba(val_df['text'])[:,1]
    except Exception:
        z = pipe.decision_function(val_df['text'])
        scores_val = 1.0/(1.0+np.exp(-z))
    choice = choose_threshold_for_recall(val_df['label'].values, scores_val, 0.80)
    preds_val = (scores_val >= choice['thr']).astype(int)
    metrics = metrics_summary(val_df['label'].values, preds_val, scores=scores_val)
    metrics.update(threshold=choice['thr'])
    val_scores_table[name] = metrics
    print(name, json.dumps(metrics, indent=2))

val_scores_table

# Save validation comparison across the 3 models
val_table = (pd.DataFrame(val_scores_table)
             .T
             .sort_values(['precision_binary','f1_binary'], ascending=False))
val_table.to_csv('artifacts/validation_model_compare.csv')
val_table.head()

Training logreg ...
logreg {
  "accuracy": 0.8328252032520326,
  "precision_binary": 0.9969639468690702,
  "recall_binary": 0.8016478486420506,
  "f1_binary": 0.888700947225981,
  "precision_macro": 0.7486741333115527,
  "recall_macro": 0.8947541215896141,
  "f1_macro": 0.7764933307558477,
  "precision_weighted": 0.9138221851137837,
  "recall_weighted": 0.8328252032520326,
  "f1_weighted": 0.8511273602067647,
  "roc_auc": 0.9769321564794033,
  "pr_auc": 0.9948078034771524,
  "threshold": 0.7462
}
Training linear_svm ...
linear_svm {
  "accuracy": 0.8373983739837398,
  "precision_binary": 0.9969845457972107,
  "recall_binary": 0.8071406774488862,
  "f1_binary": 0.8920741989881956,
  "precision_macro": 0.7521945332259632,
  "recall_macro": 0.8975005359930319,
  "f1_macro": 0.7812585207093399,
  "precision_weighted": 0.9150147194331598,
  "recall_weighted": 0.8373983739837398,
  "f1_weighted": 0.8549667132230706,
  "roc_auc": 0.9798332332349946,
  "pr_auc": 0.9951646492729794,
  "threshol

Unnamed: 0,accuracy,precision_binary,recall_binary,f1_binary,precision_macro,recall_macro,f1_macro,precision_weighted,recall_weighted,f1_weighted,roc_auc,pr_auc,threshold
linear_svm,0.837398,0.996985,0.807141,0.892074,0.752195,0.897501,0.781259,0.915015,0.837398,0.854967,0.979833,0.995165,0.9478
logreg,0.832825,0.996964,0.801648,0.888701,0.748674,0.894754,0.776493,0.913822,0.832825,0.851127,0.976932,0.994808,0.7462
comp_nb,0.82749,0.988713,0.801953,0.885594,0.740444,0.878215,0.767526,0.905579,0.82749,0.846058,0.959215,0.990885,0.8377


## 6) Pick a winner by precision (tie-break by F1), then evaluate on the test set

In [7]:
cands = [(n, m['precision_binary'], m['f1_binary']) for n,m in val_scores_table.items()]
cands.sort(key=lambda x: (x[1], x[2]), reverse=True)
best_name = cands[0][0]
best_threshold = val_scores_table[best_name]['threshold']
best_model = models[best_name]
print('Winner on validation:', best_name, 'with threshold', best_threshold)

try:
    scores_test = best_model.predict_proba(test_df['text'])[:,1]
except Exception:
    z = best_model.decision_function(test_df['text'])
    scores_test = 1.0/(1.0+np.exp(-z))
preds_test = (scores_test >= best_threshold).astype(int)
test_metrics = metrics_summary(test_df['label'].values, preds_test, scores=scores_test)
print('Test metrics for', best_name, json.dumps(test_metrics, indent=2))

from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
cm = confusion_matrix(test_df['label'].values, preds_test)
fig = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title(f'Confusion Matrix — {best_name}')
plt.xlabel('Predicted'); plt.ylabel('True')
plt.tight_layout(); plt.savefig(ARTIFACTS/'fig_cm.png', dpi=160); plt.close()
RocCurveDisplay.from_predictions(test_df['label'].values, scores_test)
plt.title(f'ROC — {best_name}')
plt.tight_layout(); plt.savefig(ARTIFACTS/'fig_roc.png', dpi=160); plt.close()
PrecisionRecallDisplay.from_predictions(test_df['label'].values, scores_test)
plt.title(f'PR — {best_name}')
plt.tight_layout(); plt.savefig(ARTIFACTS/'fig_pr.png', dpi=160); plt.close()

pd.DataFrame([test_metrics]).to_csv('artifacts/test_metrics_winner.csv', index=False)

Winner on validation: linear_svm with threshold 0.9478
Test metrics for linear_svm {
  "accuracy": 0.8358269308532147,
  "precision_binary": 0.9969897652016857,
  "recall_binary": 0.8050559066601848,
  "f1_binary": 0.8908015061861216,
  "precision_macro": 0.7515737003348331,
  "recall_macro": 0.8965183379454771,
  "f1_macro": 0.7800913068389892,
  "precision_weighted": 0.9144235840576416,
  "recall_weighted": 0.8358269308532147,
  "f1_weighted": 0.8535548883709926,
  "roc_auc": 0.9811894948300364,
  "pr_auc": 0.9960326156380565
}


## 7) Save the chosen pipeline + threshold for the FastAPI service

In [8]:
artifact = {
    'pipeline': best_model,
    'threshold': float(best_threshold),
    'label_map': {0: 'proper', 1: 'foul'},
    'model_name': best_name,
    'val_results': val_scores_table,
    'test_metrics': test_metrics
}
dump(artifact, ARTIFACTS/'model.joblib')
json.dump(test_metrics, open(ARTIFACTS/'metrics.json','w'), indent=2)
print('Saved:', ARTIFACTS/'model.joblib')

Saved: artifacts\model.joblib
