In [39]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
import re
from sklearn.metrics import accuracy_score, confusion_matrix
#%pip install xgboost

In [40]:
#%cd /Users/nolenhuang/Hate Speech/Hate-Speech-Detection-English
# Project-local preprocessing helpers. `add_signal_columns` is imported here but
# is not called in any of the cells visible in this notebook.
from hate_preproc import PreprocessConfig, preprocess_dataframe, add_signal_columns

# Load the raw training data, then preprocess it with the default configuration.
# Later cells read the "text_clean" and "label" columns from `df_clean`.
df = pd.read_csv("hate_speech_train.csv") # might need to update to train.csv
cfg = PreprocessConfig()
df_clean = preprocess_dataframe(df, cfg)

/Users/nolenhuang/Hate Speech/Hate-Speech-Detection-English


Stratified 5-Fold Cross Validation

In [41]:
# Use the binary label (0/1) as the stratification target. It is read from
# df_clean -- the frame every feature matrix in this notebook is built from --
# so the fold indices line up with the feature rows even if preprocessing
# drops or reorders rows of the raw frame.
y = df_clean["label"].to_numpy()

# StratifiedKFold.split only uses X for its length, so a placeholder is enough.
X_dummy = np.zeros(len(y))

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Sanity check: every fold should have the same positive rate in train and val.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_dummy, y), start=1):
    y_train, y_val = y[train_idx], y[val_idx]
    print(
        f"Fold {fold}: "
        f"train={len(train_idx)}, val={len(val_idx)}, "
        f"pos_rate_train={y_train.mean():.4f}, pos_rate_val={y_val.mean():.4f}"
    )

Fold 1: train=12000, val=3000, pos_rate_train=0.3000, pos_rate_val=0.3000
Fold 2: train=12000, val=3000, pos_rate_train=0.3000, pos_rate_val=0.3000
Fold 3: train=12000, val=3000, pos_rate_train=0.3000, pos_rate_val=0.3000
Fold 4: train=12000, val=3000, pos_rate_train=0.3000, pos_rate_val=0.3000
Fold 5: train=12000, val=3000, pos_rate_train=0.3000, pos_rate_val=0.3000


Count keywords for feature engineering

In [42]:
def generate_key_list_hate(df_in, size_table=200, ignore=3,
                           text_col="text_clean", label_col="label",
                           pos_label=1):
    """Rank vocabulary words by how strongly they lean towards the positive class.

    Each text is split into runs of ASCII letters, lower-cased, and tokens
    shorter than `ignore` characters are dropped. For every remaining word the
    function counts occurrences in positive and negative documents, the number
    of documents containing it, and derives a per-class TF-IDF score. Words are
    ranked by ``pos_tfidf - neg_tfidf`` (largest first).

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame holding the text and label columns.
    size_table : int
        Number of top-ranked words to keep.
    ignore : int
        Tokens with fewer than this many letters are skipped.
    text_col, label_col : str
        Column names of the text and the label.
    pos_label :
        Label value treated as the positive class; every other value
        (including missing labels) counts as negative.

    Returns
    -------
    keyword_dict : dict[str, int]
        Maps each selected word to its rank (0 = highest ``diff``).
    selected : pandas.DataFrame
        The `size_table` top rows of the word table, sorted by ``diff``.
    word_df : pandas.DataFrame
        The full word table, in order of first appearance in the corpus.
    """
    from collections import Counter

    pos_counts = Counter()  # token occurrences in positive documents
    neg_counts = Counter()  # token occurrences in all other documents
    doc_freq = Counter()    # number of documents containing the token

    n_docs = df_in.shape[0]

    # Walk the two columns directly: `df_in.iloc[i][col]` would materialise a
    # full row Series for every single access.
    for raw_text, label in zip(df_in[text_col], df_in[label_col]):
        text = "" if pd.isna(raw_text) else str(raw_text)
        tokens = [w.lower() for w in re.findall(r"[A-Za-z]+", text) if len(w) >= ignore]
        if not tokens:
            continue

        class_counts = pos_counts if label == pos_label else neg_counts
        class_counts.update(tokens)

        # One count per document; dict.fromkeys de-duplicates while keeping
        # first-occurrence order, so the table's row order stays deterministic.
        doc_freq.update(list(dict.fromkeys(tokens)))

    # Build table (Counter returns 0 for words never seen in a class)
    vocab = list(doc_freq)
    word_df = pd.DataFrame({
        "keyword": vocab,
        "neg_cnt": [neg_counts[w] for w in vocab],
        "pos_cnt": [pos_counts[w] for w in vocab],
        "df":      [doc_freq[w] for w in vocab],
    })

    # class sizes
    n_pos = (df_in[label_col] == pos_label).sum()
    n_neg = n_docs - n_pos

    # normalized TF: occurrences per document of that class (guard empty class)
    word_df["neg_tf"] = word_df["neg_cnt"].astype(float) / max(n_neg, 1)
    word_df["pos_tf"] = word_df["pos_cnt"].astype(float) / max(n_pos, 1)

    # IDF.
    # NOTE(review): the numerator is the vocabulary size (number of rows in the
    # word table), not the number of documents as in the textbook definition.
    # Kept as-is because it affects the ranking -- confirm this is intended.
    word_df["idf"] = np.log10(word_df.shape[0] / word_df["df"].astype(float).clip(lower=1.0))

    # TF-IDF
    word_df["neg_tfidf"] = word_df["neg_tf"] * word_df["idf"]
    word_df["pos_tfidf"] = word_df["pos_tf"] * word_df["idf"]

    # diff: "more hate" words rank higher
    word_df["diff"] = word_df["pos_tfidf"] - word_df["neg_tfidf"]

    selected = word_df.sort_values("diff", ascending=False).head(size_table)

    # Tokens are pure lower-case letter runs, so no whitespace stripping is needed.
    keyword_dict = {w: idx for idx, w in enumerate(selected["keyword"].tolist())}
    return keyword_dict, selected, word_df


View keywords

In [43]:
# Keyword-table settings: number of top-ranked words to keep, and the minimum
# token length (shorter tokens are skipped by generate_key_list_hate).
size_table = 300
word_len_ignored = 3

# Build the keyword -> column-index mapping used by the feature cells below,
# together with the selected rows (`top_words`) and the full table (`word_table`).
keyword_dict, top_words, word_table = generate_key_list_hate(
    df_clean,
    size_table=size_table,
    ignore=word_len_ignored,
    text_col="text_clean",
    label_col="label",
    pos_label=1
)

# Number of selected keywords, then a preview of the highest-ranked ones.
print(len(keyword_dict))
top_words.head(20)


300


Unnamed: 0,keyword,neg_cnt,pos_cnt,df,neg_tf,pos_tf,idf,neg_tfidf,pos_tfidf,diff
208,bitch,567,1631,2083,0.054,0.362444,0.947989,0.051191,0.343593,0.292402
95,bitches,248,617,824,0.023619,0.137111,1.350751,0.031903,0.185203,0.1533
78,hoes,192,487,648,0.018286,0.108222,1.455103,0.026608,0.157475,0.130867
368,pussy,135,413,524,0.012857,0.091778,1.547347,0.019894,0.142012,0.122118
31,hoe,159,390,516,0.015143,0.086667,1.554029,0.023532,0.134682,0.11115
110,nigga,56,288,317,0.005333,0.064,1.765619,0.009417,0.113,0.103583
650,ass,158,338,457,0.015048,0.075111,1.606762,0.024178,0.120686,0.096508
648,fuck,129,308,409,0.012286,0.068444,1.654955,0.020332,0.113272,0.09294
364,don,102,274,357,0.009714,0.060889,1.71401,0.01665,0.104364,0.087714
232,shit,126,258,366,0.012,0.057333,1.703197,0.020438,0.09765,0.077212


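'neg_cnt': number of occurrences in 'label = 0' texts
'pos_cnt': number of occurrences in 'label = 1' texts
'df': number of texts containing the word (one count per text)
'neg_tf': occurrences per 'label = 0' text (neg_cnt / number of 'label = 0' texts)
'pos_tf': occurrences per 'label = 1' text (pos_cnt / number of 'label = 1' texts)
'diff': 'pos_tfidf - neg_tfidf'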

Text to Vector

In [44]:
def convert_text_to_vec(text, keyword_dict):
    """Encode one text as a 0/1 keyword-presence vector.

    Parameters
    ----------
    text :
        The text to encode. Missing values (None, NaN, pd.NA) are treated as
        empty text, the same way generate_key_list_hate treats them; any other
        value is converted with ``str()``.
    keyword_dict : dict[str, int]
        Maps a lower-case keyword to its column index in the output vector.

    Returns
    -------
    numpy.ndarray
        int32 vector of length ``len(keyword_dict)``; position ``i`` is 1 when
        the keyword mapped to ``i`` occurs in `text` (case-insensitive).
    """
    vec = np.zeros(len(keyword_dict), dtype=np.int32)

    # Without this guard a NaN cell would be stringified to "nan" and could
    # match a keyword of that spelling. The is_scalar check keeps pd.isna from
    # returning an array for list-like input.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return vec

    for token in re.findall(r"[A-Za-z]+", str(text)):
        col = keyword_dict.get(token.lower())
        if col is not None:
            vec[col] = 1
    return vec


In [45]:
def df_to_features(df_in, keyword_dict, text_col="text_clean", label_col="label", has_label=True):
    """Turn a frame of texts into a keyword-presence feature matrix.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame containing `text_col` (and `label_col` when `has_label` is True).
    keyword_dict : dict[str, int]
        Keyword -> column-index mapping passed on to convert_text_to_vec.
    text_col, label_col : str
        Column names of the text and the label.
    has_label : bool
        When True the labels are returned as well.

    Returns
    -------
    X : numpy.ndarray
        int32 array of shape ``(len(df_in), len(keyword_dict))``.
    y : numpy.ndarray
        Labels cast to int; only returned (as ``(X, y)``) when `has_label`
        is True.
    """
    n_rows = df_in.shape[0]
    n_keywords = len(keyword_dict)

    X = np.zeros((n_rows, n_keywords), dtype=np.int32)

    # Iterate the text column directly: `df_in.iloc[i][text_col]` builds a whole
    # row Series on every step, which is needless overhead on large frames.
    for row, text in enumerate(df_in[text_col]):
        X[row, :] = convert_text_to_vec(text, keyword_dict)

    if not has_label:
        return X

    y = df_in[label_col].to_numpy().astype(int)
    return X, y

In [46]:
# Train features: X is the (n_rows, n_keywords) 0/1 keyword-presence matrix and
# y the integer labels, both built from df_clean. This rebinds the `y` that the
# stratified K-fold cell defined earlier.
X, y = df_to_features(df_clean, keyword_dict, text_col="text_clean", has_label=True)

In [47]:
#df_test = pd.read_csv("hate_speech_test.csv")
#df_test_clean = preprocess_dataframe(df_test, cfg)

#X_test = df_to_features(df_test_clean, keyword_dict, text_col="text_clean", has_label=False)
#print(X_test.shape)


Model

In [48]:
def fit_and_eval(model, X_train, y_train, X_val=None, y_val=None):
    """Fit `model` and report accuracy and confusion matrix.

    Parameters
    ----------
    model :
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train :
        Training features and labels.
    X_val, y_val : optional
        Held-out features and labels. Must be given together. When provided,
        validation metrics are added to the result.

    Returns
    -------
    model :
        The same estimator object, after fitting.
    metrics : dict
        Always contains "train_accuracy" (float) and "train_confusion_matrix".
        Contains "val_accuracy" and "val_confusion_matrix" as well when a
        validation set was passed.

    Raises
    ------
    ValueError
        If only one of `X_val` / `y_val` is supplied.
    """
    if (X_val is None) != (y_val is None):
        raise ValueError("X_val and y_val must be provided together")

    # fit
    model.fit(X_train, y_train)

    # Predictions on the training rows: a baseline check only, since the model
    # has already seen these labels.
    train_pred = model.predict(X_train)

    # metrics
    metrics = {
        "train_accuracy": float(accuracy_score(y_train, train_pred)),
        "train_confusion_matrix": confusion_matrix(y_train, train_pred),
    }

    # Optional held-out evaluation (e.g. one fold of a cross-validation split)
    if X_val is not None:
        val_pred = model.predict(X_val)
        metrics["val_accuracy"] = float(accuracy_score(y_val, val_pred))
        metrics["val_confusion_matrix"] = confusion_matrix(y_val, val_pred)

    return model, metrics

In [49]:
# Accumulates one dict per model (model_name plus the metrics returned by
# fit_and_eval). Re-running this cell clears the list; re-running a model cell
# without it appends a duplicate row.
results = []

Logistic Regression

In [50]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline on the keyword features. fit_and_eval returns the
# fitted estimator (bound to `model`) together with its training-set metrics.
model, metrics = fit_and_eval(
    LogisticRegression(max_iter=4000, class_weight="balanced", random_state=42),
    X,
    y,
)
results.append({"model_name": "LogisticRegression", **metrics})


## BernoulliNB

In [51]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default settings on the 0/1 keyword features.
# fit_and_eval returns the fitted estimator plus its training-set metrics.
model, metrics = fit_and_eval(BernoulliNB(), X, y)
results.append({"model_name": "BernoulliNB", **metrics})


## Random Forest

In [52]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest (10 unrestricted-depth trees) scored on its own training rows.
# NOTE(review): the submission cell fits RandomForestClassifier(n_estimators=200,
# random_state=42), so the metrics recorded here describe a different configuration.
model, metrics = fit_and_eval(
    RandomForestClassifier(
        n_estimators=10,
        max_depth=None,
        min_samples_split=2,
        random_state=0,
    ),
    X,
    y,
)

results.append({"model_name": "Random Forest", **metrics})


Linear SVM (faster)

In [53]:
from sklearn.svm import LinearSVC

# Linear-kernel SVM with a fixed seed, evaluated on the training rows via fit_and_eval.
model, metrics = fit_and_eval(LinearSVC(random_state=42), X, y)

results.append({"model_name": "Linear SVM", **metrics})


SVM

In [54]:
from sklearn.svm import SVC

# RBF-kernel SVM, evaluated on the training rows via fit_and_eval.
model, metrics = fit_and_eval(
    SVC(kernel="rbf", C=1.0, gamma="scale", random_state=42),
    X,
    y,
)

results.append({"model_name": "SVM", **metrics})

XGBoost

In [55]:
from xgboost import XGBClassifier

# Gradient-boosted trees with row and column subsampling, evaluated on the
# training rows via fit_and_eval.
model, metrics = fit_and_eval(
    XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1,
    ),
    X,
    y,
)
results.append({"model_name": "XGB", **metrics})


In [56]:
# Side-by-side comparison of the fitted models. The accuracy shown is measured
# on the same rows each model was trained on, so it is not a held-out estimate.
pd.DataFrame(results)[["model_name", "train_accuracy"]]

Unnamed: 0,model_name,train_accuracy
0,LogisticRegression,0.882467
1,BernoulliNB,0.845667
2,Random Forest,0.945067
3,Linear SVM,0.885733
4,SVM,0.925
5,XGB,0.893


Generate submission file

In [57]:
import pandas as pd

# Choose the model to submit: keep exactly one assignment uncommented.
# NOTE(review): the comparison table earlier in the notebook reports accuracy on
# the training rows only, so it does not show which model generalises best.
#model = BernoulliNB()
#model = LogisticRegression(max_iter=4000, class_weight="balanced", random_state=42)
model = RandomForestClassifier(n_estimators=200, random_state=42)
#model = LinearSVC(random_state=42)
#model = SVC(kernel="rbf", C=1.0, gamma="scale", random_state=42)
#model = XGBClassifier(
#    n_estimators=300,
#    max_depth=6,
#    learning_rate=0.1,
#    subsample=0.8,
#    colsample_bytree=0.8,
#    reg_lambda=1.0,
#    objective="binary:logistic",
#    eval_metric="logloss",
#    random_state=42,
#    n_jobs=-1
#)

# train features
X_train, y_train = df_to_features(df_clean, keyword_dict, text_col="text_clean", has_label=True)

# test features (same preprocessing config and keyword mapping as for training)
df_test = pd.read_csv("hate_speech_test.csv") # might need to update to test.csv
df_test_clean = preprocess_dataframe(df_test, cfg)
X_test = df_to_features(df_test_clean, keyword_dict, text_col="text_clean", has_label=False)

# Predictions are paired with df_test["id"] purely by position, so preprocessing
# must keep exactly one row per test row. Fail early with a clear message.
if len(df_test_clean) != len(df_test):
    raise ValueError(
        f"preprocess_dataframe changed the number of test rows "
        f"({len(df_test)} -> {len(df_test_clean)}); ids and predictions would not align"
    )

# fit + predict
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# submission (must be columns: id, label)
predictions = pd.DataFrame({
    "id": df_test["id"].values,
    "label": y_pred.astype(int)
})

# Kaggle rule: index = FALSE
predictions.to_csv("submission.csv", index=False)