### 🔴 Model Training

This notebook covers:
1. Loading the vectorized data
2. Training multiple machine learning classifiers
3. Evaluating each model
4. Saving the best one


In [13]:
# Importing Libraries
import os
from pathlib import Path

import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Loading Preprocessed Data
# Directory holding the cleaned CSV and the pickled TF-IDF vectorizer.
# Defaults to the original local location; set the HATE_SPEECH_DATA_DIR
# environment variable to run the notebook on another machine.
DATA_DIR = Path(
    os.environ.get(
        "HATE_SPEECH_DATA_DIR",
        "/Users/ratulmukherjee/Desktop/Hate Speech Detection/data",
    )
)

# Load cleaned data
df = pd.read_csv(DATA_DIR / "cleaned_data.csv")

# Load vectorizer
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load artifacts that were produced by this project.
tfidf = joblib.load(DATA_DIR / "tfidf_vectorizer.pkl")

# Transform clean tweets into TF-IDF features
# NaN entries in 'clean_tweet' are passed to the vectorizer as empty strings.
X = tfidf.transform(df['clean_tweet'].fillna(""))
y = df['class']

In [15]:
# Train - Test Split
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
TEST_SIZE = 0.2
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

### Training Multiple Models

In [16]:
# 1. Multinomial Naive Bayes
# Fit on the training split, predict the held-out split, report accuracy.
nb = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy:", nb_accuracy)


Naive Bayes Accuracy: 0.834375630421626


In [17]:
# 2. Logistic Regression
# max_iter=1000 caps the number of solver iterations.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.8946943715957232


In [18]:
# 3. Support Vector Machine (LinearSVC)
# random_state is fixed (same seed as the train/test split) so that any data
# shuffling done by the underlying solver is repeatable and the reported
# accuracy does not drift between kernel restarts.
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

SVM Accuracy: 0.8842041557393585


In [19]:
# 4. Random Forest
# random_state is fixed (same seed as the train/test split): without it the
# bootstrap samples and per-split feature subsets differ on every run, so the
# accuracy printed below -- and the model that gets saved -- would not be
# reproducible under Restart & Run All.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Random Forest Accuracy: 0.8959047811176114


In [20]:
# Model Evaluation
def evaluate_model(name, y_true, y_pred):
    """Print accuracy, confusion matrix and classification report for one model.

    Parameters
    ----------
    name : str
        Label shown in the banner line above the metrics.
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Labels predicted by the model, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are only printed, not returned.
    """
    acc = accuracy_score(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)

    print(f"\n=== {name} ===")
    print("Accuracy:", acc)
    print("Confusion Matrix:\n", matrix)
    print("Classification Report:\n", report)

# Report every trained model against the same held-out labels, in training order.
predictions_by_model = [
    ("Naive Bayes", y_pred_nb),
    ("Logistic Regression", y_pred_lr),
    ("SVM", y_pred_svm),
    ("Random Forest", y_pred_rf),
]
for model_name, model_predictions in predictions_by_model:
    evaluate_model(model_name, y_test, model_predictions)


=== Naive Bayes ===
Accuracy: 0.834375630421626
Confusion Matrix:
 [[   3  270   13]
 [   2 3805   31]
 [   0  505  328]]
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.01      0.02       286
           1       0.83      0.99      0.90      3838
           2       0.88      0.39      0.54       833

    accuracy                           0.83      4957
   macro avg       0.77      0.47      0.49      4957
weighted avg       0.83      0.83      0.79      4957


=== Logistic Regression ===
Accuracy: 0.8946943715957232
Confusion Matrix:
 [[  49  214   23]
 [  27 3702  109]
 [   1  148  684]]
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.17      0.27       286
           1       0.91      0.96      0.94      3838
           2       0.84      0.82      0.83       833

    accuracy                           0.89      4957
   macro avg       0.80      0.65      0.68 

In [21]:
# Saving the best model, here - Random Forest
# (it has the highest test accuracy of the four classifiers trained above).
# joblib.dump stays the last expression so the cell still displays the list
# of written file names.
best_model_path = "/Users/ratulmukherjee/Desktop/Hate Speech Detection/data/best_model_rf.pkl"
joblib.dump(rf, best_model_path)


['/Users/ratulmukherjee/Desktop/Hate Speech Detection/data/best_model_rf.pkl']