In [None]:
# Basic libraries
import pandas as pd
import numpy as np

# For text preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# For BERT
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tensorflow as tf


In [None]:
# Load the dataset: a CSV that must provide the 'post_tokens' column (the text
# fed to the models) and the 'label' column (the classification target), which
# are the two columns the train/test split below reads.
df = pd.read_csv('./data/hateXplain.csv')

# Fail fast with a readable message if an expected column is absent, instead of
# surfacing a bare KeyError from a later cell.
required_columns = ['post_tokens', 'label']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"hateXplain.csv is missing required column(s): {missing_columns}")

# Check for missing values (reported per column; nothing is dropped here)
print(df.isnull().sum())

# Basic exploration of the dataset: class balance, then the first rows
print(df['label'].value_counts())
print(df.head())


In [4]:
# Split dataset into training and testing sets (80-20 split).
# stratify keeps the class proportions of 'label' the same in both partitions,
# so the held-out metrics are not skewed by an uneven draw of a minority class.
X_train, X_test, y_train, y_test = train_test_split(
    df['post_tokens'],
    df['label'],
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible across runs
    stratify=df['label'],
)

# NOTE(review): after a CSV round-trip 'post_tokens' is presumably a stringified
# token list rather than raw text -- confirm the column holds what the
# vectorizer and the BERT tokenizer are expected to see.

# For traditional models: use TF-IDF to convert text into feature vectors.
# The vocabulary is fitted on the training split only and then re-used to
# transform the test split, so no test-set statistics leak into the features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to 5000 features for performance
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
# Multinomial Naive Bayes baseline, fitted on the TF-IDF training features
# (fit() returns the estimator itself, so construction and training chain).
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predicted labels for the held-out TF-IDF features
nb_predictions = nb_model.predict(X_test_tfidf)

# Score the predictions against the held-out labels, then report
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_report = classification_report(y_test, nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print(nb_report)


In [None]:
# Support Vector Machine baseline with a linear kernel, fitted on the TF-IDF
# training features (fit() returns the estimator itself).
svm_model = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Predicted labels for the held-out TF-IDF features
svm_predictions = svm_model.predict(X_test_tfidf)

# Score the predictions against the held-out labels, then report
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_report = classification_report(y_test, svm_predictions)
print("SVM Accuracy:", svm_accuracy)
print(svm_report)


In [None]:
# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_text(text, max_length=128):
    """Encode one text with the module-level BERT ``tokenizer``.

    Args:
        text: The text to encode.
        max_length: Length every encoding is padded or truncated to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        The tokenizer output for ``text``, requested as TensorFlow tensors
        (``return_tensors='tf'``), padded to ``max_length`` and truncated
        if longer.
    """
    return tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')

# Apply tokenization to both train and test sets.
# Each element is the encoding of a single example; the model-input cell
# later takes element [0] of every tensor when stacking them into a batch.
X_train_tokens = [tokenize_text(t) for t in X_train]
X_test_tokens = [tokenize_text(t) for t in X_test]

# Convert labels to numpy arrays for compatibility with TensorFlow.
# np.array() accepts its own output, so re-running this cell is harmless.
y_train = np.array(y_train)
y_test = np.array(y_test)


In [None]:
def _stack_encodings(encodings):
    """Stack per-example tokenizer outputs into one array per model input.

    Args:
        encodings: Sequence of tokenizer outputs, one per example; element
            ``[0]`` of each ``'input_ids'`` / ``'attention_mask'`` entry is
            taken as that example's row.

    Returns:
        Dict with ``'input_ids'`` and ``'attention_mask'`` numpy arrays, one
        row per example, in the same order as ``encodings``.
    """
    return {
        key: np.array([enc[key][0] for enc in encodings])
        for key in ('input_ids', 'attention_mask')
    }


# Size the classification head from the labels instead of assuming a binary
# task: sparse categorical cross-entropy needs every label id to be a valid
# logit index, so a third class with num_labels=2 would break training.
all_labels = np.concatenate([np.asarray(y_train), np.asarray(y_test)])
if np.issubdtype(all_labels.dtype, np.number):
    # Integer-style ids: the head must cover the largest id.
    num_labels = max(2, int(all_labels.max()) + 1)
else:
    # NOTE(review): non-numeric labels still have to be encoded to integer ids
    # before fit(); here they are only counted to size the head.
    num_labels = max(2, len(np.unique(all_labels)))

# Load pre-trained BERT model for sequence classification
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Compile the model with optimizer, loss, and metrics.
# from_logits=True because the loss is applied to raw logits, not probabilities.
bert_model.compile(optimizer=Adam(learning_rate=2e-5),
                   loss=SparseCategoricalCrossentropy(from_logits=True),
                   metrics=['accuracy'])

# Prepare inputs for the model
train_inputs = _stack_encodings(X_train_tokens)
test_inputs = _stack_encodings(X_test_tokens)

# Train the BERT model.
# The test split doubles as validation data here; it is only monitored during
# training (no early stopping or checkpoint selection is configured on it).
bert_model.fit(train_inputs, y_train, validation_data=(test_inputs, y_test), epochs=3, batch_size=16)


In [None]:
# Score the fine-tuned BERT model on the held-out inputs.
# evaluate() returns the loss followed by the compiled metrics, so index 1 is
# the 'accuracy' metric requested at compile time.
bert_eval = bert_model.evaluate(test_inputs, y_test)
bert_test_accuracy = bert_eval[1]
print("BERT Test Accuracy:", bert_test_accuracy)


In [None]:
# Recap of the headline accuracy for every model, one line each.
# The two scikit-learn baselines are re-scored from their stored predictions;
# the BERT figure is read back from the earlier evaluate() result.
for heading, predictions in (
    ("Naive Bayes Accuracy:", nb_predictions),
    ("SVM Accuracy:", svm_predictions),
):
    print(heading, accuracy_score(y_test, predictions))
print("BERT Test Accuracy:", bert_eval[1])
