In [15]:
# Basic libraries
import pandas as pd
import numpy as np

# For text preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# For BERT
from transformers import BertTokenizer, TFBertForSequenceClassification, TFAlbertForSequenceClassification, AlbertTokenizer, AlbertModel
import tensorflow as tf


In [16]:
# Load the HateXplain annotations from CSV. The columns used further down are
# 'post_tokens' (the text) and 'label' (the class name).
DATA_PATH = './data/hateXplain.csv'
# Only the first N_ROWS rows are kept (presumably to keep experiments fast).
N_ROWS = 1000
df = pd.read_csv(DATA_PATH).iloc[:N_ROWS]

# Count of missing values per column
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Class balance, then a preview of the first rows
class_counts = df['label'].value_counts()
print(class_counts)
preview = df.head()
print(preview)




post_id         0
annotator_id    0
label           0
target          1
post_tokens     0
dtype: int64
normal        515
hatespeech    296
offensive     189
Name: label, dtype: int64
                       post_id  annotator_id   label target  \
0  1179055004553900032_twitter             1  normal   None   
1  1179055004553900032_twitter             2  normal   None   
2  1179055004553900032_twitter             3  normal   None   
3  1179063826874032128_twitter             1  normal   None   
4  1179063826874032128_twitter             2  normal   None   

                                         post_tokens  
0  i dont think im getting my baby them white 9 h...  
1  i dont think im getting my baby them white 9 h...  
2  i dont think im getting my baby them white 9 h...  
3  we cannot continue calling ourselves feminists...  
4  we cannot continue calling ourselves feminists...  


In [17]:
# Encode the string class names as integer ids without mutating `df`.
# The Keras loss used for the transformer model later in this notebook
# (SparseCategoricalCrossentropy) needs integer targets, and the recorded
# classification reports already show classes 0/1/2, so the encoding has to
# happen before the split for a fresh "Restart & Run All" to work.
label_mapping = {'normal': 0, 'hatespeech': 1, 'offensive': 2}
labels_encoded = df['label']
if not pd.api.types.is_numeric_dtype(labels_encoded):  # skip if already encoded
    labels_encoded = labels_encoded.map(label_mapping)
if labels_encoded.isnull().any():
    raise ValueError("Found labels that are missing or not covered by label_mapping")
labels_encoded = labels_encoded.astype(int)

# NOTE(review): the dataset preview shows the same post repeated once per
# annotator, so a row-level random split can put copies of one post in both
# train and test and inflate the scores - consider splitting by 'post_id'.

# Split dataset into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(
    df['post_tokens'], labels_encoded, test_size=0.2, random_state=42
)

# For traditional models: use TF-IDF to convert text into feature vectors.
# The vocabulary is fitted on the training split only and reused for the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to 5000 features for performance
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [27]:
# Fit a Multinomial Naive Bayes classifier on the TF-IDF training features
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict a class for every held-out document
nb_predictions = nb_model.predict(X_test_tfidf)

# Report overall accuracy, then per-class precision / recall / F1
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_report = classification_report(y_test, nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print(nb_report)


Naive Bayes Accuracy: 0.745
              precision    recall  f1-score   support

           0       0.71      0.96      0.81        93
           1       0.79      0.72      0.75        67
           2       0.92      0.30      0.45        40

    accuracy                           0.74       200
   macro avg       0.81      0.66      0.67       200
weighted avg       0.78      0.74      0.72       200



In [28]:
# Build a linear-kernel Support Vector Machine and fit it on the TF-IDF features
svm_model = SVC(kernel='linear')
svm_model = svm_model.fit(X_train_tfidf, y_train)  # fit() returns the estimator itself

# Predict a class for every held-out document
svm_predictions = svm_model.predict(X_test_tfidf)

# Report overall accuracy, then per-class precision / recall / F1
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_report = classification_report(y_test, svm_predictions)
print("SVM Accuracy:", svm_accuracy)
print(svm_report)


SVM Accuracy: 0.8
              precision    recall  f1-score   support

           0       0.85      0.85      0.85        93
           1       0.76      0.82      0.79        67
           2       0.74      0.65      0.69        40

    accuracy                           0.80       200
   macro avg       0.79      0.77      0.78       200
weighted avg       0.80      0.80      0.80       200



In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer


# Disabled code, kept for reference: label encoding and a re-split of the data.
# # Map the labels (assuming 'normal' -> 0, 'hatespeech' -> 1, 'offensive' -> 2)
# label_mapping = {'normal': 0, 'hatespeech': 1, 'offensive': 2}
# df['label'] = df['label'].map(label_mapping)

# # Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(df['post_tokens'], df['label'], test_size=0.2, random_state=42)

# Load the pretrained ALBERT tokenizer (the model used in this notebook is
# ALBERT, even where it is referred to as "BERT").
ALBERT_CHECKPOINT = 'albert-base-v1'
tokenizer = AlbertTokenizer.from_pretrained(ALBERT_CHECKPOINT)

# Tokenize raw text for the ALBERT model
def tokenize_text(text, max_length=128):
    """Tokenize a batch of strings with the notebook-level ``tokenizer``.

    Parameters
    ----------
    text : str or iterable of str
        The documents to tokenize. Objects exposing ``.tolist()`` (such as a
        pandas Series) are converted with it; any other iterable is
        materialised with ``list``. A single string is treated as a batch of
        one document rather than being split into characters.
    max_length : int, default 128
        Every sequence is padded to, and truncated at, this many tokens.

    Returns
    -------
    The object returned by calling ``tokenizer``, requested as TensorFlow
    tensors (``return_tensors='tf'``).
    """
    if isinstance(text, str):
        texts = [text]
    elif hasattr(text, 'tolist'):
        texts = text.tolist()
    else:
        texts = list(text)

    return tokenizer(texts,
                     padding='max_length',
                     truncation=True,
                     max_length=max_length,
                     return_tensors='tf')

# Run the tokenizer over both splits: training documents first, then the held-out ones
train_encodings, test_encodings = (
    tokenize_text(split_texts) for split_texts in (X_train, X_test)
)


In [23]:
import tensorflow as tf

from transformers import TFBertForSequenceClassification, create_optimizer

# Mini-batch size shared by both tf.data pipelines and the step-count calculation below
batch_size = 128
# Number of passes over the training data; must match `epochs` passed to model.fit
num_epochs = 1

# Create batched TensorFlow datasets of (tokenizer outputs, labels)
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train.values)).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test.values)).batch(batch_size)

# Load pre-trained ALBERT for sequence classification (with 3 output labels)
model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v1', num_labels=3)

# Total number of optimizer steps. The last, partially filled batch is still a
# training step, so round the batches-per-epoch up instead of flooring;
# otherwise the learning-rate schedule reaches its end value before the final step.
steps_per_epoch = -(-len(X_train) // batch_size)  # ceiling division
num_train_steps = steps_per_epoch * num_epochs

# Create the optimizer together with its decaying learning-rate schedule.
# NOTE(review): with the default weight-decay rate this is expected to be plain
# Adam rather than AdamW - confirm against the installed transformers version.
optimizer, lr_schedule = create_optimizer(init_lr=2e-5, num_train_steps=num_train_steps, num_warmup_steps=0)

# Compile the model; the classification head emits raw logits, hence from_logits=True,
# and the sparse loss expects integer class ids as labels.
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])


All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Fine-tune the ALBERT classifier, reporting loss/accuracy on the test batches
# after each epoch. If `epochs` is changed, the step count given to
# create_optimizer should be changed with it.
model.fit(
    x=train_dataset,
    epochs=1,
    validation_data=test_dataset,
)
 



<tf_keras.src.callbacks.History at 0x248120dd660>

In [25]:
# Score the fine-tuned ALBERT model on the held-out batches; the result is
# unpacked as the loss followed by the accuracy metric set in compile().
eval_results = model.evaluate(test_dataset)
loss, accuracy = eval_results
print(f"Test Accuracy: {accuracy:.4f}")



Test Accuracy: 0.4750


In [29]:
# Side-by-side accuracy of the three models on the same held-out split.
# (The entry captioned "BERT" is the ALBERT model evaluated above.)
accuracy_summary = (
    ("Naive Bayes Accuracy:", accuracy_score(y_test, nb_predictions)),
    ("SVM Accuracy:", accuracy_score(y_test, svm_predictions)),
    ("BERT Test Accuracy:", accuracy),
)
for caption, score in accuracy_summary:
    print(caption, score)


Naive Bayes Accuracy: 0.745
SVM Accuracy: 0.8
BERT Test Accuracy: 0.4749999940395355
