In [5]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import pandas as pd
import numpy as np

In [15]:
def load_dataset(file_path, text_column, label_column):
    df = pd.read_csv(file_path)
    texts = df[text_column].values
    labels = df[label_column].values
    return texts, labels

In [2]:
def split_dataset(texts, labels, test_size=0.3, random_state=42):
    return train_test_split(texts, labels, test_size=test_size, random_state=random_state)

In [3]:
def tokenize_data(tokenizer, texts):
    encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=128, return_tensors='tf')
    return encodings['input_ids']

In [4]:
def build_model(pretrained_model_name, num_labels):
    model = TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name, num_labels=num_labels)
    return model

In [20]:
def compile_model(model, learning_rate=5e-3):
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy']
    )
    return model

In [8]:
def train_model(model, train_input_ids, train_labels, val_input_ids, val_labels, epochs=3, batch_size=64):
    history = model.fit(
        train_input_ids, 
        train_labels, 
        validation_data=(val_input_ids, val_labels),
        epochs=epochs,
        batch_size=batch_size
    )
    return history

In [9]:
def plot_loss(history):
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Training and Validation Loss')
    plt.show()

In [10]:
def evaluate_on_train(model, train_input_ids, train_labels, class_names):
    y_pred_train = np.argmax(model.predict(train_input_ids)[0], axis=1)
    print("Training Classification Report:\n", classification_report(train_labels, y_pred_train))
    
    conf_matrix_train = confusion_matrix(train_labels, y_pred_train)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix_train, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Training Confusion Matrix')
    plt.show()

    accuracy_train = accuracy_score(train_labels, y_pred_train)
    print(f"Training Accuracy: {accuracy_train * 100:.2f}%")
    return y_pred_train

In [11]:
def evaluate_model(model, input_ids, true_labels, class_names):
    y_pred = np.argmax(model.predict(input_ids)[0], axis=1)
    print("Classification Report:\n", classification_report(true_labels, y_pred))
    
    conf_matrix = confusion_matrix(true_labels, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    accuracy = accuracy_score(true_labels, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    return y_pred

In [12]:
def predict_on_new_data(saved_model_dir, new_test_file, text_column, output_file, max_length=128):
    model = TFAutoModelForSequenceClassification.from_pretrained(saved_model_dir)
    tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
    new_test_data = pd.read_csv(new_test_file)
    if text_column not in new_test_data.columns:
        raise ValueError(f"Column '{text_column}' not found in the new test dataset.")
    
    new_test_texts = new_test_data[text_column].values
    new_test_encodings = tokenize_data(tokenizer, new_test_texts, max_length=max_length)
    
    logits = model.predict(new_test_encodings)[0] 
    new_predictions = np.argmax(logits, axis=1)  
    new_test_data['Class (Predicted Label)'] = new_predictions
    new_test_data.to_csv(output_file, index=False)
    print(f"New test predictions saved to '{output_file}'.")

In [13]:
dataset_file = r"D:\epita class notes\semester - 3\action learnign\project repository\Hate_speech_detection_using_data_augmentation\Hate_speech_detection_using_data_augmentation\data\augmented_dataset\augmented_data_1.1.csv"
#output_predictions = "test_predictions.csv"

text_column = "corrected_tweet"
label_column = "class"

In [16]:
texts, labels = load_dataset(dataset_file, text_column, label_column)
X_train, X_test, y_train, y_test = split_dataset(texts, labels)

In [17]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
train_input_ids = tokenize_data(tokenizer, X_train)
test_input_ids = tokenize_data(tokenizer, X_test)

In [18]:
num_labels = len(set(labels))
model = build_model('bert-base-uncased', num_labels)
model = compile_model(model)





All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# model_save_dir = r"D:\epita class notes\semester - 3\action learnign\project repository\Hate_speech_detection_using_data_augmentation\Hate_speech_detection_using_data_augmentation\nlp_models\fine_tuned_bert_with_augmented_dataset.h5"
# model.save_pretrained(model_save_dir)
# tokenizer.save_pretrained(model_save_dir)
# print(f"Model saved to '{model_save_dir}'")

In [19]:
history = train_model(model, train_input_ids, y_train, test_input_ids, y_test)
plot_loss(history)

Epoch 1/3


 3/26 [==>...........................] - ETA: 6:13 - loss: 121.0716 - accuracy: 0.4479

KeyboardInterrupt: 

In [None]:
class_names = ["Class 0", "Class 1", "Class 2"]
train_predictions = evaluate_on_train(model, train_input_ids, y_train, class_names)

In [None]:
y_pred = evaluate_model(model, test_input_ids, y_test, class_names)
#save_predictions(X_test, y_test, y_pred, output_predictions)