In [None]:
!pip install transformers datasets
import pandas as pd
import numpy as np
import joblib
import tensorflow as tf
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report
from collections import Counter
from datasets import Dataset  # Import Dataset for batching

# Fetch the NLTK resources this notebook relies on: tokenizer data for
# word_tokenize and the stopword corpus.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)

# English stopwords, held in a set so membership checks during filtering are cheap.
stop_words = set(stopwords.words('english'))

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load Models
print("Loading models...")

# Every saved artifact lives under one project folder; keeping the root in a
# single constant means only this line changes if the project is moved.
PROJECT_DIR = "/content/drive/MyDrive/ProjectMonkeyPox"
TRADITIONAL_MODELS_DIR = f"{PROJECT_DIR}/Saved_TraditionalModels"

# DistilBERT sequence classifier and its tokenizer are read from the same directory.
distilbert_path = f"{PROJECT_DIR}/Saved_DistilBert"
distilbert_model = TFDistilBertForSequenceClassification.from_pretrained(distilbert_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(distilbert_path)

# Keras CNN stored as a single HDF5 file.
cnn_model = load_model(f"{PROJECT_DIR}/Saved_CNN/cnn_model.h5")

# NOTE(security): joblib.load unpickles its input, and unpickling can execute
# arbitrary code. Only load these files from a location you trust.
logreg_model = joblib.load(f"{TRADITIONAL_MODELS_DIR}/log_reg.pkl")
rf_model = joblib.load(f"{TRADITIONAL_MODELS_DIR}/rf.pkl")
xgb_model = joblib.load(f"{TRADITIONAL_MODELS_DIR}/xgb.pkl")
# The TF-IDF vectorizer produces the feature matrix fed to the CNN and the
# three traditional models.
tfidf_vectorizer = joblib.load(f"{TRADITIONAL_MODELS_DIR}/vectorizer.pkl")

# Preprocessing function
def preprocess_text(text):
    """Normalize a single document before it is handed to the models.

    The text is lowercased, every character other than ``a``-``z`` and
    whitespace is removed, the result is tokenized with NLTK's
    ``word_tokenize`` and English stopwords are dropped.

    Args:
        text: The raw document. ``None`` and float NaN (missing values) are
            treated as an empty document instead of raising; any other
            non-string value is converted with ``str``.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when
        nothing is left.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = '' if is_missing else str(text)

    text = text.lower()
    # Digits, punctuation and non-ASCII letters are all stripped here.
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

# Function to preprocess entire dataset at once
def preprocess_dataset(texts):
    """Run ``preprocess_text`` over every document in ``texts``.

    Returns a new list of cleaned strings in the same order as the input.
    """
    return list(map(preprocess_text, texts))

# Function to get model predictions in batch
def get_model_hard_predictions(texts):
    """Return every model's hard class prediction for a batch of raw texts.

    The texts are cleaned with ``preprocess_dataset`` and then encoded twice:
    with the DistilBERT tokenizer for ``distilbert_model`` and with the TF-IDF
    vectorizer for the CNN, logistic regression, random forest and XGBoost
    models. Each model's prediction is the argmax over its class scores.

    Args:
        texts: A sequence of raw documents.

    Returns:
        numpy.ndarray: Integer array of shape ``(num_samples, 5)``. Columns are,
        in order: DistilBERT, CNN, logistic regression, random forest, XGBoost.
        An empty input yields an array of shape ``(0, 5)``.
    """
    print("Preprocessing text...")
    cleaned_texts = preprocess_dataset(texts)

    # Nothing to encode or predict; return an empty result with the usual column count.
    if len(cleaned_texts) == 0:
        return np.empty((0, 5), dtype=np.int64)

    print("Preparing inputs...")
    bert_inputs = tokenizer(cleaned_texts, padding=True, truncation=True, max_length=512, return_tensors="tf")
    # Dense matrix: the same features are passed to the Keras CNN and to the
    # scikit-learn / XGBoost models below.
    tfidf_inputs = tfidf_vectorizer.transform(cleaned_texts).toarray()

    print("Predicting with models...")
    # DistilBERT predictions.
    # padding=True pads every text to the longest one in the batch, so the
    # attention mask has to accompany input_ids. Without it the model attends
    # to the pad tokens and a text's prediction changes with its batch mates.
    bert_logits = distilbert_model(
        {
            "input_ids": bert_inputs["input_ids"],
            "attention_mask": bert_inputs["attention_mask"],
        }
    ).logits
    bert_classes = np.argmax(bert_logits.numpy(), axis=1)

    # CNN predictions
    cnn_classes = np.argmax(cnn_model.predict(tfidf_inputs, verbose=0), axis=1)

    # Logistic Regression predictions
    logreg_classes = np.argmax(logreg_model.predict_proba(tfidf_inputs), axis=1)

    # Random Forest predictions
    rf_classes = np.argmax(rf_model.predict_proba(tfidf_inputs), axis=1)

    # XGBoost predictions
    xgb_classes = np.argmax(xgb_model.predict_proba(tfidf_inputs), axis=1)

    # One row per sample, one column per model.
    return np.vstack([bert_classes, cnn_classes, logreg_classes, rf_classes, xgb_classes]).T  # Shape: (num_samples, 5)

# Function to apply majority voting across all predictions
def majority_voting_batch(texts):
    """Predict one sentiment label per text by majority vote over the models.

    Each row returned by ``get_model_hard_predictions`` holds one class index
    per model; the most frequent index wins. On a tie, the tied class that
    appears first in the row (i.e. was voted for by the earliest model) wins.

    Returns:
        list[str]: One of 'Negative', 'Neutral' or 'Positive' per input text.
    """
    label_names = ('Negative', 'Neutral', 'Positive')
    votes_per_sample = get_model_hard_predictions(texts)

    print("Applying majority voting...")
    voted_labels = []
    for votes in votes_per_sample:
        winning_class, _vote_count = Counter(votes).most_common(1)[0]
        voted_labels.append(label_names[winning_class])
    return voted_labels

# Function to evaluate dataset (predicts on a random sample of the rows, batch by batch)
def evaluate_dataset_majority_voting(dataset_path, batch_size=32, sample_frac=0.5):
    """Run the majority-vote ensemble over a CSV dataset and print a report.

    A random sample of the rows (fixed seed 42) is classified in batches with
    ``majority_voting_batch``. The 'Sentiment' column is collapsed to
    Negative / Neutral / Positive and compared against the predictions.

    Args:
        dataset_path: Path to a CSV file with 'Translated Post Description'
            and 'Sentiment' columns.
        batch_size: Number of texts sent to the models per call. Must be >= 1.
        sample_frac: Fraction of rows to evaluate. The default 0.5 evaluates
            half of the dataset.

    Returns:
        pandas.DataFrame: The sampled rows with 'Predicted_Sentiment' and
        'Actual_Sentiment' columns added. 'Actual_Sentiment' is NaN where the
        original 'Sentiment' value has no mapping.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print("Loading dataset...")
    df = pd.read_csv(dataset_path)

    # Evaluate a reproducible random subset of the rows.
    df = df.sample(frac=sample_frac, random_state=42).reset_index(drop=True)

    # Plain list of strings; missing descriptions become '' so every row still
    # receives a prediction and the predictions stay aligned with df.
    texts = df['Translated Post Description'].fillna('').astype(str).tolist()

    # Get final predictions in batches
    all_predictions = []
    for start in range(0, len(texts), batch_size):
        all_predictions.extend(majority_voting_batch(texts[start:start + batch_size]))

    df['Predicted_Sentiment'] = all_predictions  # Assign all predictions to the dataframe

    # Collapse the dataset's sentiment values into the three predicted classes.
    sentiment_mapping = {
        "anger": "Negative",
        "sadness": "Negative",
        "neutral": "Neutral",
        "joy": "Positive"
    }
    df['Actual_Sentiment'] = df['Sentiment'].map(sentiment_mapping)

    # Rows whose Sentiment is not in the mapping have no ground truth to score
    # against; mixing their NaN with string labels would make the report fail.
    has_ground_truth = df['Actual_Sentiment'].notna()
    unmapped_count = int((~has_ground_truth).sum())

    # Generate classification report
    sentiment_labels = ['Negative', 'Neutral', 'Positive']
    print("\nClassification Report:")
    if unmapped_count:
        print(f"Note: {unmapped_count} row(s) have a Sentiment outside the mapping and are excluded from the report.")
    if has_ground_truth.any():
        # Passing labels explicitly pins each target name to its class even
        # when a class is missing from the sample.
        print(classification_report(
            df.loc[has_ground_truth, 'Actual_Sentiment'],
            df.loc[has_ground_truth, 'Predicted_Sentiment'],
            labels=sentiment_labels,
            target_names=sentiment_labels,
        ))
    else:
        print("No rows with a mapped Sentiment; nothing to report.")

    return df

Loading models...


All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/ProjectMonkeyPox/Saved_DistilBert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [None]:
# Run the majority-vote ensemble evaluation on the Monkeypox dataset.
dataset_path = "/content/drive/MyDrive/ProjectMonkeyPox/Monkeypox Dataset.csv"
evaluated_df = evaluate_dataset_majority_voting(dataset_path)  # batch_size is not passed, so the function's default (32) applies

Loading dataset...
Preprocessing text...
Preparing inputs...
Predicting with models...
Applying majority voting...
Preprocessing text...
Preparing inputs...
Predicting with models...
Applying majority voting...
Preprocessing text...
Preparing inputs...
Predicting with models...
Applying majority voting...
Preprocessing text...
Preparing inputs...
Predicting with models...
Applying majority voting...
Preprocessing text...
Preparing inputs...
Predicting with models...
Applying majority voting...
Preprocessing text...
Preparing inputs...
Predicting with models...
Applying majority voting...
Preprocessing text...
Preparing inputs...
Predicting with models...
Applying majority voting...
Preprocessing text...
Preparing inputs...
Predicting with models...
Applying majority voting...
Preprocessing text...
Preparing inputs...
Predicting with models...
Applying majority voting...
Preprocessing text...
Preparing inputs...
Predicting with models...
Applying majority voting...
Preprocessing text...