In [None]:
# Install required packages
!pip install transformers datasets

import pandas as pd
import numpy as np
import joblib
import tensorflow as tf
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report
from collections import Counter
from datasets import Dataset  # Import Dataset for batching
from scipy.special import softmax

# Fetch the NLTK resources needed for tokenisation and stop-word filtering
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)
stop_words = set(stopwords.words('english'))

# Restore the fine-tuned DistilBERT model with its tokenizer, then the CNN
print("Loading models...")
distilbert_path = "/content/drive/MyDrive/ProjectMonkeyPox/Saved_DistilBert"
distilbert_model = TFDistilBertForSequenceClassification.from_pretrained(distilbert_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(distilbert_path)
cnn_model = load_model("/content/drive/MyDrive/ProjectMonkeyPox/Saved_CNN/cnn_model.h5")

# Restore the joblib-serialised classifiers and the TF-IDF vectorizer,
# loading them in the order the names are listed on the left-hand side
logreg_model, rf_model, xgb_model, tfidf_vectorizer = (
    joblib.load(artifact_path)
    for artifact_path in (
        "/content/drive/MyDrive/ProjectMonkeyPox/Saved_TraditionalModels/log_reg.pkl",
        "/content/drive/MyDrive/ProjectMonkeyPox/Saved_TraditionalModels/rf.pkl",
        "/content/drive/MyDrive/ProjectMonkeyPox/Saved_TraditionalModels/xgb.pkl",
        "/content/drive/MyDrive/ProjectMonkeyPox/Saved_TraditionalModels/vectorizer.pkl",
    )
)

# Preprocessing function
def preprocess_text(text):
    """Normalise one raw text into a space-joined string of filtered tokens.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is removed, the remainder is tokenised with ``word_tokenize``
    and tokens present in ``stop_words`` are dropped.

    Args:
        text: The raw text. ``None`` and NaN (e.g. an empty cell read from a
            CSV file) are treated as empty text; any other non-string value
            is converted with ``str`` first.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when nothing
        survives the cleaning.
    """
    if text is None or (isinstance(text, float) and text != text):
        # Missing value: NaN is the only float that is not equal to itself.
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    text = re.sub(r'[^a-z\s]', '', text.lower())
    if not text.strip():
        # Nothing left to tokenise, so skip the tokenizer entirely.
        return ''

    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Function to preprocess entire dataset at once
def preprocess_dataset(texts):
    return [preprocess_text(text) for text in texts]

# Function to get model predictions in batch (using soft voting)
def get_model_soft_predictions(texts):
    """Predict a sentiment label for each text by soft voting over five models.

    The texts are cleaned with ``preprocess_dataset``, scored by DistilBERT,
    the CNN, logistic regression, random forest and XGBoost, and the per-class
    probabilities of the five models are averaged with equal weight. The class
    with the highest averaged probability is returned for each text.

    Args:
        texts: A sequence of raw texts forming one batch.

    Returns:
        A list with one of ``'Negative'``, ``'Neutral'`` or ``'Positive'`` per
        input text, in input order. An empty batch yields an empty list.
    """
    if len(texts) == 0:
        # Nothing to score; the tokenizer and the models need at least one text.
        return []

    print("Preprocessing text...")
    cleaned_texts = preprocess_dataset(texts)

    print("Preparing inputs...")
    bert_inputs = tokenizer(cleaned_texts, padding=True, truncation=True, max_length=512, return_tensors="tf")
    tfidf_inputs = tfidf_vectorizer.transform(cleaned_texts).toarray()

    print("Predicting with models...")
    # DistilBERT: pass the attention mask so the padding added by
    # ``padding=True`` is ignored instead of being attended to.
    bert_logits = distilbert_model(
        bert_inputs["input_ids"],
        attention_mask=bert_inputs["attention_mask"],
    ).logits
    bert_probs = softmax(bert_logits.numpy(), axis=1)  # Convert logits to probabilities

    # CNN predictions
    # NOTE(review): softmax is kept here on the assumption that the saved CNN
    # emits raw scores. If its last layer already applies softmax, drop this
    # call -- TODO confirm against the training notebook.
    cnn_probs = softmax(cnn_model.predict(tfidf_inputs, verbose=0), axis=1)

    # predict_proba already returns probabilities; a second softmax would
    # flatten them and under-weight these models in the average.
    logreg_probs = logreg_model.predict_proba(tfidf_inputs)
    rf_probs = rf_model.predict_proba(tfidf_inputs)
    xgb_probs = xgb_model.predict_proba(tfidf_inputs)

    # Stack the probabilities from each model; shape: (num_samples, num_models, num_classes)
    all_probs = np.stack([bert_probs, cnn_probs, logreg_probs, rf_probs, xgb_probs], axis=1)

    # Average across the model axis, then pick the most probable class
    avg_probs = np.mean(all_probs, axis=1)
    final_classes = np.argmax(avg_probs, axis=1)

    # NOTE(review): assumes every model was trained with class indices
    # 0/1/2 meaning Negative/Neutral/Positive -- verify against training code.
    sentiment_labels = ['Negative', 'Neutral', 'Positive']
    return [sentiment_labels[class_idx] for class_idx in final_classes]

# Function to apply soft voting across all predictions
def soft_voting_batch(texts):
    """Return the soft-voting sentiment labels for one batch of texts.

    Delegates directly to ``get_model_soft_predictions``.
    """
    return get_model_soft_predictions(texts)

# Function to evaluate a random sample of the dataset using soft voting, in batches
def evaluate_dataset_soft_voting(dataset_path, batch_size=32, sample_frac=0.5):
    """Evaluate the soft-voting ensemble on a random sample of a CSV dataset.

    The CSV is read, a reproducible random sample is drawn
    (``random_state=42``), the ``'Translated Post Description'`` column is
    classified in batches with ``soft_voting_batch`` and a classification
    report comparing predictions with the mapped ``'Sentiment'`` column is
    printed.

    Args:
        dataset_path: Path of the CSV file. It must contain the columns
            ``'Translated Post Description'`` and ``'Sentiment'``.
        batch_size: Number of texts sent to the models per call; must be at
            least 1.
        sample_frac: Fraction of rows to evaluate. Defaults to ``0.5``
            (half of the dataset).

    Returns:
        The sampled DataFrame with two added columns:
        ``'Predicted_Sentiment'`` and ``'Actual_Sentiment'``. The latter is
        NaN for rows whose ``'Sentiment'`` value has no mapping.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    print("Loading dataset...")
    df = pd.read_csv(dataset_path)

    # Evaluate only a reproducible random sample of the dataset
    df = df.sample(frac=sample_frac, random_state=42).reset_index(drop=True)

    # Missing descriptions become empty strings so preprocessing never
    # receives NaN.
    texts = df['Translated Post Description'].fillna('').astype(str).tolist()

    # Get final predictions in batches
    all_predictions = []
    for start in range(0, len(texts), batch_size):
        all_predictions.extend(soft_voting_batch(texts[start:start + batch_size]))

    df['Predicted_Sentiment'] = all_predictions  # Assign predictions to the dataframe

    # Map the fine-grained labels onto the three classes the ensemble predicts
    sentiment_mapping = {
        "anger": "Negative",
        "sadness": "Negative",
        "neutral": "Neutral",
        "joy": "Positive"
    }
    df['Actual_Sentiment'] = df['Sentiment'].map(sentiment_mapping)

    # Rows whose label is not in the mapping have no ground truth; keeping
    # them would feed NaN into classification_report.
    has_label = df['Actual_Sentiment'].notna()
    unmapped_count = int((~has_label).sum())
    if unmapped_count:
        print(f"Warning: {unmapped_count} rows have an unmapped 'Sentiment' value and are excluded from the report.")

    class_names = ['Negative', 'Neutral', 'Positive']
    print("\nClassification Report:")
    if has_label.any():
        # Passing labels explicitly keeps target_names aligned even when a
        # class is absent from the sample.
        print(classification_report(
            df.loc[has_label, 'Actual_Sentiment'],
            df.loc[has_label, 'Predicted_Sentiment'],
            labels=class_names,
            target_names=class_names,
        ))
    else:
        print("No rows with a mapped sentiment label; nothing to report.")

    return df

# Evaluate the soft-voting ensemble on the Monkeypox dataset, 32 texts per batch
dataset_path = "/content/drive/MyDrive/ProjectMonkeyPox/Monkeypox Dataset.csv"
evaluated_df = evaluate_dataset_soft_voting(dataset_path=dataset_path, batch_size=32)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading models...


Some layers from the model checkpoint at /content/drive/MyDrive/ProjectMonkeyPox/Saved_DistilBert were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/ProjectMonkeyPox/Saved_DistilBert and are newly initialized: ['dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infer

Loading dataset...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Preparing inputs...
Predicting with models...
Preprocessing text...
Prepari