In [None]:
!pip install imblearn transformers nltk scikit-learn xgboost tensorflow

import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, GlobalMaxPooling1D, Conv1D
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import joblib
import nltk
# Fetch the NLTK resources used below: tokenizer models (punkt / punkt_tab),
# the stop-word lists, and WordNet plus its multilingual index for lemmatization.
for _nltk_resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(_nltk_resource)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Read the Monkeypox posts CSV from the mounted Google Drive folder
dataset_path = '/content/drive/MyDrive/ProjectMonkeyPox/Monkeypox Dataset.csv'
df = pd.read_csv(dataset_path)

# Quick look at the first rows to confirm the expected columns are present
print(df.head())

# Class balance of the raw sentiment labels, before any merging
sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)

# Clean text function
def clean_text(text):
    """Strip social-media noise from one post and lowercase it.

    Removes URLs, @mentions and '#' characters (the hashtag word itself is
    kept), then deletes every character that is not an ASCII letter or
    whitespace, so digits, punctuation and non-ASCII symbols are dropped.

    Args:
        text: Raw post text. Non-string values (e.g. the float NaN pandas
            uses for an empty CSV cell, or None) are treated as missing.

    Returns:
        The cleaned, lowercased string; "" when the input is missing.
    """
    # re.sub raises TypeError on NaN/None, which would abort the whole
    # Series.apply on the first empty description.
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r"#", "", text)  # Drop the '#' sign but keep the tag text
    text = re.sub(r"[^A-Za-z\s]", "", text)  # Keep only ASCII letters and whitespace
    return text.lower()

# Shared resources for the token-level pass in preprocess_text:
# the English stop-word set and a WordNet lemmatizer.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Character-level cleanup of the translated post text
df["Cleaned_Text"] = df["Translated Post Description"].apply(clean_text)

def preprocess_text(text):
    """Tokenize a cleaned post, drop stop words and lemmatize what remains.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.

    Args:
        text: Already-cleaned, lowercased post text.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lemmas = []
    for token in nltk.word_tokenize(text):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

# Token-level pass: stop-word removal and lemmatization
df["Processed_Text"] = df["Cleaned_Text"].apply(preprocess_text)

# Collapse the raw emotion labels into three polarity classes.
# Labels missing from this mapping become NaN after .map().
sentiment_mapping = {
    "anger": "Negative",
    "sadness": "Negative",
    "neutral": "Neutral",
    "joy": "Positive"
}
df["Merged_Sentiment"] = df["Sentiment"].map(sentiment_mapping)

# Integer-encode the merged labels and show which class each integer stands for
label_encoder = LabelEncoder()
df["Sentiment_Encoded"] = label_encoder.fit_transform(df["Merged_Sentiment"])
print(label_encoder.classes_)

# Stratified 80/20 hold-out split with a fixed seed
text_features = df["Processed_Text"]
encoded_labels = df["Sentiment_Encoded"]
X_train, X_test, y_train, y_test = train_test_split(
    text_features,
    encoded_labels,
    test_size=0.2,
    stratify=encoded_labels,
    random_state=42,
)

       Post ID                                   Post description        Date  \
0  CgXDOaQDvGm  “I have decided that the global #monkeypox out...  07/23/2022   
1  CgXpRmMIdzG  In light of the evolving monkeypox outbreak wi...  07/23/2022   
2  CgXaFGDsevq  If you've been hearing about monkeypox and wan...  07/23/2022   
3  CgXGNrmLwoL  Monkeypox is a rare disease caused by infectio...  07/23/2022   
4  CgXTqcjOQD-  For today's @newyorkermag dispatch. \n'The Ago...  07/23/2022   

  Language                        Translated Post Description Sentiment  \
0  English  “I have decided that the global #monkeypox out...   neutral   
1  English  In light of the evolving monkeypox outbreak wi...   neutral   
2  English  If you've been hearing about monkeypox and wan...   neutral   
3  English  Monkeypox is a rare disease caused by infectio...   neutral   
4  English  For today's @newyorkermag dispatch. \n'The Ago...   sadness   

       Hate           Stress or Anxiety  
0  Not Hate     Stre

In [None]:
# Bag-of-n-grams features (unigrams to trigrams), vocabulary capped at 15,000 entries.
# The vocabulary is learned from the training split only and reused for the test split.
vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=15000)
X_train_ngram = vectorizer.fit_transform(X_train).toarray()
X_test_ngram = vectorizer.transform(X_test).toarray()

# Oversample the training split with SMOTE to balance the classes.
# NOTE(review): this rebinds y_train to the resampled labels, so re-running this
# cell alone would pair the original-size feature matrix with already-resampled
# labels; re-run the split cell first.
smote = SMOTE(random_state=42)
X_train_ngram, y_train = smote.fit_resample(X_train_ngram, y_train)

# Candidate classifiers (later combined into an ensemble)
log_reg = LogisticRegression(max_iter=500, class_weight="balanced", solver="liblinear")
nb = MultinomialNB()
rf = RandomForestClassifier(n_estimators=500, class_weight="balanced", random_state=42)
xgb = XGBClassifier(n_estimators=500, random_state=42)

# Train every classifier on the resampled training matrix
for _clf in (log_reg, nb, rf, xgb):
    _clf.fit(X_train_ngram, y_train)

# Hold-out predictions
y_pred_log = log_reg.predict(X_test_ngram)
y_pred_nb = nb.predict(X_test_ngram)
y_pred_rf = rf.predict(X_test_ngram)
y_pred_xgb = xgb.predict(X_test_ngram)

# Per-class precision / recall / F1 for each model on the untouched test split
for _title, _pred in (
    ("Logistic Regression", y_pred_log),
    ("Naive Bayes", y_pred_nb),
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
):
    print(_title)
    print(classification_report(y_test, _pred))

Logistic Regression
              precision    recall  f1-score   support

           0       0.75      0.75      0.75      1038
           1       0.80      0.83      0.82      1200
           2       0.83      0.80      0.81      1045

    accuracy                           0.80      3283
   macro avg       0.80      0.79      0.79      3283
weighted avg       0.80      0.80      0.80      3283

Naive Bayes
              precision    recall  f1-score   support

           0       0.79      0.33      0.47      1038
           1       0.54      0.95      0.69      1200
           2       0.73      0.51      0.60      1045

    accuracy                           0.62      3283
   macro avg       0.69      0.60      0.59      3283
weighted avg       0.68      0.62      0.59      3283

Random Forest
              precision    recall  f1-score   support

           0       0.74      0.73      0.73      1038
           1       0.80      0.79      0.80      1200
           2       0.79      

In [None]:
import os

# Make sure the Drive folder for the fitted artefacts exists
os.makedirs('/content/drive/MyDrive/ProjectMonkeyPox/Saved_TraditionalModels', exist_ok=True)

# Persist the four fitted classifiers
for _artifact, _target in (
    (log_reg, '/content/drive/MyDrive/ProjectMonkeyPox/Saved_TraditionalModels/log_reg.pkl'),
    (nb, '/content/drive/MyDrive/ProjectMonkeyPox/Saved_TraditionalModels/nb.pkl'),
    (rf, '/content/drive/MyDrive/ProjectMonkeyPox/Saved_TraditionalModels/rf.pkl'),
    (xgb, '/content/drive/MyDrive/ProjectMonkeyPox/Saved_TraditionalModels/xgb.pkl'),
):
    joblib.dump(_artifact, _target)

# The fitted vectorizer must be saved too: the models only understand its vocabulary.
# Left as the cell's final expression so the written path is still echoed.
joblib.dump(vectorizer, '/content/drive/MyDrive/ProjectMonkeyPox/Saved_TraditionalModels/vectorizer.pkl')

['/content/drive/MyDrive/ProjectMonkeyPox/TraditionalModels/vectorizer.pkl']

Ensemble Learning: soft voting over the saved traditional models

In [None]:
import joblib
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
import pandas as pd # Import pandas here
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Folder holding the artefacts written by the training section
model_path = "/content/drive/MyDrive/ProjectMonkeyPox/Saved_TraditionalModels/"

# Restore the fitted classifiers and the fitted vectorizer, in that order.
# joblib.load unpickles arbitrary objects, so only point it at files you created yourself.
log_reg, nb, rf, xgb, vectorizer = (
    joblib.load(model_path + filename)
    for filename in ("log_reg.pkl", "nb.pkl", "rf.pkl", "xgb.pkl", "vectorizer.pkl")
)

In [None]:
# Re-read the raw CSV so this section can run without the training cells
df = pd.read_csv('/content/drive/MyDrive/ProjectMonkeyPox/Monkeypox Dataset.csv')

import nltk

# NLTK resources needed by the preprocessing functions below
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Ensure preprocessing is applied
def clean_text(text):
    """Strip social-media noise from one post and lowercase it.

    Removes URLs, @mentions and '#' characters (the hashtag word itself is
    kept), then deletes every character that is not an ASCII letter or
    whitespace, so digits, punctuation and non-ASCII symbols are dropped.

    Args:
        text: Raw post text. Non-string values (e.g. the float NaN pandas
            uses for an empty CSV cell, or None) are treated as missing.

    Returns:
        The cleaned, lowercased string; "" when the input is missing.
    """
    # re.sub raises TypeError on NaN/None, which would abort the whole
    # Series.apply on the first empty description.
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r"#", "", text)  # Drop the '#' sign but keep the tag text
    text = re.sub(r"[^A-Za-z\s]", "", text)  # Keep only ASCII letters and whitespace
    return text.lower()

# Shared resources for the token-level pass in preprocess_text:
# the English stop-word set and a WordNet lemmatizer.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Character-level cleanup of the translated post text
df["Cleaned_Text"] = df["Translated Post Description"].apply(clean_text)

def preprocess_text(text):
    """Tokenize a cleaned post, drop stop words and lemmatize what remains.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.

    Args:
        text: Already-cleaned, lowercased post text.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lemmas = []
    for token in nltk.word_tokenize(text):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Token-level pass: stop-word removal and lemmatization
df["Processed_Text"] = df["Cleaned_Text"].apply(preprocess_text)

# Collapse the raw emotion labels into three polarity classes.
# Labels missing from this mapping become NaN after .map().
sentiment_mapping = {
    "anger": "Negative",
    "sadness": "Negative",
    "neutral": "Neutral",
    "joy": "Positive"
}
df["Merged_Sentiment"] = df["Sentiment"].map(sentiment_mapping)

# Integer-encode the merged labels
label_encoder = LabelEncoder()
df["Sentiment_Encoded"] = label_encoder.fit_transform(df["Merged_Sentiment"])

# Same split arguments (test_size, stratify, random_state) as the training section,
# intended to reproduce its train/test partition.
text_features = df["Processed_Text"]
encoded_labels = df["Sentiment_Encoded"]
X_train, X_test, y_train, y_test = train_test_split(
    text_features,
    encoded_labels,
    test_size=0.2,
    stratify=encoded_labels,
    random_state=42,
)

# Vectorize with the *loaded* vectorizer (transform only, no refit) so the feature
# columns line up with what the saved models were trained on.
X_train_ngram, X_test_ngram = (
    vectorizer.transform(split_text).toarray() for split_text in (X_train, X_test)
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Soft-voting ensemble over the already-fitted estimators loaded from disk.
# The VotingClassifier is used purely as a named container: it is never fitted
# here, so the saved models are used as-is and the vote is computed by hand below.
ensemble = VotingClassifier(
    estimators=[
        ('log_reg', log_reg),
        ('nb', nb),
        ('rf', rf),
        ('xgb', xgb)
    ], voting='soft'
)

# predict_proba columns follow each estimator's own classes_ order, so averaging
# them is only meaningful when every estimator reports the same classes.
class_labels = np.asarray(log_reg.classes_)
for est_name, est in ensemble.estimators:
    if not np.array_equal(np.asarray(est.classes_), class_labels):
        raise ValueError(
            f"Estimator '{est_name}' has classes {est.classes_}, expected {class_labels}"
        )

# Unweighted mean of the per-class probabilities across the estimators
avg_proba = np.mean(
    np.stack([est.predict_proba(X_test_ngram) for _, est in ensemble.estimators], axis=0),
    axis=0
)

# Translate the winning column index back to its class label instead of assuming
# the labels happen to be 0..k-1.
y_pred_ensemble = class_labels[avg_proba.argmax(axis=1)]

# Evaluate ensemble model
print("Ensemble Voting")
print(classification_report(y_test, y_pred_ensemble))

Ensemble Voting
              precision    recall  f1-score   support

           0       0.83      0.68      0.75      1038
           1       0.75      0.92      0.82      1200
           2       0.85      0.79      0.82      1045

    accuracy                           0.80      3283
   macro avg       0.81      0.79      0.80      3283
weighted avg       0.81      0.80      0.80      3283

