In [None]:
!pip install imblearn transformers nltk scikit-learn xgboost tensorflow

import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os

# Fetch the NLTK resources this notebook relies on (tokenizer models,
# stop-word list, WordNet data); each call is a no-op when already present.
for nltk_resource in ("punkt", "stopwords", "wordnet", 'omw-1.4', 'punkt_tab'):
    nltk.download(nltk_resource)

# Read the labelled posts from Google Drive into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/ProjectMonkeyPox/Monkeypox Dataset.csv')

# Peek at the first rows to confirm the columns loaded as expected
print(df.head())

# Class balance of the raw (un-merged) sentiment labels
sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


       Post ID                                   Post description        Date  \
0  CgXDOaQDvGm  “I have decided that the global #monkeypox out...  07/23/2022   
1  CgXpRmMIdzG  In light of the evolving monkeypox outbreak wi...  07/23/2022   
2  CgXaFGDsevq  If you've been hearing about monkeypox and wan...  07/23/2022   
3  CgXGNrmLwoL  Monkeypox is a rare disease caused by infectio...  07/23/2022   
4  CgXTqcjOQD-  For today's @newyorkermag dispatch. \n'The Ago...  07/23/2022   

  Language                        Translated Post Description Sentiment  \
0  English  “I have decided that the global #monkeypox out...   neutral   
1  English  In light of the evolving monkeypox outbreak wi...   neutral   
2  English  If you've been hearing about monkeypox and wan...   neutral   
3  English  Monkeypox is a rare disease caused by infectio...   neutral   
4  English  For today's @newyorkermag dispatch. \n'The Ago...   sadness   

       Hate           Stress or Anxiety  
0  Not Hate     Stre

In [None]:
# Clean and preprocess text
def clean_text(text):
    """Normalize a raw post description to lowercase letters and whitespace.

    Steps, in order: strip URLs, strip @mentions, strip the '#' symbol (the
    hashtag word itself is kept), strip every character that is not an ASCII
    letter or whitespace, then lowercase.

    Args:
        text: Raw text. Non-string values (e.g. None, or the float NaN pandas
            uses for empty CSV cells) are treated as empty text.

    Returns:
        The cleaned, lowercased string; "" for non-string input.
    """
    # Empty CSV cells arrive as float NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ""
    # `http\S+` already covers https links, so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r"#", "", text)  # Remove the '#' symbol only; keep the tag word
    text = re.sub(r"[^A-Za-z\s]", "", text)  # Remove special characters and numbers
    return text.lower()

# NLTK helpers shared by the token-level preprocessing step:
# an English stop-word set for filtering and a WordNet lemmatizer.
stop_words = set(nltk.corpus.stopwords.words("english"))
lemmatizer = nltk.WordNetLemmatizer()

# Character-level cleanup of the translated post text.
df["Cleaned_Text"] = df["Translated Post Description"].apply(clean_text)

def preprocess_text(text):
    """Tokenize `text`, drop stop words, lemmatize the rest, and re-join.

    Relies on the module-level `stop_words` set and `lemmatizer` instance.

    Args:
        text: Already-cleaned text to tokenize.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    kept_lemmas = []
    for token in nltk.word_tokenize(text):
        if token in stop_words:
            continue  # filtered before lemmatization
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(kept_lemmas)

# Token-level preprocessing (stop-word removal + lemmatization) of the cleaned text.
df["Processed_Text"] = df["Cleaned_Text"].apply(preprocess_text)

# Collapse the fine-grained emotion labels into three polarity classes.
sentiment_mapping = {
    "anger": "Negative",
    "sadness": "Negative",
    "neutral": "Neutral",
    "joy": "Positive"
}

df["Merged_Sentiment"] = df["Sentiment"].map(sentiment_mapping)

# Series.map yields NaN for any label that is not a key of the mapping. Fail
# fast with the offending labels instead of letting NaN reach the encoder,
# where it would surface as a spurious class or an obscure error.
unmapped_mask = df["Merged_Sentiment"].isna()
if unmapped_mask.any():
    unmapped_labels = sorted(df.loc[unmapped_mask, "Sentiment"].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_mask.sum())} rows have a Sentiment value with no entry in "
        f"sentiment_mapping: {unmapped_labels}"
    )

# Integer-encode the merged classes; label_encoder.classes_ keeps the names.
label_encoder = LabelEncoder()
df["Sentiment_Encoded"] = label_encoder.fit_transform(df["Merged_Sentiment"])

# Prepare train-test split (80/20, stratified on the encoded label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    df["Processed_Text"], df["Sentiment_Encoded"], test_size=0.2, stratify=df["Sentiment_Encoded"], random_state=42
)

In [None]:
# Tokenization and Padding for CNN
max_words = 15000  # cap on the tokenizer vocabulary
max_len = 300      # every sequence is padded/truncated to this many tokens

# Build the vocabulary from the training texts only; unseen words map to <OOV>.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Texts -> lists of integer word ids
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad and truncate at the end of each sequence so all rows have length max_len.
pad_options = {"maxlen": max_len, "padding": "post", "truncating": "post"}
X_train_padded = pad_sequences(X_train_seq, **pad_options)
X_test_padded = pad_sequences(X_test_seq, **pad_options)

# CNN Model
embedding_dim = 128

# Define model directory on Google Drive
model_dir = '/content/drive/MyDrive/ProjectMonkeyPox/Saved_CNN'
os.makedirs(model_dir, exist_ok=True)

cnn_model_path = os.path.join(model_dir, "cnn_model.h5")

# Reuse a previously saved model when one exists; otherwise train and cache it.
# NOTE(review): the cache is keyed only by file path, so a saved model is
# reused even if the data, tokenizer settings or architecture have changed
# since it was written — delete the file to force retraining.
if os.path.exists(cnn_model_path):
    print("Loading pre-trained CNN model...")
    model = load_model(cnn_model_path)
else:
    print("Training CNN model...")

    # Seed the Python, NumPy and TensorFlow RNGs so weight initialisation,
    # dropout and batch shuffling are repeatable across runs.
    tf.keras.utils.set_random_seed(42)

    # Hold out a stratified slice of the TRAINING data for early stopping.
    # Validating on the test set would let model selection see the test data
    # and make the final report optimistic.
    y_train_array = np.asarray(y_train)
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train_padded,
        y_train_array,
        test_size=0.1,
        stratify=y_train_array,
        random_state=42,
    )

    model = Sequential([
        Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
        Conv1D(filters=128, kernel_size=5, activation="relu"),
        GlobalMaxPooling1D(),
        Dense(64, activation="relu"),
        Dropout(0.5),
        Dense(len(label_encoder.classes_), activation="softmax")  # Output layer for classification
    ])

    # Compile Model (integer class ids -> sparse categorical cross-entropy)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

    # Train Model: stop once val_loss has not improved for 3 epochs and roll
    # back to the best weights seen.
    early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
    model.fit(
        X_fit,
        y_fit,
        validation_data=(X_val, y_val),
        epochs=20,
        batch_size=32,
        callbacks=[early_stopping],
    )

    # Save the trained model
    model.save(cnn_model_path)
    print(f"CNN model saved to {cnn_model_path}")

from sklearn.metrics import classification_report

# Predictions: per-class probabilities from the softmax output, then the
# most likely class id for each test row.
y_prob_cnn = model.predict(X_test_padded)
y_pred_cnn = np.argmax(y_prob_cnn, axis=1)

# Classification Report, labelled with the original class names instead of
# the opaque integer ids produced by the label encoder.
class_names = [str(name) for name in label_encoder.classes_]
print("CNN Classification Report")
print(classification_report(
    y_test,
    y_pred_cnn,
    labels=np.arange(len(class_names)),  # keeps rows aligned with class_names
    target_names=class_names,
))

Training CNN model...




Epoch 1/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 18ms/step - accuracy: 0.5570 - loss: 0.8970 - val_accuracy: 0.7810 - val_loss: 0.5318
Epoch 2/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8614 - loss: 0.3796 - val_accuracy: 0.8099 - val_loss: 0.4892
Epoch 3/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9509 - loss: 0.1625 - val_accuracy: 0.8145 - val_loss: 0.5648
Epoch 4/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9754 - loss: 0.0881 - val_accuracy: 0.8203 - val_loss: 0.6537
Epoch 5/20
[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9903 - loss: 0.0425 - val_accuracy: 0.8185 - val_loss: 0.7456




CNN model saved to /content/drive/MyDrive/ProjectMonkeyPox/Saved_CNN/cnn_model.h5
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
CNN Classification Report
              precision    recall  f1-score   support

           0       0.76      0.78      0.77      1038
           1       0.82      0.85      0.83      1200
           2       0.85      0.80      0.82      1045

    accuracy                           0.81      3283
   macro avg       0.81      0.81      0.81      3283
weighted avg       0.81      0.81      0.81      3283

