In [None]:
# Install necessary libraries
# NOTE(review): no versions are pinned, so a later run may resolve to different releases
# than the ones that produced the outputs recorded in this notebook — consider pinning.
# NOTE(review): imbalanced-learn is installed here but is not imported in the cells
# visible in this chunk; confirm it is still needed.
!pip install transformers tensorflow scikit-learn imbalanced-learn

# Import libraries
import pandas as pd
import numpy as np
import re
import nltk
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report
from google.colab import drive

# Report GPU availability
# This only prints how many GPU devices TensorFlow can see; it does not change placement.
print("Num GPUs Available:", len(tf.config.list_physical_devices("GPU")))

# Download necessary NLTK resources
# nltk.download() signals failure by returning False rather than raising, so check every
# result here. Otherwise a failed download would only surface later, as a LookupError
# raised from the tokenizer / stop-word / lemmatizer calls.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    if not nltk.download(resource):
        raise RuntimeError(f"Could not download NLTK resource {resource!r}")

Num GPUs Available: 1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Load dataset
# The CSV lives on Google Drive, so Drive has to be mounted before it can be read.
# Mounting here (instead of through the Colab sidebar) lets the notebook run top to
# bottom from a fresh kernel.
drive.mount("/content/drive")
df = pd.read_csv('/content/drive/MyDrive/ProjectMonkeyPox/Monkeypox Dataset.csv')

# Clean the text data
def clean_text(text):
    """Normalize one post description into lowercase, letters-and-whitespace text.

    Args:
        text: The raw post text. Missing values (``None`` / NaN, which pandas produces
            for empty CSV fields) are accepted and treated as an empty post; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: ``text`` with URLs and @mentions removed, ``#`` symbols stripped, every
        character that is neither an ASCII letter nor whitespace deleted, and the
        remainder lowercased. Whitespace is left as-is, so removed tokens can leave
        consecutive spaces behind.
    """
    if not isinstance(text, str):
        # re.sub() raises TypeError on non-strings, so handle missing values up front.
        if text is None or pd.isna(text):
            return ""
        text = str(text)

    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r"#", "", text)  # Strip the '#' symbol only; the tag word itself is kept
    # Remove special characters and numbers. They are deleted, not replaced by a space,
    # so words separated only by punctuation end up joined together.
    text = re.sub(r"[^A-Za-z\s]", "", text)
    return text.lower()

# Run the regex cleaner over every translated post description
df["Cleaned_Text"] = df["Translated Post Description"].map(clean_text)

# NLTK building blocks for the tokenization / lemmatization step
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# English stop words, held in a set for the membership test in preprocess_text
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenize ``text``, drop stop words and lemmatize the remaining tokens.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.

    Args:
        text: The string to tokenize with ``word_tokenize``.

    Returns:
        str: The lemmatized, non-stop-word tokens joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return " ".join(kept_tokens)

# Tokenize, drop stop words and lemmatize every cleaned post
df["Processed_Text"] = df["Cleaned_Text"].apply(preprocess_text)

# Map sentiments to categories
sentiment_mapping = {
    "anger": "Negative",
    "sadness": "Negative",
    "neutral": "Neutral",
    "joy": "Positive"
}
df["Merged_Sentiment"] = df["Sentiment"].map(sentiment_mapping)

# Series.map() yields NaN for any label that is not a key of sentiment_mapping. Left in
# place, those NaNs would be fed to LabelEncoder and the stratified split alongside the
# three real categories, so report them and drop the rows before encoding.
unmapped_rows = df["Merged_Sentiment"].isna()
if unmapped_rows.any():
    print(
        f"Dropping {int(unmapped_rows.sum())} rows with unmapped sentiment labels:",
        sorted(df.loc[unmapped_rows, "Sentiment"].dropna().astype(str).unique()),
    )
    # .copy() so the column assignment below writes to an independent frame
    df = df.loc[~unmapped_rows].copy()

# Encode sentiment labels
label_encoder = LabelEncoder()
df["Sentiment_Encoded"] = label_encoder.fit_transform(df["Merged_Sentiment"])

# Split data into train and test sets (80/20, stratified on the encoded label)
X_train, X_test, y_train, y_test = train_test_split(
    df["Processed_Text"],
    df["Sentiment_Encoded"],
    test_size=0.2,
    stratify=df["Sentiment_Encoded"],
    random_state=42,
)

In [None]:
# Load the pretrained uncased DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Encode both splits with identical settings: truncation on, padding on,
# a 128-token cap, and TensorFlow tensors as the return type
train_encodings = tokenizer(
    X_train.tolist(),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="tf",
)
val_encodings = tokenizer(
    X_test.tolist(),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="tf",
)

# Pair each encoding dict with its integer labels as a tf.data pipeline
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), y_test))
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))

# Seed TensorFlow's global RNG before building the model so training runs are more
# repeatable (the train/test split already uses random_state=42).
tf.random.set_seed(42)

# Initialize model
# Size the classification head from the fitted encoder instead of a literal 3, so the
# head always matches the label ids that label_encoder produced.
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_encoder.classes_),
)

# Set up optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# from_logits=True: the loss is given the model's raw outputs, not probabilities
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile the model
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model
# NOTE(review): validation_data is built from the same X_test / y_test split that is
# used for the final classification report, so there is no separate held-out set.
model.fit(
    train_dataset.shuffle(1000).batch(8),
    epochs=5,
    validation_data=val_dataset.batch(8)
)

# Persist the tokenizer files and the fine-tuned weights into one directory
save_directory = "/content/drive/MyDrive/ProjectMonkeyPox/Saved_DistilBert"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

# Reload both artifacts from that directory so inference uses what was written to disk
model_fine_tuned = TFDistilBertForSequenceClassification.from_pretrained(save_directory)
tokenizer_fine_tuned = DistilBertTokenizer.from_pretrained(save_directory)

# Generate predictions on test set
# Texts are scored in batches rather than one forward pass per example, and truncated
# with the same max_length=128 that the training encodings use, so evaluation sees
# inputs of the same maximum length the model was fine-tuned on.
y_true = list(y_test)
y_pred = []

test_texts = list(X_test)
eval_batch_size = 32

for start in range(0, len(test_texts), eval_batch_size):
    predict_input = tokenizer_fine_tuned(
        test_texts[start:start + eval_batch_size],
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='tf'
    )
    # Index 0 of the model output is the logits tensor, one row per text in the batch
    output = model_fine_tuned(predict_input)[0]
    y_pred.extend(tf.argmax(output, axis=1).numpy().tolist())

# Print classification report
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Some layers from the model checkpoint at /content/drive/MyDrive/ProjectMonkeyPox/Saved_DistilBert were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/ProjectMonkeyPox/Saved_DistilBert and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infer

Classification Report:
              precision    recall  f1-score   support

    Negative       0.84      0.78      0.81      1038
     Neutral       0.83      0.86      0.84      1200
    Positive       0.84      0.86      0.85      1045

    accuracy                           0.84      3283
   macro avg       0.84      0.83      0.83      3283
weighted avg       0.84      0.84      0.84      3283

