# 📦 Importing all the required libraries

In [None]:
!pip install transformers datasets scikit-learn joblib --quiet

import pandas as pd
import numpy as np
import joblib
import os
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

# ✅ Read the cleaned emotion CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="data/cleaned_emotion_data.csv")

## 🛠️ Manual addition of training data (to improve prediction accuracy)

In [None]:
# ✅ Add new hand-written examples using label indexes.
# "cleaned_text" and "label" are paired purely by position, so the i-th
# sentence gets the i-th label: keep the two lists the same length and in the
# same group order (anger, fear, sadness, love) when editing.
# The indexes used here (0 = sadness, 2 = love, 3 = anger, 4 = fear) agree
# with the `index_to_emotion` mapping used in the sample-run cell.
# NOTE(review): assumes the CSV's "label" column uses the same indexes — confirm.
additional_data = pd.DataFrame({
    "cleaned_text": [
        # Anger (3)
        "I will ruin you",
        "You won't escape this",
        "I'm not happy about this",
        "I hate you",
        "I dislike you",
        "You're making me furious",
        "I can't stand this anymore",
        "You make me so angry",
        "This makes my blood boil",
        "You're the worst person I’ve met",
        "I absolutely hate this",
        "I'm holding a grudge I can't let go",
        "Every time I see you, I boil inside",
        "I'm frustrated beyond words",
        "I’m deeply disappointed in you",
        "Your behavior is unacceptable",

        # Fear (4)
        "I'm scared of what's coming",
        "I can't sleep, I'm terrified",
        "What if I fail?",
        "I don’t feel safe anymore",
        "I keep worrying about the future",
        "I'm anxious something bad might happen",
        "This gives me chills",
        "I’m scared to even speak up",
        "I dread what comes next",
        "I feel paralyzed by fear",

        # Sadness (0)
        "You betrayed me",
        "This makes me feel awful",
        "This isn't working out",
        "I feel so empty",
        "I just want to cry",
        "Nothing feels worth it anymore",
        "It hurts to even remember",
        "I feel like I’ve lost everything",
        "I don’t think I’ll ever feel better",
        "Why does everything always fall apart?",

        # Love (2)
        "I love spending time with my family",
        "I adore my partner so much",
        "I feel a deep affection for my friends",
        "This is something I truly cherish",
        "My heart is full of love today",
        "I have a strong emotional bond with my dog",
        "I admire the way she smiles",
        "I feel romantic towards him",
        "I have genuine love for this city",
        "I appreciate all that you do",
        "You mean so much to me",
        "I feel connected to everything around me",
        "He's the love of my life",
        "I miss the way we used to talk every day",
        "I feel warmth whenever I think of you",
        "I can’t stop thinking about her",
        "I care for him deeply",
        "I treasure our moments together",
        "I feel so lucky to have you in my life",
        "This brings me so much joy and affection",
        "I feel emotionally attached to this place",
        "I hold him close to my heart",
        "I feel so loved and appreciated",
        "I feel romantic when we dance together",
        "This gift means the world to me",
        "I feel butterflies when I see him",
        "You’re everything to me",
        "I want to hold you forever",
        "I’m grateful to be loved by you",
        "I cherish the way you laugh"
    ],
    # One label per sentence above, in the same order (16 + 10 + 10 + 30 = 66).
    "label": [
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3,  # anger → total 16
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,     # fear → total 10
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     # sadness → total 10
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2      # love → total 30
    ]
})



# Append the manual examples after the CSV rows and renumber the index so it
# stays contiguous.
df = pd.concat([df, additional_data], ignore_index=True)

## 📂 Loading the cleaned-text and label fields

In [None]:
# Pull the two training columns out of the combined DataFrame as plain lists;
# the text column is cast to str before conversion.
labels = df["label"].tolist()
texts = df["cleaned_text"].astype(str).tolist()

## 🏷️ Label encoding

In [None]:
# Make sure the output folder exists before anything is written into it.
os.makedirs(name="src", exist_ok=True)

# Learn the label -> index mapping and encode every label in one pass.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Persist the fitted encoder so the same mapping can be restored later.
joblib.dump(value=label_encoder, filename="src/label_encoder.pkl")

# 🧩 Tokenizing the data

In [None]:
# Wrap the texts and encoded labels in a Hugging Face Dataset and hold out
# 20% of the rows for validation. A fixed seed makes the shuffle-and-split
# reproducible, so the held-out set (and therefore the reported metrics)
# is the same on every run.
dataset = Dataset.from_dict({"text": texts, "label": encoded_labels})
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Tokenizer matching the DistilBERT checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    """Tokenize the ``"text"`` entries of *batch* with the module-level tokenizer.

    Sequences are truncated and padded to ``max_length=128`` so every
    encoded example has the same length.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encode_options)

# Run the tokenizer over both splits, passing whole batches to `tokenize`.
tokenized_datasets = dataset.map(function=tokenize, batched=True)

## ➡️ Converting to TensorFlow datasets

In [None]:
def to_tf_dataset(dataset):
    """Build a ``tf.data.Dataset`` of ``(features, label)`` pairs from *dataset*.

    The ``"input_ids"`` and ``"attention_mask"`` columns form the feature
    dict and the ``"label"`` column is the target. Each column is converted
    to a NumPy array before being sliced into individual examples.
    """
    feature_columns = ("input_ids", "attention_mask")
    features = {column: np.array(dataset[column]) for column in feature_columns}
    targets = np.array(dataset["label"])
    return tf.data.Dataset.from_tensor_slices((features, targets))

# Shuffle with a buffer that spans the whole training split: tf.data only
# shuffles uniformly when the buffer is at least as large as the dataset, so
# a fixed 1000-element buffer would mostly mix neighbouring examples.
# max(..., 1) keeps the buffer size valid even for an empty split.
train_split = tokenized_datasets["train"]
train_dataset = (
    to_tf_dataset(train_split)
    .shuffle(max(len(train_split), 1))
    .batch(16)
)
# The validation pipeline is deliberately left unshuffled so predictions made
# from it stay in the same order as the labels of the "test" split.
val_dataset = to_tf_dataset(tokenized_datasets["test"]).batch(32)

# 🤖 Compiling and training the model 

In [None]:
# One output logit per class the label encoder has seen.
num_classes = len(label_encoder.classes_)
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_classes
)

# Labels are integer class ids and the model's outputs are read as logits,
# hence the sparse cross-entropy with from_logits=True.
adam = tf.keras.optimizers.Adam(learning_rate=5e-5)
cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=cross_entropy, metrics=["accuracy"])

# ✅ Train for 7 epochs, evaluating on the held-out split during training
model.fit(train_dataset, validation_data=val_dataset, epochs=7)

## 💾 Saving the model and tokenizer

In [None]:
# Write the trained model and the tokenizer files into the same folder so
# both can later be reloaded from a single path.
save_dir = "bert_emotion_model"
tokenizer.save_pretrained(save_dir)
model.save_pretrained(save_dir)

print("✅ Model and tokenizer saved to bert_emotion_model/")


# 🧪 Prediction and evaluation

In [None]:
# Ground-truth labels come straight from the "test" split; val_dataset is
# built from the same split without shuffling, so rows line up one-to-one.
y_true = np.array(tokenized_datasets["test"]["label"])
pred_logits = model.predict(val_dataset).logits
y_pred = np.argmax(pred_logits, axis=1)

# Report over every class the encoder knows, not just those that happen to
# appear in y_true. Without an explicit `labels=` list, scikit-learn infers
# the classes from y_true and y_pred together, which can disagree with the
# number of names / tick labels and either raise or mislabel the plots.
class_ids = np.arange(len(label_encoder.classes_))
target_names = [str(label) for label in label_encoder.classes_]

# zero_division=0 keeps the default score (0) for classes that were never
# predicted, but without emitting an UndefinedMetricWarning for each one.
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)

print(f"\n📊 Final Evaluation:\nAccuracy: {acc:.4f}\nF1 Score: {f1:.4f}\nPrecision: {precision:.4f}\nRecall: {recall:.4f}")

print("\n📌 Per-Emotion Classification Report:\n")
print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=target_names,
    zero_division=0,
))

# Fixing `labels` guarantees a (n_classes x n_classes) matrix whose rows and
# columns match the tick labels below.
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# ▶️ Sample run inside the notebook

In [None]:
from transformers import pipeline, TFDistilBertForSequenceClassification, DistilBertTokenizerFast
import joblib
import tensorflow as tf

# ✅ Reload the saved tokenizer and model from the local folder only
# (local_files_only=True prevents any lookup on the Hugging Face Hub).
model_path = "bert_emotion_model"
load_kwargs = {"local_files_only": True}
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path, **load_kwargs)
model = TFDistilBertForSequenceClassification.from_pretrained(model_path, **load_kwargs)

# ✅ Fixed mapping from predicted class index to emotion name.
# The position of each name in the list is its class index (0-5).
index_to_emotion = dict(enumerate(
    ["sadness", "joy", "love", "anger", "fear", "surprise"]
))

# ✅ Define a function to predict emotion
def predict_emotion(text):
    """Return the emotion name the loaded model predicts for *text*.

    The text is tokenized into TensorFlow tensors (truncated to at most 128
    tokens), passed through the model, and the index of the largest logit
    is looked up in ``index_to_emotion``.
    """
    encoded = tokenizer(
        text,
        return_tensors="tf",
        padding=True,
        truncation=True,
        max_length=128,
    )
    class_scores = model(encoded).logits
    # A single text yields a batch of one, so take the first (only) argmax.
    best_index = tf.argmax(class_scores, axis=1).numpy()[0]
    return index_to_emotion[best_index]

# ✅ Try with user input
while True:
    # Strip surrounding whitespace so inputs such as "exit " or " Exit" still
    # end the session instead of being classified.
    user_input = input("Enter your text (or type 'exit' to stop): ").strip()
    if user_input.lower() == "exit":
        break
    if not user_input:
        # Nothing to classify; ask again rather than predicting on "".
        continue
    emotion = predict_emotion(user_input)
    print(f"🔍 Predicted Emotion: {emotion}\n")





# 💾 Saving the model, tokenizer, and label encoder for download

In [None]:
import os
import joblib

# Make sure both target directories exist. joblib.dump does not create
# missing parent folders, so /mnt/data/src has to be created explicitly or
# saving the label encoder fails with FileNotFoundError.
os.makedirs("/mnt/data/bert_emotion_model", exist_ok=True)
os.makedirs("/mnt/data/src", exist_ok=True)

# Save model and tokenizer to /mnt/data so they can be downloaded
model.save_pretrained("/mnt/data/bert_emotion_model")
tokenizer.save_pretrained("/mnt/data/bert_emotion_model")

# Save label encoder as well
joblib.dump(label_encoder, "/mnt/data/src/label_encoder.pkl")

# Report the directory each artifact was actually written to.
print("✅ Model and tokenizer saved to /mnt/data/bert_emotion_model/, label encoder saved to /mnt/data/src/")


In [None]:
# Zip the model folder (IPython shell magic; the archive is written to the
# current working directory as bert_emotion_model.zip).
# NOTE(review): zip is given an absolute path, so archive entries will
# presumably be stored under mnt/data/bert_emotion_model/ — confirm that this
# layout is what the consumer of the zip expects.
!zip -r bert_emotion_model.zip /mnt/data/bert_emotion_model

# Download it. The google.colab import means this cell needs the Colab
# runtime to work.
from google.colab import files
files.download("bert_emotion_model.zip")


In [None]:
# Download label encoder separately.
# Relies on `files` having been imported from google.colab in the previous
# cell; run that cell first.
files.download("/mnt/data/src/label_encoder.pkl")
