# Task 1: BERT – Emotion Detection



In [4]:
# Install important libraries
!pip install transformers torch scikit-learn pandas

# Import the required tools
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments




In [5]:
from google.colab import files

# Open file picker to upload from your computer
uploaded = files.upload()


Saving emotion_labels.csv to emotion_labels.csv
Saving emotions-dataset.csv to emotions-dataset.csv


In [6]:
import pandas as pd

# Check first file
df1 = pd.read_csv("emotion_labels.csv")
print("emotion_labels.csv:")
print(df1.head())

# Check second file
df2 = pd.read_csv("emotions-dataset.csv")
print("\nemotions-dataset.csv:")
print(df2.head())



emotion_labels.csv:
   label  emotion
0      0      Joy
1      1  Sadness
2      2  Neutral
3      3    Anger

emotions-dataset.csv:
                                             content  sentiment
0                   not a very good day at the house          1
1  tommcfly i saw you on tues and last niiiighht ...          2
2     i dont even understand the intro to this book           3
3      happy mothers day mommy and grandma haha  ily          0
4  quotoh i got so fucked up last nightquot but u...          3


In [7]:
# Rename columns to match expected names
df2.rename(columns={'content': 'Text', 'sentiment': 'Emotion'}, inplace=True)

# Split into training and testing parts (80% train, 20% test)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df2['Text'], df2['Emotion'], test_size=0.2, random_state=42
)

# Make numeric labels (model needs numbers, not words)
unique_labels = df2['Emotion'].unique().tolist()
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

train_labels = [label2id[label] for label in train_labels]
test_labels  = [label2id[label] for label in test_labels]

In [8]:
# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Turn words into tokens that BERT understands
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=64)
test_encodings  = tokenizer(list(test_texts), truncation=True, padding=True, max_length=64)

# Convert everything into a list of dictionaries for PyTorch Dataset
train_dataset = []
for i in range(len(train_texts)):
    train_dataset.append({
        'input_ids': torch.tensor(train_encodings['input_ids'][i]),
        'attention_mask': torch.tensor(train_encodings['attention_mask'][i]),
        'labels': torch.tensor(train_labels[i])
    })

test_dataset = []
for i in range(len(test_texts)):
    test_dataset.append({
        'input_ids': torch.tensor(test_encodings['input_ids'][i]),
        'attention_mask': torch.tensor(test_encodings['attention_mask'][i]),
        'labels': torch.tensor(test_labels[i])
    })

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(unique_labels)
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Load your dataset
df = pd.read_csv("emotions-dataset.csv")

# --- Fix column names automatically ---
columns = [c.lower() for c in df.columns]
df.columns = columns

if 'text' in df.columns and 'emotion' in df.columns:
    df = df.rename(columns={'text': 'Text', 'emotion': 'Emotion'})
elif 'content' in df.columns and 'sentiment' in df.columns:
    df = df.rename(columns={'content': 'Text', 'sentiment': 'Emotion'})
else:
    print("Please check your column names:", df.columns)

# --- Clean text ---
df['Text'] = df['Text'].astype(str).str.lower()

# --- Label encoding ---
label2id = {'anger': 0, 'joy': 1, 'sadness': 2, 'neutral': 3}
id2label = {v: k for k, v in label2id.items()}

# --- Split dataset ---
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'], df['Emotion'], test_size=0.2, random_state=42
)

# --- Tokenization ---
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=64)
test_encodings  = tokenizer(list(test_texts), truncation=True, padding=True, max_length=64)

print(" Dataset and tokenization ready!")
print("Train samples:", len(train_texts))
print("Test samples:", len(test_texts))


 Dataset and tokenization ready!
Train samples: 17640
Test samples: 4410


In [1]:
# Import libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Load dataset
df = pd.read_csv("emotions-dataset.csv")
# Reduce dataset size for faster training
df = df.sample(3000, random_state=42)  # you can change 3000 → 2000 or 1000


# Fix column names automatically
df.columns = [c.lower() for c in df.columns]
if "text" in df.columns and "emotion" in df.columns:
    df.rename(columns={"text": "Text", "emotion": "Emotion"}, inplace=True)
elif "content" in df.columns and "sentiment" in df.columns:
    df.rename(columns={"content": "Text", "sentiment": "Emotion"}, inplace=True)

# Convert text and labels
df["Text"] = df["Text"].astype(str).str.lower()
label2id = {"anger": 0, "joy": 1, "sadness": 2, "neutral": 3}
id2label = {v: k for k, v in label2id.items()}
df["Emotion"] = df["Emotion"].apply(lambda x: label2id.get(str(x).lower(), 3))

# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["Text"], df["Emotion"], test_size=0.2, random_state=42
)

# Tokenize text
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_enc = tokenizer(list(train_texts), truncation=True, padding=True, max_length=64)
test_enc = tokenizer(list(test_texts), truncation=True, padding=True, max_length=64)

# Dataset class
class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, enc, labels):
        self.enc, self.labels = enc, labels
    def __getitem__(self, i):
        item = {k: torch.tensor(v[i]) for k, v in self.enc.items()}
        item["labels"] = torch.tensor(self.labels[i])
        return item
    def __len__(self):
        return len(self.labels)

train_ds = EmotionDataset(train_enc, list(train_labels))
test_ds = EmotionDataset(test_enc, list(test_labels))

# Load BERT
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

# Training setup (batch = 1)
args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_strategy="epoch", # Changed from evaluation_strategy
)

# Metrics
def metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(labels, preds),
            "f1": f1_score(labels, preds, average="weighted")}

trainer = Trainer(model=model, args=args, train_dataset=train_ds, eval_dataset=test_ds, compute_metrics=metrics)
trainer.train()

# Evaluate
print(trainer.evaluate())

# Detect device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Detect device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Example predictions
samples = ["I am feeling great today!", "This is so frustrating!", "I am sad."]

for text in samples:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()
    print(f"{text} → {id2label[pred]}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Currently logged in as: [33mf247812[0m ([33mf247812-national-university-of-computing-an-emerging-sci[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0,1.1e-05,1.0,1.0


{'eval_loss': 1.1399124559829943e-05, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 5.727, 'eval_samples_per_second': 104.767, 'eval_steps_per_second': 104.767, 'epoch': 1.0}
I am feeling great today! → neutral
This is so frustrating! → neutral
I am sad. → neutral


In [4]:
results = trainer.evaluate()
print("Accuracy:", results['eval_accuracy'])


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.0,5e-06,1.0,1.0


Accuracy: 1.0


In [5]:
import torch

# Make sure model is on correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define your correct label mapping
label2id = {'anger': 0, 'joy': 1, 'sadness': 2, 'neutral': 3}
id2label = {v: k for k, v in label2id.items()}

# Input text
text = "I am feeling very happy today!"

# Tokenize input and send to device
inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)

# Get model output
outputs = model(**inputs)

# Calculate probabilities
probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

# Get predicted label and confidence
pred = torch.argmax(probs).item()
confidence = probs[0][pred].item() * 100

# Print emotion name and confidence
print(f"Predicted emotion: {id2label[pred]} ({confidence:.2f}% confident)")


Predicted emotion: neutral (100.00% confident)


In [6]:
model.save_pretrained("bert_emotion_model")
tokenizer.save_pretrained("bert_emotion_model")


('bert_emotion_model/tokenizer_config.json',
 'bert_emotion_model/special_tokens_map.json',
 'bert_emotion_model/vocab.txt',
 'bert_emotion_model/added_tokens.json')

In [7]:
%%writefile app.py
import streamlit as st
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load your fine-tuned model
model_path = "bert_emotion_model"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define label mapping (must match your training)
label2id = {'anger': 0, 'joy': 1, 'sadness': 2, 'neutral': 3}
id2label = {v: k for k, v in label2id.items()}

# Streamlit app interface
st.title("Emotion Detection App")
st.write("Type any sentence and find out which emotion it expresses!")

text = st.text_area("Enter your text here:")

if st.button("Predict Emotion"):
    if text.strip() == "":
        st.warning("Please enter some text.")
    else:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        pred = torch.argmax(probs).item()
        confidence = probs[0][pred].item() * 100
        emotion = id2label[pred]
        st.success(f"Predicted emotion: {emotion} ({confidence:.2f}% confident)")


Overwriting app.py


In [8]:
%%writefile requirements.txt
streamlit
torch
transformers


Overwriting requirements.txt


In [9]:
!pip install huggingface_hub
from huggingface_hub import login
login()




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
model_path = "bert_emotion_model"
model_path = "your-username/bert-emotion-model"

