In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from transformers import BertTokenizer

In [None]:
df_emotion = pd.read_parquet('train-00000-of-00001.parquet')

df_emotion = df_emotion[['text', 'label']]

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_emotion['text'], df_emotion['label'], test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")
        return {key: val.squeeze(0) for key, val in encoding.items()}, torch.tensor(label)

train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)
val_dataset = EmotionDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256)


In [None]:
class EmotionRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=2, dropout=0.3):
        super(EmotionRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        output = self.fc(self.dropout(hidden[-1]))
        return output

vocab_size = len(tokenizer.vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 7


In [None]:
model = EmotionRNN(vocab_size, embedding_dim, hidden_dim, output_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

In [None]:
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch
        inputs = inputs['input_ids'].to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            inputs = inputs['input_ids'].to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(true_labels, predictions)
    print(f"Validation Accuracy: {acc}")

torch.save(model.state_dict(), "emotion_rnn_model.pth")


Epoch 1, Loss: 1.5893732456637637
Validation Accuracy: 0.33884743648185023
Epoch 2, Loss: 1.5781244699166357
Validation Accuracy: 0.33884743648185023
Epoch 3, Loss: 1.5764349813380794
Validation Accuracy: 0.33884743648185023


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        inputs, labels = batch
        inputs = inputs['input_ids'].to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

acc = accuracy_score(true_labels, predictions)
print(f"Validation Accuracy: {acc}")

precision = precision_score(true_labels, predictions, average='weighted')
recall = recall_score(true_labels, predictions, average='weighted')
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"Validation Precision: {precision}")
print(f"Validation Recall: {recall}")
print(f"Validation F1-Score: {f1}")

report = classification_report(true_labels, predictions)
print("Classification Report:")
print(report)


Validation Accuracy: 0.33884743648185023


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Precision: 0.11481758521032152
Validation Recall: 0.33884743648185023
Validation F1-Score: 0.17151705576257872
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     24504
           1       0.34      1.00      0.51     28247
           2       0.00      0.00      0.00      6853
           3       0.00      0.00      0.00     11339
           4       0.00      0.00      0.00      9376
           5       0.00      0.00      0.00      3043

    accuracy                           0.34     83362
   macro avg       0.06      0.17      0.08     83362
weighted avg       0.11      0.34      0.17     83362



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
model = EmotionRNN(vocab_size, embedding_dim, hidden_dim, output_dim)
model.load_state_dict(torch.load("emotion_rnn_model.pth"))
model.to(device)
model.eval()

def predict_emotion(text):
    inputs = tokenizer(text, max_length=128, padding="max_length", truncation=True, return_tensors="pt")

    inputs = inputs['input_ids'].to(device)

    with torch.no_grad():
        outputs = model(inputs)
        predicted_label = torch.argmax(outputs, dim=1).cpu().numpy()[0]

    emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
    predicted_emotion = emotions[predicted_label]
    return predicted_emotion

input_text = "i am happy to life"
predicted_emotion = predict_emotion(input_text)

print(f"Predicted Emotion: {predicted_emotion}")


Predicted Emotion: joy


  model.load_state_dict(torch.load("emotion_rnn_model.pth"))
