In [24]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

# Shared module-level bag-of-words vectorizer. extract_bow_features() fits it on
# the training text and then reuses the fitted instance to transform the test text.
vectorizer = CountVectorizer()

In [25]:
def load_data():
    """Flatten the "daily_dialog" training split into two parallel lists.

    Every record of the ``"train"`` split contributes the items of its
    ``"dialog"`` field paired position-by-position with the items of its
    ``"emotion"`` field.

    Returns:
        tuple[list, list]: ``(all_sentences, all_emotions)`` where
        ``all_emotions[i]`` is the label stored alongside ``all_sentences[i]``.
    """
    corpus = load_dataset("daily_dialog")

    # zip() stops at the shorter of the two fields, so an utterance without a
    # matching label (or vice versa) is dropped rather than mis-aligned.
    turns = [
        turn
        for record in corpus["train"]
        for turn in zip(record["dialog"], record["emotion"])
    ]

    all_sentences = [utterance for utterance, _ in turns]
    all_emotions = [label for _, label in turns]
    return all_sentences, all_emotions


In [26]:
def split_data(sentences, emotion_labels, train_ratio=0.8, seed=5):
    """Randomly partition sentences and their labels into train and test sets.

    Args:
        sentences: indexable sequence of sentences.
        emotion_labels: indexable sequence of labels, aligned with ``sentences``.
        train_ratio: fraction of the samples placed in the training set; the
            cut position is ``int(train_ratio * len(sentences))``.
        seed: value passed to ``np.random.seed`` before shuffling. Note that
            this reseeds NumPy's global random state.

    Returns:
        tuple: ``(train_sentences, train_emotions, test_sentences, test_emotions)``
        as plain lists. Sentence/label pairing is preserved within each split.
    """
    np.random.seed(seed)

    total = len(sentences)
    order = np.arange(total)
    np.random.shuffle(order)

    cut = int(train_ratio * total)
    train_positions, test_positions = order[:cut], order[cut:]

    def gather(source, positions):
        # Pull the selected items out in shuffled order.
        return [source[pos] for pos in positions]

    train_sentences = gather(sentences, train_positions)
    train_emotions = gather(emotion_labels, train_positions)
    test_sentences = gather(sentences, test_positions)
    test_emotions = gather(emotion_labels, test_positions)

    print(f"Number of training sentences: {len(train_sentences)}")
    print(f"Number of test sentences: {len(test_sentences)}")
    print()
    return train_sentences, train_emotions, test_sentences, test_emotions


In [27]:
def extract_bow_features(trainInputs, testInputs):
    """Convert raw sentences into bag-of-words count features.

    The module-level ``vectorizer`` is (re)fitted on ``trainInputs`` and the
    fitted instance is then applied to ``testInputs``, so the vocabulary is
    learned from the training sentences only.

    Args:
        trainInputs: sized sequence of training sentences.
        testInputs: sized sequence of test sentences.

    Returns:
        tuple: ``(train_features, test_features)`` exactly as returned by
        ``vectorizer.fit_transform`` and ``vectorizer.transform``.
    """
    # Work on the arguments that were passed in, not on same-named notebook
    # globals, so the function behaves correctly for any caller.
    print(f"Number of train sentences: {len(trainInputs)}")
    print(f"Number of test sentences: {len(testInputs)}")

    train_features = vectorizer.fit_transform(trainInputs)

    # transform() only: the test text must not influence the vocabulary.
    test_features = vectorizer.transform(testInputs)

    print(f"Vocabulary size: {len(vectorizer.vocabulary_)} words")
    print(f"Training features shape: {train_features.shape}")
    print(f"Testing features shape: {test_features.shape}")
    print()
    return train_features, test_features

In [28]:
def train_model(train_features, train_emotions, hidden_size=32, lr=0.001,
                epochs=10, seed=None):
    """Train a one-hidden-layer classifier on bag-of-words features.

    Args:
        train_features: 2-D feature matrix of shape (n_samples, n_features).
            May be a dense array-like or any object exposing ``.toarray()``
            (e.g. a scipy sparse matrix), which is densified first.
        train_emotions: 1-D sequence of non-negative integer class ids, one
            per row of ``train_features``.
        hidden_size: width of the hidden layer.
        lr: Adam learning rate.
        epochs: number of optimisation steps. Training is full-batch, so each
            epoch performs exactly one parameter update.
        seed: if given, passed to ``torch.manual_seed`` before the model is
            built so weight initialisation is reproducible.

    Returns:
        nn.Sequential: the trained ``Linear -> ReLU -> Linear`` model.

    Raises:
        ValueError: if the features are not a non-empty 2-D matrix or the
            number of labels does not match the number of rows.
    """
    # Accept the sparse output of the vectorizer directly as well as dense data.
    if hasattr(train_features, "toarray"):
        train_features = train_features.toarray()
    X = torch.as_tensor(np.asarray(train_features), dtype=torch.float32)
    y = torch.as_tensor(np.asarray(train_emotions), dtype=torch.long)

    if X.dim() != 2 or X.shape[0] == 0:
        raise ValueError("train_features must be a non-empty 2-D matrix")
    if y.dim() != 1 or y.shape[0] != X.shape[0]:
        raise ValueError("train_emotions must hold exactly one label per feature row")

    if seed is not None:
        torch.manual_seed(seed)

    input_size = X.shape[1]
    # Size the output layer by the largest class id rather than the number of
    # distinct ids seen: if some id is absent from the training split, counting
    # unique values would make the layer too small and CrossEntropyLoss would
    # reject the larger targets.
    num_classes = int(y.max().item()) + 1
    model = nn.Sequential(
        nn.Linear(input_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, num_classes)
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for _ in range(epochs):
        optimizer.zero_grad()
        loss = criterion(model(X), y)
        loss.backward()
        optimizer.step()

    return model

def test_model(model, test_features, test_emotions):
    X = torch.FloatTensor(test_features.toarray())

    with torch.no_grad():
        outputs = model(X)
        _, predicted = torch.max(outputs.data, 1)

    predictions = predicted.numpy()
    accuracy = accuracy_score(test_emotions, predictions)
    print(f"Test accuracy: {accuracy:.4f}")

In [29]:
if __name__ == '__main__':
    # End-to-end run: load -> split -> vectorize -> train -> evaluate.
    # The names bound below live at module scope (an `if` block introduces no
    # new scope), so renaming them can affect other cells of the notebook.
    sentences, labels = load_data()
    train_sentences, train_emotions, test_sentences, test_emotions = split_data(sentences, labels)
    print()
    train_features, test_features = extract_bow_features(train_sentences, test_sentences)

    # train_model is handed a dense array here; test_model receives the feature
    # object as returned by the vectorizer and densifies it itself.
    clf = train_model(train_features.toarray(), train_emotions)
    test_model(clf, test_features, test_emotions)



Number of training sentences: 69736
Number of test sentences: 17434


Number of train sentences: 69736
Number of test sentences: 17434
Vocabulary size: 16428 words
Training features shape: (69736, 16428)
Testing features shape: (17434, 16428)

Test accuracy: 0.8284
