In [12]:
import pandas as pd

def _read_split(path):
    """Read one semicolon-delimited, header-less split file.

    Returns a DataFrame whose two columns are named "Text" and "Emotion".
    """
    frame = pd.read_table(path, delimiter=";", header=None)
    frame.columns = ["Text", "Emotion"]
    return frame


train = _read_split("../datasets/CREMA-D/text/train.txt")
val = _read_split("../datasets/CREMA-D/text/val.txt")
test = _read_split("../datasets/CREMA-D/text/test.txt")

# Stack the three splits into a single frame (fresh 0..n-1 index) with
# lowercase column names; the per-split frames keep "Text"/"Emotion".
data = pd.concat([train, val, test], ignore_index=True)
data.columns = ["text", "label"]

print("Data shape:", data.shape)

Data shape: (20000, 2)


In [13]:
# Count the rows of `data` that contain at least one missing value.
data.isnull().any(axis="columns").sum()

0

### Preprocess Function

The preprocess function is designed to clean and normalize textual data by:

1. **Removing unwanted characters**: Replacing every non-alphabetic character with a space, so that only letters (a–z, A–Z) remain.
2. **Normalizing text**: Converting all characters to lowercase.
3. **Tokenizing**: Splitting text into individual words.
4. **Removing stopwords**: Eliminating common words that may not contribute to the analysis (e.g., "and", "the").
5. **Stemming**: Reducing words to their root forms (e.g., "running" → "run").
6. **Reconstructing the text**: Combining the processed words back into a single string.

This process is essential in Natural Language Processing (NLP) tasks, such as emotion detection from text, to clean and prepare textual data for analysis or modeling.

- **PorterStemmer**: A stemming algorithm provided by NLTK that reduces words to their base or root form (e.g., "connect", "connecting", "connected" → "connect").
- **stopwords**: A corpus in NLTK containing common words (like "the", "is", "in") that are typically removed in preprocessing because they carry minimal semantic value.

In [14]:
import nltk

nltk.download("stopwords")

import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


# Built once when the cell runs. The previous version created a new
# PorterStemmer and rebuilt the English stopword list for every single token,
# then searched that list linearly; a frozenset gives constant-time lookups.
_STEMMER = PorterStemmer()
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def preprocess(line):
    """Clean and normalize one text sample.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lowercase the result,
      3. split on whitespace,
      4. drop English stopwords,
      5. Porter-stem the remaining tokens,
      6. join the stems back together with single spaces.

    Args:
        line: The raw text (a ``str``).

    Returns:
        The cleaned text as a single space-separated string; an empty string
        when no token survives the filtering.
    """
    tokens = re.sub("[^a-zA-Z]", " ", line).lower().split()

    # Stopwords are filtered before stemming, so the comparison is made on
    # the lowercased surface form of each token.
    return " ".join(
        _STEMMER.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Potential Enhancements and Best Practices
# Using Lemmatization Instead of Stemming:

# Difference:

# Stemming: Cuts off word suffixes to get to the root form (e.g., "running" → "run").
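# Lemmatization: Uses vocabulary and morphological analysis to return the base or dictionary form of a word (e.g., "better" → "good").
# Note: with NLTK's WordNetLemmatizer this example needs the part of speech, i.e. lemmatize("better", pos="a");
# the default pos="n" treats every word as a noun, which is what the sketch below does.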
# Advantage: Lemmatization tends to produce more meaningful roots.

################################################################################

# from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')

# lemmatizer = WordNetLemmatizer()

# def preprocess(line):
#     review = re.sub('[^a-zA-Z]', ' ', line)
#     review = review.lower()
#     review = review.split()
#     review = [lemmatizer.lemmatize(word) for word in review if word not in stopwords.words('english')]
#     return ' '.join(review)


In [19]:

# `preprocessing` is not imported by any earlier cell in this notebook, so the
# cell failed with NameError on a fresh kernel; import it here.
from sklearn import preprocessing

# Clean the raw "Text" column of each split into a new lowercase "text" column.
train["text"] = train["Text"].apply(preprocess)
val["text"] = val["Text"].apply(preprocess)
test["text"] = test["Text"].apply(preprocess)

# Map emotion names to integer ids. The encoder is fitted on the training
# labels only and then reused, so all three splits share one id mapping.
label_encoder = preprocessing.LabelEncoder()
train["N_label"] = label_encoder.fit_transform(train["Emotion"])
val["N_label"] = label_encoder.transform(val["Emotion"])
test["N_label"] = label_encoder.transform(test["Emotion"])


---

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features over unigrams, bigrams and trigrams, capped at 5000
# features.
cv = CountVectorizer(max_features=5000, ngram_range=(1, 3))

# The vocabulary is fitted on the training split only; validation and test
# are transformed with that same vocabulary. `.toarray()` turns each sparse
# count matrix into a dense array.
X_train = cv.fit_transform(train["text"]).toarray()
X_val, X_test = (cv.transform(split["text"]).toarray() for split in (val, test))

# Integer-encoded emotion labels, one vector per split.
y_train, y_val, y_test = (split["N_label"] for split in (train, val, test))


In [24]:
# NOTE(review): `Sequential` and `Dense` are not imported in the cells visible
# here -- confirm an earlier cell brings them into scope.

# Fully connected classifier: three ReLU hidden layers (128 -> 64 -> 32) on
# top of the bag-of-words vector, then a softmax over the encoded classes.
model = Sequential(
    [
        Dense(128, activation="relu", input_shape=(X_train.shape[1],)),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(len(label_encoder.classes_), activation="softmax"),
    ]
)

# The sparse loss is used because the targets are integer class ids rather
# than one-hot vectors.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=10,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ff9508f8e0>

In [25]:
# Score the trained model on the test split; `evaluate` returns the loss
# followed by the compiled metric (accuracy).
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)

print("Test Loss: %.4f" % (test_loss,))
print("Test Accuracy: %.2f%%" % (100 * test_accuracy,))


Test Loss: 1.1471
Test Accuracy: 84.60%


In [27]:
import numpy as np
