In [None]:
import pandas as pd

def _read_split(path):
    """Read one ';'-separated split file (no header row) as a Text/Emotion frame."""
    frame = pd.read_table(path, sep=";", header=None)
    frame.columns = ["Text", "Emotion"]
    return frame


# The three splits live in separate files and are all parsed the same way.
train = _read_split("./datasets/Emotion Detection from Text/train.txt")
val = _read_split("./datasets/Emotion Detection from Text/val.txt")
test = _read_split("./datasets/Emotion Detection from Text/test.txt")

# Stack the splits into one frame with a fresh 0..n-1 index, then switch to the
# lowercase column names that the following cells refer to.
data = pd.concat([train, val, test], ignore_index=True)
data.columns = ["text", "label"]

print("Data shape:", data.shape)

In [None]:
# Number of rows that have a missing value in at least one column.
data.isnull().any(axis="columns").sum()

### Preprocess Function

The preprocess function is designed to clean and normalize textual data by:

1. **Removing unwanted characters**: Retaining only alphabetic characters.
2. **Normalizing text**: Converting all characters to lowercase.
3. **Tokenizing**: Splitting text into individual words.
4. **Removing stopwords**: Eliminating common words that may not contribute to the analysis (e.g., "and", "the").
5. **Stemming**: Reducing words to their root forms (e.g., "running" → "run").
6. **Reconstructing the text**: Combining the processed words back into a single string.

This process is essential in Natural Language Processing (NLP) tasks, such as emotion detection from text, to clean and prepare textual data for analysis or modeling.

- **PorterStemmer**: A stemming algorithm provided by NLTK that reduces words to their base or root form (e.g., "connect", "connecting", "connected" → "connect").
- **stopwords**: A corpus in NLTK containing common words (like "the", "is", "in") that are typically removed in preprocessing because they carry minimal semantic value.

In [None]:
import nltk

nltk.download("stopwords")

import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


# Built once so that preprocess() does not recreate the stemmer or reload the
# stopword list for every word it sees. A frozenset gives constant-time
# membership checks instead of scanning a list.
_STEMMER = PorterStemmer()
_STOPWORDS = frozenset(stopwords.words("english"))


def preprocess(line):
    """Clean and normalise one piece of text.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lowercase the result,
      3. split on whitespace,
      4. drop English stopwords,
      5. stem the remaining words with the Porter stemmer,
      6. join the stems back together with single spaces.

    Args:
        line: The raw text to clean (a ``str``).

    Returns:
        A ``str`` of space-separated stems; empty if no word survives.
    """
    tokens = re.sub("[^a-zA-Z]", " ", line).lower().split()

    # Filter first, then stem: only words that are kept need stemming.
    return " ".join(
        _STEMMER.stem(token) for token in tokens if token not in _STOPWORDS
    )

In [63]:
# Potential Enhancements and Best Practices
# Using Lemmatization Instead of Stemming:

# Difference:

# Stemming: Cuts off word suffixes to get to the root form (e.g., "running" → "run").
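# Lemmatization: Uses vocabulary and morphological analysis to return the base or dictionary form of a word (e.g., "better" → "good" when the word is tagged as an adjective).
# Note: NLTK's WordNetLemmatizer treats every word as a noun unless a part-of-speech tag is passed to lemmatize().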
# Advantage: Lemmatization tends to produce more meaningful roots.

################################################################################

# from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')

# lemmatizer = WordNetLemmatizer()

# def preprocess(line):
#     review = re.sub('[^a-zA-Z]', ' ', line)
#     review = review.lower()
#     review = review.split()
#     review = [lemmatizer.lemmatize(word) for word in review if word not in stopwords.words('english')]
#     return ' '.join(review)


In [64]:
# Overwrite the raw text column with its cleaned form. `preprocess` already
# takes a single string, so it is handed to `apply` directly.
data["text"] = data["text"].apply(preprocess)

In [65]:
from sklearn import preprocessing

# Map each emotion name to an integer class id. The fitted encoder is kept so
# that predicted ids can be translated back to names later on.
label_encoder = preprocessing.LabelEncoder().fit(data["label"])
data["N_label"] = label_encoder.transform(data["label"])

---

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over unigrams, bigrams and trigrams, with the vocabulary
# capped at 5000 entries.
# NOTE(review): the vocabulary is learned from every row, including the rows
# that are later held out for testing -- consider fitting on the training
# portion only.
cv = CountVectorizer(ngram_range=(1, 3), max_features=5000)
sparse_counts = cv.fit_transform(data["text"])

# Convert the sparse count matrix to a dense array.
data_cv = sparse_counts.toarray()

In [67]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing; the fixed seed keeps the split
# the same from run to run.
features, targets = data_cv, data["N_label"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    random_state=42,
    test_size=0.25,
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Size the output layer from the fitted label encoder rather than hard-coding
# the number of emotions, so the network always matches the label set.
num_classes = len(label_encoder.classes_)

# Small fully connected classifier over the bag-of-words features.
model = Sequential()
model.add(Dense(12, input_shape=(X_train.shape[1],), activation="relu"))
model.add(Dense(8, activation="relu"))
model.add(Dense(num_classes, activation="softmax"))

# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

model.fit(X_train, y_train, epochs=10, batch_size=10)

# Report accuracy on the held-out split. Accuracy on the training rows is
# shown alongside it only as a reference for spotting overfitting.
_, train_accuracy = model.evaluate(X_train, y_train)
_, accuracy = model.evaluate(X_test, y_test)
print("Train accuracy: %.2f" % (train_accuracy * 100))
print("Test accuracy: %.2f" % (accuracy * 100))

In [None]:
import numpy as np

# Run one sample sentence through the same pipeline used for training:
# clean -> vectorise -> predict -> decode the class id back to its name.
text = preprocess("I feel sad")
array = cv.transform([text]).toarray()
pred = model.predict(array)

# Pick the class with the highest predicted score for each input row.
a = np.argmax(pred, axis=1)
predicted_emotion = label_encoder.inverse_transform(a)[0]
predicted_emotion

In [None]:
import tensorflow as tf
import pickle

# Persist the three artefacts needed to reproduce predictions: the trained
# network, the label encoder and the fitted vectoriser.
tf.keras.models.save_model(model, "my_model.h5")

# `with` closes each file as soon as the dump finishes, so the pickles are
# fully flushed to disk and no file handle is left open.
# NOTE: pickle files can execute code when loaded -- only load them from
# locations you trust.
with open("encoder.pkl", "wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)

with open("CountVectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(cv, vectorizer_file)