In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw data: a ';'-separated file with no header row, so the two
# columns are named explicitly ("text" and "emotions").
df = pd.read_csv("train.txt", sep=";", header=None, names=["text", "emotions"])

In [None]:
df.head()

# Data cleaning and preprocessing


In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
# Drop exact duplicate rows in place. ignore_index is not passed, so the
# surviving rows keep their original row labels (later .loc[...] lookups
# are by those labels, not by position).
df.drop_duplicates(inplace=True)

In [None]:
df.duplicated().sum()

In [None]:
df["emotions"].unique()

In [None]:
# convert emotions to numbers

# Number the labels 0, 1, 2, ... in the order they first appear in the column.
unique_emotions = df["emotions"].unique()
emotion_numbers = {label: code for code, label in enumerate(unique_emotions)}

# Replace each emotion string with its integer code.
df["emotions"] = df["emotions"].map(emotion_numbers)

In [None]:
emotion_numbers

In [None]:
df.head()

In [None]:
# converting all text to lower case
df["text"] = df["text"].str.lower()

In [None]:
df.head()

In [None]:
# removing punctuation
import string


def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    All other characters (letters, digits, whitespace, non-ASCII symbols)
    are left untouched.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)


df["text"] = df["text"].apply(remove_punctuation)

In [None]:
df.head()

In [None]:
# removing numbers from text
def remove_numbers(text):
    """Return ``text`` with every digit character removed.

    A character counts as a digit when ``str.isdigit`` is true for it;
    everything else is kept in its original order.
    """
    return "".join(ch for ch in text if not ch.isdigit())


df["text"] = df["text"].apply(remove_numbers)

In [None]:
df.head()

In [None]:
# removing urls/links from text
import re


def remove_urls(text):
    """Strip URL-like tokens from ``text``.

    A token is removed when it starts with ``http`` (which also covers
    ``https``) or ``www`` and is followed by at least one non-whitespace
    character; the whole run of non-whitespace characters is deleted.
    Surrounding whitespace is left as is.

    The match is anchored at a word boundary so that ordinary words which
    merely contain those letters mid-word (e.g. "awwww") are not truncated.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the URL-like tokens deleted.
    """
    # \b keeps "www"/"http" from matching in the middle of a word.
    return re.sub(r"\b(?:http|www)\S+", "", text)


df["text"] = df["text"].apply(remove_urls)

In [None]:
df.head()

In [None]:
# removing extra spaces from text
def remove_extra_spaces(text):
    """Collapse whitespace runs in ``text`` and trim both ends.

    Every run of one or more whitespace characters becomes a single space,
    then leading and trailing whitespace is stripped.
    """
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


df["text"] = df["text"].apply(remove_extra_spaces)

In [None]:
df.head()

In [None]:
# removing emojis from text
def remove_emojis(text):
    """Return ``text`` keeping only its ASCII characters.

    Note that this drops every non-ASCII character, not just emoji:
    accented letters and other non-ASCII symbols are removed as well.
    """
    return "".join(filter(str.isascii, text))


df["text"] = df["text"].apply(remove_emojis)

In [None]:
df.head()

In [None]:
# removing stopwords using nltk library
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# download stopwords
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")

In [None]:
eng_stopwords = set(stopwords.words("english"))

In [None]:
len(eng_stopwords)

In [None]:
type(eng_stopwords)

In [None]:
df["text"].loc[1]

In [None]:
# tokenizing and removing stopwords
def remove_stopwords(text):
    """Tokenize ``text`` and drop the tokens found in ``eng_stopwords``.

    The surviving tokens are re-joined with single spaces, so the result
    is a plain string again rather than a token list.
    """
    kept_tokens = [
        token for token in word_tokenize(text) if token not in eng_stopwords
    ]
    return " ".join(kept_tokens)


df["text"] = df["text"].apply(remove_stopwords)

In [None]:
df["text"].loc[1]

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# separating features and labels
x = df["text"]
y = df["emotions"]

In [None]:
# train-test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the same split is produced on every run.
# NOTE(review): the split is not stratified on y — if the emotion classes are
# imbalanced, consider stratify=y; confirm against the label distribution.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
X_train.shape, X_test.shape

## Model training using bag-of-words (BoW)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
bow_vectorizer = CountVectorizer()

In [None]:
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_train_bow

In [None]:
X_test_bow = bow_vectorizer.transform(X_test)
X_test_bow

In [None]:
# logistic regression classifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Fit on the bag-of-words count matrix built from the training split.
# max_iter is raised to 1000 — presumably so the solver converges on these
# features; confirm by checking for convergence warnings.
lr_model = LogisticRegression(max_iter=1000, n_jobs=-1)
lr_model.fit(X_train_bow, y_train)
# Score on the held-out split, which was only transformed (never fitted on)
# by the vectorizer.
y_pred = lr_model.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Logistic Regression model: {accuracy * 100:.2f}%")

In [None]:
# saving the model together with the artifacts needed to reuse it
import joblib

# The classifier alone cannot score raw text: inference also needs the fitted
# CountVectorizer (its vocabulary defines the feature columns) and the
# emotion -> integer mapping (to turn predicted codes back into labels).
joblib.dump(bow_vectorizer, "bow_vectorizer.pkl")
joblib.dump(emotion_numbers, "emotion_numbers.pkl")
# Kept last so the cell still displays the model file name as its output.
joblib.dump(lr_model, "logistic_regression_model.pkl")
