In [15]:
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import joblib

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ertso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# **Loading and processing data**

In [None]:
# Dataset: "Emotions" (Kaggle)
# https://www.kaggle.com/datasets/nelgiriyewithana/emotions

# Read the CSV and discard the "Unnamed: 0" column.
df = pd.read_csv("text.csv", sep=",").drop(columns="Unnamed: 0")

In [18]:
# English stop-word list from NLTK; clean_text() filters tokens against it.
sw = stopwords.words("english")

# Patterns are compiled once when the cell runs rather than on every call.
_URL_RE = re.compile(r"http\S+")
# Tags must start with a letter (optionally after "/") so that stray
# comparison signs such as "<3" or "5 > 3" are not mistaken for markup.
_HTML_TAG_RE = re.compile(r"</?[a-zA-Z][^<>]*>")
# Everything except ASCII letters and ". ? ! , ¿" collapses to one space.
# This also strips digits, emojis and any other non-ASCII symbols.
_NON_TEXT_RE = re.compile(r"[^a-zA-Z?.!,¿]+")
_PUNCTUATION = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
_PUNCT_TABLE = str.maketrans("", "", _PUNCTUATION)


def clean_text(text, stop_words=None):
    """Normalise a raw text string before vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. remove URLs (anything starting with "http") and HTML tags;
      3. replace every run of characters other than ASCII letters and
         ". ? ! , ¿" with a single space;
      4. delete the characters listed in ``_PUNCTUATION`` (commas and "¿"
         are not in that list and therefore survive);
      5. drop stop words and re-join the remaining tokens with one space.

    URLs and tags are removed *before* step 3. Previously the character
    filter ran first and destroyed "://", "<" and ">", so the URL/HTML
    patterns could no longer match and fragments such as "examplecom" or
    tag names leaked into the output. The former emoji pass was dead code
    for the same reason (step 3 already removes emojis) and was dropped.

    Parameters
    ----------
    text : str
        Raw input text.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to the module-level ``sw`` list.

    Returns
    -------
    str
        The cleaned, space-separated text (possibly empty).
    """
    if stop_words is None:
        stop_words = sw

    text = text.lower()

    # Substitute a space (not "") so neighbouring words are never glued together;
    # the final split/join normalises the extra whitespace.
    text = _URL_RE.sub(" ", text)
    text = _HTML_TAG_RE.sub(" ", text)

    text = _NON_TEXT_RE.sub(" ", text)
    text = text.translate(_PUNCT_TABLE)

    return " ".join(word for word in text.split() if word not in stop_words)

In [19]:
# Clean every document in place; clean_text is passed directly, no wrapper needed.
df["text"] = df["text"].apply(clean_text)

In [20]:
# TF-IDF features over the cleaned text, with scikit-learn's English stop-word filter.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so its statistics include the test documents — consider
# fitting on the training portion only.
vectorizer = TfidfVectorizer(stop_words="english")

X = vectorizer.fit_transform(df["text"])  # feature matrix
y = df["label"]                           # target labels

In [21]:
# Hold out 40% of the samples for testing; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# **Creating models**

## **Logistic Regression (multinomial model)**

In [22]:
# Logistic-regression classifier: "sag" solver, no intercept term,
# fixed seed for reproducibility, all CPU cores.
mnm = LogisticRegression(
    solver="sag",
    C=0.35,
    tol=0.0001,
    fit_intercept=False,
    max_iter=10000,
    n_jobs=-1,
    random_state=42,
)

mnm.fit(X_train, y_train)

In [23]:
y_pred = mnm.predict(X_test)

# Mean accuracy: training set on the first line, test set on the second.
print(mnm.score(X_train, y_train), mnm.score(X_test, y_test), sep="\n")

0.9166363436431614
0.8924989803507594


In [26]:
# Persist the fitted logistic-regression model to disk.
joblib.dump(value=mnm, filename="emo_clf_mnm.sav")

['emo_clf_mnm.sav']

## **SGDClassifier**

In [24]:
# Linear classifier trained with SGD: hinge loss, L2 penalty,
# inverse-scaling learning-rate schedule, fixed seed.
# NOTE(review): validation_fraction is presumably only consulted when
# early_stopping=True, which is not set here — confirm it is intended.
sgdc = SGDClassifier(
    # objective
    loss="hinge",
    penalty="l2",
    alpha=0.000005,
    # learning-rate schedule
    learning_rate="invscaling",
    eta0=10,
    power_t=0.4,
    # stopping criteria
    max_iter=10000,
    tol=0.001,
    n_iter_no_change=1000,
    validation_fraction=0.01,
    # execution
    shuffle=True,
    random_state=42,
    n_jobs=-1,
)

sgdc.fit(X_train, y_train)

In [25]:
y_pred = sgdc.predict(X_test)

# Mean accuracy: training set on the first line, test set on the second.
print(sgdc.score(X_train, y_train), sgdc.score(X_test, y_test), sep="\n")

0.9276006157906311
0.8923670257431443


In [27]:
# Persist the fitted SGD classifier. The original call passed `mnm`, so the
# logistic-regression model was written under the SGD file name.
joblib.dump(sgdc, "emo_clf_sgdc.sav")

['emo_clf_sgdc.sav']

# **Predicting the emotion**

In [28]:
def clf_mnm(string: str) -> str:
    """Predict the emotion of ``string`` with the logistic-regression model.

    The text is vectorised with the fitted ``vectorizer`` and classified by
    ``mnm``; the numeric class is then translated to its emotion name.

    Parameters
    ----------
    string : str
        Text to classify.
        NOTE(review): the training text went through clean_text() before
        vectorisation, but this input does not — confirm this is intended.

    Returns
    -------
    str
        One of "sadness", "joy", "love", "anger", "fear", "surprise".
        Any label outside 0-4 maps to "surprise", as before.
    """
    emotions = {
        0: "sadness",
        1: "joy",
        2: "love",
        3: "anger",
        4: "fear",
        5: "surprise",  # was misspelled "suprise"
    }

    vector = vectorizer.transform([string])
    pred = mnm.predict(vector)[0]

    return emotions.get(pred, "surprise")



In [29]:
def clf_sgdc(string: str) -> str:
    """Predict the emotion of ``string`` with the SGD classifier.

    The text is vectorised with the fitted ``vectorizer`` and classified by
    ``sgdc``; the numeric class is then translated to its emotion name.

    Parameters
    ----------
    string : str
        Text to classify.
        NOTE(review): the training text went through clean_text() before
        vectorisation, but this input does not — confirm this is intended.

    Returns
    -------
    str
        One of "sadness", "joy", "love", "anger", "fear", "surprise".
        Any label outside 0-4 maps to "surprise", as before.
    """
    emotions = {
        0: "sadness",
        1: "joy",
        2: "love",
        3: "anger",
        4: "fear",
        5: "surprise",  # was misspelled "suprise"
    }

    vector = vectorizer.transform([string])
    pred = sgdc.predict(vector)[0]

    return emotions.get(pred, "surprise")