In [522]:
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import joblib

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ertso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [523]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# **Loading data**

In [524]:
# Social Media Sentiments Analysis Dataset
# https://www.kaggle.com/datasets/kashishparmar02/social-media-sentiments-analysis-dataset

smsad_df = pd.read_csv("sentimentdataset.csv", delimiter=",")
smsad_df = smsad_df[["Text", "Sentiment"]].rename(columns={"Text": "text", "Sentiment": "label"})

In [525]:
# Twitter Emotion
# https://www.kaggle.com/datasets/phantomrider/twitter-emotion

tw_df = pd.read_csv("twitter emotions.csv", delimiter=",")[["text", "emotion"]].rename(columns={"emotion": "label"})

In [526]:
# Emotion Detection from Text
# https://www.kaggle.com/datasets/pashupatigupta/emotion-detection-from-text

edt_df = pd.read_csv("tweet_emotions.csv", delimiter=",").rename(columns={"content": "text", "sentiment": "label"}).drop("tweet_id", axis=1)

In [527]:
# emotion analysis based on text
# https://www.kaggle.com/datasets/simaanjali/emotion-analysis-based-on-text

eabt_df = pd.read_csv("emotion_sentimen_dataset.csv", delimiter=",").rename(columns={"Emotion": "label"}).drop("Unnamed: 0", axis=1)
eabt_df = pd.concat([eabt_df[eabt_df["label"] != "neutral"], eabt_df[eabt_df["label"] == "neutral"].sample(frac=0.2)], ignore_index=True).reset_index().drop("index", axis=1)

In [528]:
# Emotion Classification NLP
# https://www.kaggle.com/datasets/anjaneyatripathi/emotion-classification-nlp

ecnlp_df = pd.concat([
    pd.read_csv("emotion-labels-test.csv", delimiter=","),
    pd.read_csv("emotion-labels-train.csv", delimiter=","),
    pd.read_csv("emotion-labels-val.csv", delimiter=",")
]).drop_duplicates().reset_index().drop("index", axis=1)

In [529]:
# Sentiment & Emotions Labelled Tweets
# https://www.kaggle.com/datasets/ankitkumar2635/sentiment-and-emotions-of-tweets

selt_df = pd.read_csv("sentiment-emotion-labelled_Dell_tweets.csv", delimiter=",").rename(columns={"Text": "text", "emotion": "label"})[["text", "label"]]

In [530]:
# Emotion Dataset Raw
# https://www.kaggle.com/datasets/rikinzala/emotion-dataset-raw

edr_df = pd.read_csv("emotion_dataset_raw.csv", delimiter=",").rename(columns={"Text": "text", "Emotion": "label"})

In [531]:
df = pd.concat([smsad_df, edt_df, ecnlp_df, tw_df, eabt_df, selt_df, edr_df], ignore_index=True).drop_duplicates().reset_index().drop("index", axis=1)

In [532]:
df

Unnamed: 0,text,label
0,Enjoying a beautiful day at the park! ...,Positive
1,Traffic was terrible this morning. ...,Negative
2,Just finished an amazing workout! 💪 ...,Positive
3,Excited about the upcoming weekend getaway! ...,Positive
4,Trying out a new recipe for dinner tonight. ...,Neutral
...,...,...
299196,@MichelGW have you gift! Hope you like it! It'...,surprise
299197,The world didnt give it to me..so the world MO...,joy
299198,A man robbed me today .,anger
299199,"Youu call it JEALOUSY, I call it of #Losing YO...",fear


# **Processing data**

In [533]:
sw = stopwords.words('english')

def clean_text(text):
    
    text = text.lower()
    
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs 
    #text = re.sub(r"http", "",text)
    
    html=re.compile(r'<.*?>') 
    
    text = html.sub(r'',text) #Removing html tags
    
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations
        
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    
    text = " ".join(text) #removing stopwords
    
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis
    
    return text

In [534]:
def processor(df: pd.DataFrame) -> pd.DataFrame:

    df["text"] = df["text"].apply(lambda x: clean_text(x))
    df["label"] = df["label"].apply(lambda x: x.lower().strip())

    del_list = ['culinary adventure', 'culinaryodyssey', 'emotion', 'emotionalstorm', 
                'envisioning history', 'friendship', 'grandeur', 'hypnotic', 'iconic', 
                'imagination', 'immersion', 'innerjourney', 'journey', "nature's beauty", 
                'obstacle', "ocean's freedom", 'pressure', 'renewed effort', 'rejuvenation', 
                'reflection', 'radiance', 'ruins', 'runway creativity', 'thrilling journey', 
                'whispers of the past', 'winter magic', 'yearning',  'artisticburst',
                'celestial wonder', 'dreamchaser', 'elation', 'bittersweet', 'breakthrough', 'connection',
                'ambivalence', 'dazzle', 'free-spirited', 'freedom', 'freedom', 'surprise', 'suspense', 
                'vibrancy', 'whimsy']

    for i in del_list:

        df = df[df["label"] != i]

    df = df.reset_index().drop("index", axis=1)

    sim_dict = {
        'bitter': ['bitterness'], 'confidence': ['confident'], 'creativity': ['creativity inspiration'], 'despair': ['desperation'],
        'disappointment': ['disappointed'], 'fear': ['fearful'], 'frustration': ['frustrated'], 'happiness': ['happy'],
        'gratitude': ['grateful'], 'heartache': ['heartbreak'], 'hope': ['hopeful'], 'kindness': ['kind'],
        'joy': ['joy in baking', 'joyfulreunion', 'overjoyed'], 'positive': ['positivity'], 'pride': ['proud'],
        'sad': ['sadness'], 'wonder': ['wonderment'], 'compassion': ['compassionate'], 'creative': ['creative inspiration'],
        'envy': ['envious']
    }

    sim_dict = dict({(v, key) for key, value in sim_dict.items() for v in value})


    df["label"] = df["label"].apply(lambda x: sim_dict[x] if x in sim_dict.keys() else x)

    rep_dict = {
        "negative": [
            'anger', 'anxiety', 'apprehensive', 'bad', 'betrayal', 'bitter', 'boredom', 'contemplation', 'darkness',
            'desolation', 'despair', 'devastated', 'disappointment', 'disgust', 'dismissive', 'embarrassed', 'empty',
            'envy', 'exhaustion', 'fear', 'frustration', 'grief', 'hate', 'heartache', 'helplessness', 'intimidation',
            'isolation', 'jealous', 'jealousy', 'loneliness', 'loss', 'lostlove', 'mischievous', 'negative', 'numbness',
            'overwhelmed', 'regret', 'resentment', 'sad', 'shame', 'sorrow', 'suffering', 'worry', 
        ],
        "positive": [
            'acceptance', 'accomplishment','admiration', 'adoration', 'adrenaline', 'adventure', 'affection', 'amazement',
            'amusement', 'appreciation', 'arousal', 'awe', 'blessed', 'captivation', 'celebration', 'charm', 'colorful',
            'compassion', 'confidence', 'confusion', 'contentment', 'coziness', 'creative', 'creativity', 'ecstasy',
            'elegance', 'empathetic', 'enchantment', 'enjoyment', 'enthusiasm', 'euphoria', 'excitement', 'fulfillment',
            'fun', 'gratitude', 'happiness', 'harmony', 'heartwarming', 'hope', 'joy', 'kindness', 'love', 'marvel', 
            'mesmerizing', 'optimism', 'positive', 'pride', 'relief', 'reverence', 'romance', 'satisfaction', 'spark',
            'success', 'sympathy', 'tenderness', 'thrill', 'touched', 'triumph', 'wonder', 'zest', 'festivejoy',
            'inspiration', 'inspired', 'playfuljoy', 'playful',
        ],
        "neutral": [
            'anticipation', 'calmness', 'challenge', 'curiosity', 'determination', 'empowerment', 'energy', 'engagement',
            'exploration', 'indifference', 'intrigue', 'melancholy', 'melodic', 'mindfulness', 'miscalculation', 'motivation', 
            'neutral', 'nostalgia', 'pensive', 'resilience', 'serenity', 'solace', 'solitude', 'tranquility', 
        ]
    }
    
    rep_dict = dict({(v, key) for key, value in rep_dict.items() for v in value})

    df["label"] = df["label"].apply(lambda x: rep_dict[x])

    return df

In [535]:
df = processor(df)

In [536]:
df.to_csv("emo_dataset.csv", sep=";")

In [537]:
le = LabelEncoder()

le.fit(df["label"])
df["label"] = le.transform(df["label"])
# df["label"] = le_.inverse_transform(df["label"))

In [538]:
vectorizer = TfidfVectorizer(stop_words='english')

X = vectorizer.fit_transform(df['text'])

y = df['label']

In [539]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# **Creating a model**

## **Logistic Regression (multinominal model)**

In [540]:
mnm = LogisticRegression(
    tol = 0.0001,
    C = 0.35,
    fit_intercept = False,
    random_state = 42,
    solver = "sag",
    max_iter = 10000,
    n_jobs = -1,
)

mnm.fit(X_train, y_train)

In [541]:
y_pred = mnm.predict(X_test)

print(mnm.score(X_train, y_train))
print(mnm.score(X_test, y_test))

0.88139750462054
0.8443275641689986


In [542]:
joblib.dump(mnm, "emo_text_clf_mnm.sav")

['emo_text_clf_mnm.sav']

## **SGDClassifier**

In [543]:
sgdc = SGDClassifier(
    loss="hinge",
    penalty="l2",
    alpha=0.000005,
    max_iter=10000,
    tol=0.001,
    shuffle=True,
    n_jobs=-1,
    random_state=42,
    learning_rate="invscaling",
    eta0=10,
    power_t=0.4,
    validation_fraction=0.01,
    n_iter_no_change=1000
)

sgdc.fit(X_train, y_train)

In [544]:
y_pred = sgdc.predict(X_test)

print(sgdc.score(X_train, y_train))
print(sgdc.score(X_test, y_test))

0.9250522509658512
0.8555377068435417


In [545]:
joblib.dump(mnm, "emo_text_clf_sgdc.sav")

['emo_text_clf_sgdc.sav']

# **Predicting the category**

In [546]:
def clf_mnm(string: str) -> str:
    
    vector = vectorizer.transform([string])
    prediction = le.inverse_transform(mnm.predict(vector))

    return prediction[0]

In [547]:
def clf_sgdc(string: str) -> str:
    
    vector = vectorizer.transform([string])
    prediction = le.inverse_transform(sgdc.predict(vector))

    return prediction[0]