In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Label encoding: sadness (0), joy (1), love (2), anger (3), fear (4)

df1 = pd.read_csv("train_emo.csv")
df2 = pd.read_csv("test_emo.csv")
# Each CSV is read with its own 0-based index, so a plain concat would produce
# duplicate row labels. ignore_index renumbers the stacked rows, and resetting
# the index after dropna removes the gaps left by dropped rows, so a row's
# index label always equals its position in arrays built with `.values`.
df = pd.concat([df1, df2], ignore_index=True)
df = df.dropna().reset_index(drop=True)
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [None]:
# Features are the raw texts; targets are the values of the "label" column.
x, y = df["text"].values, df["label"].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# Learn the bag-of-words vocabulary from the training split only, so nothing
# about the held-out texts leaks into the features. Words that occur only in
# x_test are simply ignored by transform().
vectorizer = CountVectorizer()
result_train = vectorizer.fit_transform(x_train)
result_test = vectorizer.transform(x_test)

In [None]:
# Fit a logistic-regression classifier on the training count matrix (fit()
# returns the estimator itself), then predict labels for the held-out matrix.
clf = LogisticRegression(max_iter=1000).fit(result_train, y_train)
y_prediction = clf.predict(result_test)

In [None]:
# Print each weighted-average metric as a percentage, one per line.
for metric_title, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
  print(metric_title, metric_fn(y_test, y_prediction, average="weighted") * 100, '%')

precision_score: 88.38214796331786 %
recall_score: 88.47222222222221 %
f1_score: 88.34924911631568 %


In [None]:
# Fetch the NLTK resources used by the preprocessing helpers below: the
# stop-word lists (read via stopwords.words("english")) and the WordNet data
# (used by WordNetLemmatizer).
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

In [None]:
wnl = WordNetLemmatizer()

def lemmatize_words(text):
  """Lemmatize every whitespace-separated token of `text`.

  Each token is passed to `wnl.lemmatize` with its default arguments and the
  results are joined back together with single spaces.
  """
  lemmas = [wnl.lemmatize(token) for token in text.split()]
  return " ".join(lemmas)

In [None]:
stops = set(stopwords.words("english"))

def delete_stopwords(text):
  """Return `text` without English stop words.

  The text is split on whitespace and the surviving tokens are joined with
  single spaces. Tokens are compared in lower case because the NLTK stop-word
  list is all lower case; without that, capitalised stop words such as "The"
  would be kept. Surviving tokens keep their original casing.
  """
  kept = [word for word in text.split() if word.lower() not in stops]
  return " ".join(kept)

In [None]:
punctuation = string.punctuation
# Translation table that deletes every character in `punctuation`. It is built
# once here instead of on every call, since the function runs once per row.
_PUNCTUATION_TABLE = str.maketrans("", "", punctuation)

def delete_punctuation(text):
  """Return `text` with every `string.punctuation` character removed.

  Characters are deleted, not replaced with spaces, so "feel-good" becomes
  "feelgood". All other characters, including whitespace, are left unchanged.
  """
  return text.translate(_PUNCTUATION_TABLE)

In [None]:
# Empty frame with the two target columns; the cleaned rows are added to it in
# the next cell.
new_df = pd.DataFrame(columns=["text", "label"])
new_df.head()

Unnamed: 0,text,label


In [None]:
# Clean every text in one pass: strip punctuation, drop stop words, then
# lemmatize what is left.
cleaned_texts = [
    lemmatize_words(delete_stopwords(delete_punctuation(text)))
    for text in df["text"]
]

# Build the frame once instead of concatenating one row at a time (which
# copies the whole frame on every iteration). Each label is taken from the same
# row as its text; looking it up as y[index_label] picks the wrong row whenever
# the index has duplicates or gaps (as after concat + dropna).
new_df = pd.DataFrame(
    {"text": cleaned_texts, "label": df["label"].to_numpy()},
    index=df.index,
)

In [None]:
# Preview the first cleaned rows to check the preprocessing output.
new_df.head()

Unnamed: 0,text,label
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,3
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,3


In [None]:
# Features are the cleaned texts; the labels are cast to int explicitly because
# the "label" column of new_df is not guaranteed to have an integer dtype.
x, y = new_df["text"].values, new_df["label"].values.astype("int")

# Same 80/20 split and seed as for the raw texts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# Learn the bag-of-words vocabulary from the training split only, so nothing
# about the held-out texts leaks into the features. Words that occur only in
# x_test are simply ignored by transform().
vectorizer = CountVectorizer()
result_train = vectorizer.fit_transform(x_train)
result_test = vectorizer.transform(x_test)

In [None]:
# Fit a fresh logistic-regression classifier on the cleaned-text count matrix
# (fit() returns the estimator itself), then predict the held-out labels.
clf = LogisticRegression(max_iter=1000).fit(result_train, y_train)
y_prediction = clf.predict(result_test)

In [None]:
# Print each weighted-average metric as a percentage, one per line.
for metric_title, metric_fn in (
    ("precision_score:", precision_score),
    ("recall_score:", recall_score),
    ("f1_score:", f1_score),
):
  print(metric_title, metric_fn(y_test, y_prediction, average="weighted") * 100, '%')

precision_score: 89.11029897416512 %
recall_score: 89.13888888888889 %
f1_score: 89.09318350606806 %
