<a href="https://colab.research.google.com/github/engineereliab076/my-projects/blob/main/EMOTION_CLASSIFIER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
from sklearn.metrics import classification_report

In [18]:
data = pd.read_csv('/content/Emotion_classify_Data.csv')

In [19]:
data.head(5)

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [20]:
data.describe()

Unnamed: 0,Comment,Emotion
count,5937,5937
unique,5934,3
top,i feel like a tortured artist when i talk to her,anger
freq,2,2000


In [21]:
data.shape

(5937, 2)

In [22]:
data['Emotion'].value_counts()

anger    2000
joy      2000
fear     1937
Name: Emotion, dtype: int64

In [23]:
data['Emotion_label'] = data['Emotion'].map({'anger':0, 'joy':1, 'fear':2})

In [24]:
data.head()

Unnamed: 0,Comment,Emotion,Emotion_label
0,i seriously hate one subject to death but now ...,fear,2
1,im so full of life i feel appalled,anger,0
2,i sit here to write i start to dig out my feel...,fear,2
3,ive been really angry with r and i feel like a...,joy,1
4,i feel suspicious if there is no one outside l...,fear,2


Train test split

In [25]:
x_train,x_test,y_train,y_test = train_test_split(
    data.Comment,data.Emotion_label,
    train_size=0.2,
    stratify=data.Emotion_label,
    random_state=42
    )

In [26]:
print(x_train.shape,x_test.shape)

(1187,) (4750,)


In [27]:
y_train.value_counts()


1    400
0    400
2    387
Name: Emotion_label, dtype: int64

MODEL TRAINING

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

1.RANDOM FOREST... cv = all trigrams

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
clf = Pipeline([
    ('vectorizer_tri_grams', CountVectorizer(ngram_range = (3, 3))),                       #using the ngram_range parameter
    ('random_forest', (RandomForestClassifier()))
])

clf.fit(x_train,y_train)

y_pred = clf.predict(x_test)

print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.07      0.47      0.13       247
           1       0.03      0.60      0.05        70
           2       0.95      0.33      0.49      4433

    accuracy                           0.34      4750
   macro avg       0.35      0.47      0.22      4750
weighted avg       0.89      0.34      0.47      4750



2.Multinomial naive bayes

In [31]:
from sklearn.naive_bayes import MultinomialNB

In [32]:
clf = Pipeline([
    ('vectorizer_bigrams', CountVectorizer(ngram_range = (1, 2))),        #using the ngram_range parameter
     ('Multi NB', MultinomialNB())
])

clf.fit(x_train,y_train)

y_pred = clf.predict(x_test)

print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.81      0.72      0.76      1797
           1       0.68      0.78      0.73      1385
           2       0.76      0.75      0.76      1568

    accuracy                           0.75      4750
   macro avg       0.75      0.75      0.75      4750
weighted avg       0.75      0.75      0.75      4750



3.RANDOM FOREST CLF WITH UNI AND BIGRAM CV

In [33]:
clf = Pipeline([
    ('vectorizer_bi_grams', CountVectorizer(ngram_range = (1, 2))),                       #using the ngram_range parameter
    ('random_forest', (RandomForestClassifier()))
])

clf.fit(x_train,y_train)

y_pred = clf.predict(x_test)

print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.69      0.89      0.78      1231
           1       0.92      0.64      0.76      2288
           2       0.70      0.88      0.78      1231

    accuracy                           0.77      4750
   macro avg       0.77      0.81      0.77      4750
weighted avg       0.80      0.77      0.77      4750



4.RANDOM FOREST CLF WITH TF-IDF VECTORIZER

In [34]:
clf = Pipeline([
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier())
])

clf.fit(x_train,y_train)

y_pred = clf.predict(x_test)

print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.72      0.80      0.76      1435
           1       0.84      0.69      0.76      1966
           2       0.75      0.87      0.81      1349

    accuracy                           0.77      4750
   macro avg       0.77      0.79      0.77      4750
weighted avg       0.78      0.77      0.77      4750



with the preprocessed data

In [35]:
# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm")

filtered_token=[]
def preprocess(text):
  doc = nlp(text)
  for token in doc:
    if token.is_stop or token.is_punct:
      continue
    filtered_token.append(token.lemma_)

  return ' '.join(filtered_token)


In [36]:
data['processed_comment'] = data['Comment'].apply(preprocess)

In [37]:
data.head()

Unnamed: 0,Comment,Emotion,Emotion_label,processed_comment
0,i seriously hate one subject to death but now ...,fear,2,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,anger,0,seriously hate subject death feel reluctant dr...
2,i sit here to write i start to dig out my feel...,fear,2,seriously hate subject death feel reluctant dr...
3,ive been really angry with r and i feel like a...,joy,1,seriously hate subject death feel reluctant dr...
4,i feel suspicious if there is no one outside l...,fear,2,seriously hate subject death feel reluctant dr...


In [38]:
data['Comment'][0]

'i seriously hate one subject to death but now i feel reluctant to drop it'

In [39]:
data['processed_comment'][0]

'seriously hate subject death feel reluctant drop'

train test split

In [40]:
x_train,x_test,y_train,y_test = train_test_split(
    data.processed_comment,
    data.Emotion_label,train_size=0.2,
    stratify=data.Emotion_label,
    random_state=2022
)

In [41]:
x_train.shape

(1187,)

In [42]:
y_train.value_counts()

1    400
0    400
2    387
Name: Emotion_label, dtype: int64

model training wc preprocessed data

1.RANDOM FOREST WC UNI AND BIGRAM CV

In [43]:
clf = Pipeline([
     ('vectorizer_bi_grams', CountVectorizer(ngram_range = (1, 2))),
     ('random_forest', (RandomForestClassifier()))
])

clf.fit(x_train,y_train)

y_pred = clf.predict(x_test)

print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.35      0.34      0.35      1624
           1       0.34      0.35      0.34      1557
           2       0.34      0.34      0.34      1569

    accuracy                           0.34      4750
   macro avg       0.34      0.34      0.34      4750
weighted avg       0.34      0.34      0.34      4750



2.RANDOM FOREST WC TF-IDF VECTORORIZER

In [44]:
clf = Pipeline([
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier())
])

clf.fit(x_train,y_train)

y_pred = clf.predict(x_test)

print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.35      0.35      0.35      1615
           1       0.35      0.35      0.35      1597
           2       0.34      0.35      0.35      1538

    accuracy                           0.35      4750
   macro avg       0.35      0.35      0.35      4750
weighted avg       0.35      0.35      0.35      4750

