<a href="https://colab.research.google.com/github/desaraju02/ai_ml_dl/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


In [6]:
df = pd.read_csv('Emotion_Analysis.csv')
df

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear
...,...,...
5932,i begun to feel distressed for you,fear
5933,i left feeling annoyed and angry thinking that...,anger
5934,i were to ever get married i d have everything...,joy
5935,i feel reluctant in applying there because i w...,fear


In [7]:
df["Emotion"].value_counts()

Unnamed: 0_level_0,count
Emotion,Unnamed: 1_level_1
anger,2000
joy,2000
fear,1937


In [8]:
print(f"Comment: {df['Comment'][0]} -->{df['Emotion'][0]}")

Comment: i seriously hate one subject to death but now i feel reluctant to drop it -->fear


### Preprocessing of Data

In [9]:
nlp_model = spacy.load('en_core_web_sm')

In [10]:
sample_text = df['Comment'][3]

In [12]:
doc = nlp_model(sample_text)
doc

ive been really angry with r and i feel like an idiot for trusting him in the first place

In [13]:
for x in doc:
  print(x)

i
ve
been
really
angry
with
r
and
i
feel
like
an
idiot
for
trusting
him
in
the
first
place


Stopwords

In [14]:
for token in doc:
  if token.is_stop or token.is_punct:
    print(token)

i
been
really
with
and
i
an
for
him
in
the
first


Create preprocessing function

In [15]:
def preprocess(text):
  """Return `text` reduced to the lemmas of its content tokens.

  The text is parsed with the module-level spaCy pipeline `nlp_model`. Tokens
  flagged as stop words or punctuation are discarded; the lemma of every
  remaining token is kept, in original order, and the lemmas are joined with
  single spaces.
  """
  kept_lemmas = [
    tok.lemma_
    for tok in nlp_model(text)
    if not (tok.is_stop or tok.is_punct)
  ]
  return " ".join(kept_lemmas)

In [16]:
print(sample_text)

ive been really angry with r and i feel like an idiot for trusting him in the first place


In [17]:
preprocess(sample_text)

've angry r feel like idiot trust place'

Apply this function to the `Comment` column

In [18]:
df['preprocessed_comments'] = df['Comment'].apply(preprocess)

In [19]:
df.head()

Unnamed: 0,Comment,Emotion,preprocessed_comments
0,i seriously hate one subject to death but now ...,fear,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,anger,m life feel appalled
2,i sit here to write i start to dig out my feel...,fear,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,joy,ve angry r feel like idiot trust place
4,i feel suspicious if there is no one outside l...,fear,feel suspicious outside like rapture happen


### Encoding the target column

In [20]:
# Integer ids for the three emotion labels present in the dataset.
emotion_to_id = {'joy':0,'anger':2,'fear':1}
df['emotion_encoded'] = df['Emotion'].map(emotion_to_id)

# Series.map leaves NaN for any label missing from the mapping; fail loudly
# here instead of passing NaN targets on to the split and the models.
unmapped_labels = set(df.loc[df['emotion_encoded'].isna(), 'Emotion'])
assert not unmapped_labels, f"Unmapped emotion labels: {unmapped_labels}"

In [21]:
df.head()

Unnamed: 0,Comment,Emotion,preprocessed_comments,emotion_encoded
0,i seriously hate one subject to death but now ...,fear,seriously hate subject death feel reluctant drop,1
1,im so full of life i feel appalled,anger,m life feel appalled,2
2,i sit here to write i start to dig out my feel...,fear,sit write start dig feeling think afraid accep...,1
3,ive been really angry with r and i feel like a...,joy,ve angry r feel like idiot trust place,0
4,i feel suspicious if there is no one outside l...,fear,feel suspicious outside like rapture happen,1


In [22]:
# 80/20 split, stratified on the encoded label so both partitions keep the
# class proportions; the fixed seed makes the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(
    df['preprocessed_comments'],
    df['emotion_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=df['emotion_encoded'],
)

In [23]:
X_train.shape, X_test.shape

((4749,), (1188,))

### Convert the text into numerical vectors

In [24]:
vectorizer = TfidfVectorizer()

In [25]:
X_train_cv = vectorizer.fit_transform(X_train)
X_test_cv = vectorizer.transform(X_test)

In [26]:
print(X_train_cv.shape,X_test_cv.shape)

(4749, 6126) (1188, 6126)


In [31]:

print(type(X_train_cv))

<class 'scipy.sparse._csr.csr_matrix'>


### Model Building

In [None]:
### 4.1 Naive Bayes

In [None]:
### 4.2 Random Forest

In [32]:
naive_model = MultinomialNB()
naive_model.fit(X_train_cv,y_train)

In [33]:
naive_preds = naive_model.predict(X_test_cv)

In [34]:
accuracy_score(y_test,naive_preds)

0.9031986531986532

In [None]:
# prompt: print classification report in a tabular format

from sklearn.metrics import classification_report

print(classification_report(y_test, naive_preds))


In [36]:
print(classification_report(y_test,naive_preds))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89       400
           1       0.91      0.90      0.91       388
           2       0.90      0.92      0.91       400

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188



### Now Random Forest

In [37]:
# Seed the forest so its bootstrap samples and feature subsets -- and therefore
# the reported scores -- are the same on every run; 42 matches the seed used
# for train_test_split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_cv,y_train)

In [38]:
rf_preds = rf.predict(X_test_cv)

In [39]:
accuracy_score(y_test,rf_preds)

0.9292929292929293

In [41]:
print(classification_report(y_test,rf_preds))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       400
           1       0.92      0.93      0.92       388
           2       0.94      0.91      0.93       400

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



In [42]:
# prompt: tune hyper parameters of the above Random Forest model to improve the accuracy

from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter: 3 * 3 * 3 * 3 = 81 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# With cv=5 the search fits 81 * 5 = 405 forests (plus the final refit).
# n_jobs=-1 spreads those fits over all CPU cores instead of running them one
# after another, and verbose=1 reports progress while it runs.
# A fresh, seeded estimator keeps the search repeatable from run to run.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train_cv, y_train)

# Print the best hyperparameters and the best cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Evaluate the best model on the test set
best_rf_model = grid_search.best_estimator_
best_rf_preds = best_rf_model.predict(X_test_cv)
print(classification_report(y_test, best_rf_preds))


KeyboardInterrupt: 

# Testing the model

In [43]:
# NOTE(review): row 2000 is taken from the full dataframe, not from X_test, so
# it may be a comment the models were trained on — TODO confirm.
test_txt = df['Comment'][2000]

In [44]:
print(test_txt)

im looking good and feeling good other than this crappy cold im dealing with


In [46]:
# NOTE: this rebinds test_txt to its preprocessed form, so running this cell a
# second time would preprocess the already-processed string.
test_txt = preprocess(test_txt)
test_txt

'm look good feel good crappy cold m deal'

In [None]:
### Convert to vectors

In [47]:
test_txt_cv = vectorizer.transform([test_txt])

In [48]:
print(test_txt_cv)

  (0, 959)	0.34486145778167665
  (0, 1169)	0.51872211487043
  (0, 1289)	0.40368862009319334
  (0, 1976)	0.06933463954901763
  (0, 2296)	0.5924339259351207
  (0, 3181)	0.3053656687448782


In [49]:
test_txt_pred = rf.predict(test_txt_cv)

In [51]:
test_txt_pred, df['Emotion'][2000]

(array([0]), 'joy')