In [None]:
# dl packages
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# ml packages
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import pickle
import nltk
import re
from nltk.stem import PorterStemmer

import seaborn as sns
import matplotlib.pyplot as plt

from wordcloud import WordCloud

In [None]:
# Load the training split: semicolon-separated text with no header row,
# so the two column names are supplied explicitly.
train_data = pd.read_csv(
    "train.txt",
    sep=";",
    header=None,
    names=["Comment", "Emotion"],
    encoding="utf-8",
)
# Character count of every comment (len() of the string, not a word count).
train_data['length'] = train_data['Comment'].map(len)

In [None]:
# Display the loaded frame: Comment, Emotion and the derived length column.
train_data

Unnamed: 0,Comment,Emotion,length
0,i didnt feel humiliated,sadness,23
1,i can go from feeling so hopeless to so damned...,sadness,108
2,im grabbing a minute to post i feel greedy wrong,anger,48
3,i am ever feeling nostalgic about the fireplac...,love,92
4,i am feeling grouchy,anger,20
...,...,...,...
15995,i just had a very brief time in the beanbag an...,sadness,101
15996,i am now turning and i feel pathetic that i am...,sadness,102
15997,i feel strong and good overall,joy,30
15998,i feel like this was such a rude comment and i...,anger,59


## Text cleaning, Encoding and Padding

In [None]:
# Longest comment in the training data, measured in characters.
max(map(len, train_data['Comment']))

300

In [None]:
# Shortest comment in the training data, measured in characters.
min(map(len, train_data['Comment']))

7

- We compute these to see how long and how short a comment can be.
- The longest comment has 300 characters, which is why we use max_length=300 below.

In [None]:
# Fetch the NLTK stop-word corpus, then keep the English list as a set
# for fast membership checks while cleaning text.
nltk.download('stopwords')
english_stopword_list = nltk.corpus.stopwords.words('english')
stopwords = set(english_stopword_list)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


- **Instead of TfidfVectorizer, we will use the one_hot encoder here.**

#### Understanding one_hot

In [None]:
# Three short sentences merged into one string, then hashed into a 50-slot space.
demo_sentences = ["i love myself", "I hate you", "We love ourselves"]
text = " ".join(demo_sentences)

one_hot(input_text=text, n=50)

[33, 16, 21, 33, 36, 21, 35, 16, 7]

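- Every word is mapped to a number up to 50 (the value of `n`).
- The same word always gets the same number: "love" is 16 both times it appears, and "i" / "I" is 33 both times.


A small `n` is not recommended: as the next example with `n=3` shows, many different words end up sharing the same number.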

In [None]:
# Same sentences hashed into only 3 slots, to show how different words collide.
demo_sentences = ["i love myself", "I hate you", "We love ourselves"]
text = " ".join(demo_sentences)

one_hot(input_text=text, n=3)

[1, 1, 2, 1, 1, 2, 2, 1, 2]

## Function

In [None]:
def clean_text(df, column, vocab_size, max_len, hash_function=None):
  """Clean, stem, hash-encode and pad the text held in one DataFrame column.

  Every entry is stripped of non-letter characters, lower-cased, split on
  whitespace, filtered against the module-level ``stopwords`` set, stemmed
  with the Porter stemmer and joined back into a single string. The cleaned
  strings are then converted to integer sequences and padded on the left to
  a fixed length.

  Args:
    df: DataFrame that holds the text.
    column: Name of the text column in ``df``.
    vocab_size: Size of the hashing space handed to the encoder as ``n``.
    max_len: Length every sequence is padded (or truncated) to.
    hash_function: Optional hashing function forwarded to Keras'
      ``hashing_trick`` (for example ``'md5'``). ``None`` (the default) keeps
      the original behaviour and encodes with ``one_hot``.
      NOTE: ``one_hot`` relies on Python's built-in ``hash``, which is
      randomised per interpreter process, so the default encoding is only
      consistent inside a single kernel session. Use a stable hash such as
      ``'md5'`` for BOTH training and inference if the model must be reused
      after a restart.

  Returns:
    The padded integer array produced by ``pad_sequences``: one row per
    entry of ``df[column]``, ``max_len`` columns.
  """
  stemmer=PorterStemmer()
  stem_cache={}

  def stem(word):
    # The same words recur across thousands of comments; stem each only once.
    if word not in stem_cache:
      stem_cache[word]=stemmer.stem(word)
    return stem_cache[word]

  corpus=[]
  for text in df[column]:
    # Missing values (e.g. NaN from read_csv) are not strings and would make
    # re.sub raise; treat them as an empty comment so rows stay aligned.
    if not isinstance(text, str):
      text=""
    text=re.sub(r"[^a-zA-Z]", " ",text)
    words=text.lower().split()
    corpus.append(" ".join(stem(w) for w in words if w not in stopwords))

  if hash_function is None:
    encoded=[one_hot(input_text=doc, n=vocab_size) for doc in corpus]
  else:
    # Imported locally so the notebook's import cell does not need to change.
    from keras.preprocessing.text import hashing_trick
    encoded=[hashing_trick(doc, vocab_size, hash_function=hash_function) for doc in corpus]

  # padding you can use pre or post
  pad=pad_sequences(sequences=encoded, maxlen=max_len, padding='pre')
  return pad


# Encode the training comments: 11,000-slot hashing space, sequences padded to length 300.
X_train = clean_text(
    df=train_data,
    column='Comment',
    vocab_size=11000,
    max_len=300,
)

In [None]:
# Sanity check: one row per comment, max_len columns.
X_train.shape

(16000, 300)

In [None]:
# Each row is one comment encoded as 300 integer indices, left-padded with zeros.
X_train

array([[    0,     0,     0, ...,  8969,  6569, 10746],
       [    0,     0,     0, ..., 10796,  9490,  9119],
       [    0,     0,     0, ...,  6569,  9912,  5826],
       ...,
       [    0,     0,     0, ..., 10614,  7566,  1041],
       [    0,     0,     0, ...,  1582,  3595,  9800],
       [    0,     0,     0, ...,  6569,   454,   478]], dtype=int32)

### Converting emotion into numerical data

In [None]:
# Map the emotion names to integer class ids. `lb` is kept so predictions can
# later be mapped back to names with lb.inverse_transform.
# NOTE: this overwrites the Emotion column in place, so run this cell only once
# per load; re-running it would re-fit the encoder on the integer ids.
lb = LabelEncoder()
emotion_ids = lb.fit_transform(train_data['Emotion'])
train_data['Emotion'] = emotion_ids

In [None]:
# The Emotion column now holds the integer class ids written by the LabelEncoder cell.
train_data['Emotion']

0        4
1        4
2        0
3        3
4        0
        ..
15995    4
15996    4
15997    2
15998    0
15999    4
Name: Emotion, Length: 16000, dtype: int64

In [None]:
# Display the frame again, now with the integer-encoded Emotion column.
train_data

Unnamed: 0,Comment,Emotion,length
0,i didnt feel humiliated,4,23
1,i can go from feeling so hopeless to so damned...,4,108
2,im grabbing a minute to post i feel greedy wrong,0,48
3,i am ever feeling nostalgic about the fireplac...,3,92
4,i am feeling grouchy,0,20
...,...,...,...
15995,i just had a very brief time in the beanbag an...,4,101
15996,i am now turning and i feel pathetic that i am...,4,102
15997,i feel strong and good overall,2,30
15998,i feel like this was such a rude comment and i...,0,59


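#### Convert class vector to binary class matrix
- `to_categorical` turns each integer label into a row with one column per class.
- As you can see above, the first comment is sadness, which is encoded as 4, so its row has a 1 at index 4 and zeros elsewhere.
- The second comment is also sadness, so its row is the same as the first.
- The third comment is different (anger, encoded as 0), so its row has a 1 in the first position and zeros everywhere else.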

In [None]:
# Preview the one-hot rows produced from the integer emotion ids.
to_categorical(train_data['Emotion'])

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.]], dtype=float32)

In [None]:
# Training targets: one one-hot row per comment, built from the integer emotion ids.
encoded_emotions = train_data['Emotion']
y_train = to_categorical(encoded_emotions)

## Model Building and Training

- you have to give output_dim based on the data.
- input_dim is vocab_size.
- input_length is max_length

In [None]:
# Embedding -> LSTM -> dense hidden layer -> softmax over the 6 emotion classes,
# with 20% dropout after each stage.
model = Sequential([
    # input layer: input_dim matches the vocab_size used for encoding,
    # input_length matches the padded sequence length
    Embedding(input_dim=11000, output_dim=150, input_length=300),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),

    # hidden layer
    Dense(64, activation='sigmoid'),
    Dropout(0.2),

    # output layer: one probability per emotion class
    Dense(6, activation='softmax'),
])

#### It is LSTM layer

- Embedding --> It adds an embedding layer, which converts the input sequence of word indices into dense vectors.
- Dropout --> helps to avoid overfitting
- LSTM --> Long Short Term Memory

In [None]:
# Categorical cross-entropy pairs with the one-hot targets and the softmax output.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 passes over the data in mini-batches of 64, with per-epoch progress output.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c5c902c2e90>

## Predictive System

In [None]:
def pred(text, vocab_size=11000, max_len=300):
  """Turn one raw sentence into a padded integer sequence for the model.

  Reuses ``clean_text`` instead of repeating its cleaning, stemming, encoding
  and padding steps, so inference always goes through exactly the same
  pipeline as the training data.

  Args:
    text: The raw sentence to encode.
    vocab_size: Size of the hashing space; must equal the value used when the
      training data was encoded (11000 in this notebook).
    max_len: Padded sequence length; must equal the value used for training
      (300 in this notebook).

  Returns:
    The padded array returned by ``clean_text`` for a single-row input,
    i.e. one row of ``max_len`` integer indices.
  """
  # clean_text works on a DataFrame column, so wrap the sentence in a one-row frame.
  single_row=pd.DataFrame({'Comment': [text]})
  return clean_text(single_row, 'Comment', vocab_size=vocab_size, max_len=max_len)



sentences=[
    'I hates mangoes',
    'I Feel strong and good overall',
    'you are sweet and caring'
]

for sent in sentences:
  cleaned_sent =pred(sent)

  # Run the model once per sentence and reuse the probabilities for both the
  # predicted class and its score.
  probabilities=model.predict(cleaned_sent)

  # Highest-probability class id, mapped back to its emotion name.
  emotion=lb.inverse_transform(np.argmax(probabilities, axis=-1))[0]
  # `label` is the probability the model assigned to that class.
  label=np.max(probabilities)

  print(emotion)
  print(label)
  print("================================")

anger
0.6670256
joy
0.9952656
love
0.81134474


## Save the model

In [None]:
# Save the trained model as a single HDF5 file. The previous '.h56' suffix was a
# typo: it is not a recognised extension, so it did not select the HDF5 format.
model.save('model.h5')