In [1]:
import pandas as pd
import numpy as np

In [2]:
import re
import nltk
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

In [3]:
df=pd.read_csv('Emotion_classify_Data.csv')

In [4]:
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [17]:
df.shape

(5937, 3)

In [18]:
df['Emotion'].value_counts()

Emotion
0    2000
1    2000
2    1937
Name: count, dtype: int64

In [19]:
#Binary Encoding Categorical to Numeric of Gender and Ever_Married
label_encode={"Emotion": {"anger":0,"joy" :1,"fear":2}}
df=df.replace(label_encode)


In [20]:
df.head()

Unnamed: 0,Comment,Emotion,clean_text
0,i seriously hate one subject to death but now ...,2,i seriously hate one subject to death but now ...
1,im so full of life i feel appalled,0,im so full of life i feel appalled
2,i sit here to write i start to dig out my feel...,2,i sit here to write i start to dig out my feel...
3,ive been really angry with r and i feel like a...,1,ive been really angry with r and i feel like a...
4,i feel suspicious if there is no one outside l...,2,i feel suspicious if there is no one outside l...


In [21]:
def cleansing(df):
    df_clean=df.str.lower()
    df_clean=[re.sub(r"\d+","",i )for i in df_clean]
    df_clean=[re.sub(r'[^\w]', ' ', i)for i in df_clean]
    df_clean=[re.sub(r'\s+',' ',i)for i in df_clean]
    return df_clean
    

In [22]:
df['clean_text']=cleansing(df['Comment'])

In [23]:
df.head()

Unnamed: 0,Comment,Emotion,clean_text
0,i seriously hate one subject to death but now ...,2,i seriously hate one subject to death but now ...
1,im so full of life i feel appalled,0,im so full of life i feel appalled
2,i sit here to write i start to dig out my feel...,2,i sit here to write i start to dig out my feel...
3,ive been really angry with r and i feel like a...,1,ive been really angry with r and i feel like a...
4,i feel suspicious if there is no one outside l...,2,i feel suspicious if there is no one outside l...


In [24]:
#check maximum length of word in sentence 
max_sen = df['clean_text'].str.split().str.len().max()

In [25]:
max_sen

64

**Split the Data**

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

In [27]:
x_train, x_test, y_train, y_test = train_test_split(df['clean_text'], df['Emotion'], test_size = 0.2, random_state = 42,stratify=df['Emotion'])

In [28]:
len(x_train)

4749

In [29]:
 x_train.str.split().str.len().max()

64

**Remove Stopword and Tokenization**

In [30]:
#Define Stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
# Get a list of stop words in the English language
stop_words = set(stopwords.words('english'))

# Display the top 20 stop words
list(stop_words)[:20]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liliayu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['most',
 'very',
 "aren't",
 "wouldn't",
 'have',
 "she's",
 'needn',
 "wasn't",
 'on',
 'been',
 'few',
 'of',
 "don't",
 'to',
 "couldn't",
 'had',
 'm',
 've',
 'under',
 'just']

In [31]:
#tokenization
word_token=[word_tokenize(i) for i in x_train]

In [32]:
word_token

[['i',
  'fear',
  'that',
  'other',
  'people',
  'ask',
  'me',
  'about',
  'my',
  'feelings',
  'i',
  'am',
  'most',
  'reluctant',
  'to',
  'talk',
  'about',
  'things'],
 ['ive',
  'been',
  'thinking',
  'about',
  'what',
  'it',
  'is',
  'that',
  'drives',
  'me',
  'not',
  'only',
  'with',
  'fashion',
  'as',
  'pretentious',
  'as',
  'this',
  'is',
  'gon',
  'na',
  'make',
  'me',
  'sound',
  'i',
  'am',
  'studying',
  'fashion',
  'design',
  'so',
  'i',
  'do',
  'feel',
  'its',
  'kinda',
  'vital',
  'to',
  'understand',
  'what',
  'im',
  'trying',
  'to',
  'do',
  'there',
  'but',
  'in',
  'life',
  'as',
  'a',
  'whole'],
 ['i', 'love', 'comments', 'so', 'feel', 'free'],
 ['i',
  'feel',
  'irritable',
  'like',
  'no',
  'other',
  'and',
  'running',
  'will',
  'def',
  'cure',
  'that'],
 ['i',
  'dont',
  'have',
  'a',
  'yeast',
  'infection',
  'in',
  'the',
  'vagina',
  'i',
  'could',
  'be',
  'feeling',
  'irritated',
  'by',
  

In [33]:
len(word_token)

4749

In [34]:
# Remove stopwords from each sublist in word_token
filtered_tokens_train = [[word for word in sublist if word not in stop_words] for sublist in word_token]

# Display the first sublist of filtered tokens after removing stopwords
print(filtered_tokens_train[0])

['fear', 'people', 'ask', 'feelings', 'reluctant', 'talk', 'things']


In [35]:
print(filtered_tokens_train[1])

['ive', 'thinking', 'drives', 'fashion', 'pretentious', 'gon', 'na', 'make', 'sound', 'studying', 'fashion', 'design', 'feel', 'kinda', 'vital', 'understand', 'im', 'trying', 'life', 'whole']


**Vectorization Word2Vec - Skipgram**

In [36]:
import gensim
from gensim.models import Word2Vec 
model_skipgram = gensim.models.Word2Vec(filtered_tokens_train, min_count = 1,vector_size = 100, window = 3, sg=1) 

In [37]:
vocabulary_skipgram = model_skipgram.wv.index_to_key
word_vec_dict={}
for word in vocabulary_skipgram:
    word_vec_dict[word]=model_skipgram.wv.get_vector(word)
print("The no of key-value pairs : ",len(word_vec_dict)) # should come equal to vocab size
  

The no of key-value pairs :  7769


In [38]:
word_vec_dict

{'feel': array([-0.09020506,  0.21579523,  0.10908302,  0.13433829,  0.0090368 ,
        -0.31923527,  0.19597046,  0.43997312, -0.18532445, -0.28009373,
         0.0292987 , -0.33271557, -0.01520094,  0.10062836,  0.08899105,
        -0.1582728 ,  0.12101362, -0.23911166, -0.07025401, -0.49100935,
         0.15693381,  0.03506279,  0.26180148, -0.05387862, -0.08609644,
        -0.08325439, -0.13355212, -0.04143945, -0.25806916,  0.02062106,
         0.15884927,  0.02198538,  0.03363397, -0.38116726, -0.21337089,
         0.23273526,  0.12432735, -0.19992536, -0.21426883, -0.31394154,
         0.04396219, -0.23854531, -0.11652745, -0.06884799,  0.28156453,
        -0.10782237, -0.15000373,  0.01009492,  0.13594715,  0.2142343 ,
         0.17312214, -0.21432966, -0.14749111, -0.05102328, -0.19565779,
         0.17793283,  0.11327071, -0.11855095, -0.256775  ,  0.25534105,
         0.07638612,  0.05286911,  0.04689517, -0.09358364, -0.27271143,
         0.16877548, -0.0257697 ,  0.234714

In [39]:
max_sen_len= max_sen # max lenght of word in a sentence
vocab_size =35000  #ideally it should be len(tok.word_index) + 1  or total no of words in data in this case = 4750, but to handle number of data which not appear in train, for example in test, make the size higher
embed_dim=100 # embedding dimension as choosen in word2vec constructor

In [40]:
from keras.preprocessing.text import one_hot,Tokenizer
tok = Tokenizer()
tok.fit_on_texts(filtered_tokens_train)
vocab_size = vocab_size
encd_rev = tok.texts_to_sequences(filtered_tokens_train)

2024-02-01 14:49:08.689733: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [41]:
# now creating the embedding matrix
embed_matrix=np.zeros(shape=(vocab_size,embed_dim))
for word,i in tok.word_index.items():
    embed_vector=word_vec_dict.get(word) #mapping the vector to word in our skipgram dictionary
    if embed_vector is not None:  # word is in the vocabulary learned by the w2v model
        embed_matrix[i]=embed_vector
  # if word is not found then embed_vector corressponding to that vector will stay zero.

**Preparing the Data for Embedding Layer**

In [42]:
encd_rev

[[566, 10, 281, 49, 117, 190, 25],
 [18,
  178,
  2211,
  1209,
  3386,
  1056,
  377,
  23,
  713,
  1388,
  1209,
  1210,
  1,
  567,
  602,
  228,
  4,
  82,
  26,
  126],
 [31, 603, 1, 207],
 [1, 104, 3, 378, 3387, 2212],
 [20,
  2213,
  2214,
  3388,
  22,
  2,
  155,
  2213,
  532,
  1693,
  111,
  456,
  714,
  2215,
  3389],
 [1, 7, 179, 30],
 [8, 61, 86, 3390, 764, 1, 3, 512, 1, 765, 165, 1694, 44, 1211, 2216],
 [2, 69, 21, 1695, 1389],
 [1, 4, 140, 282],
 [1, 3, 133, 16, 3391, 3392, 2217, 21, 312, 350, 2218, 2219],
 [1, 3, 4, 208, 2220, 89, 66, 568],
 [1,
  98,
  16,
  74,
  31,
  16,
  74,
  407,
  434,
  1696,
  2221,
  13,
  651,
  192,
  3393,
  1697,
  2222,
  243,
  766,
  6,
  294,
  1698,
  84,
  341,
  1390],
 [2, 58, 90],
 [8, 1699, 2, 295, 166],
 [18, 2, 7, 715, 283, 13, 180, 313, 533, 3394],
 [1, 58, 109, 652, 604, 605, 1700, 314, 379, 4, 2223],
 [13, 569, 220, 487, 1212, 1, 3, 221, 172, 3395, 1391],
 [2, 101, 342, 35, 435, 181, 33],
 [1,
  767,
  1701,
  408,
  1

In [43]:
vocab_size

35000

In [44]:
from keras.preprocessing.sequence import pad_sequences
# now padding to have a maximum length of 64
pad_rev= pad_sequences(encd_rev, maxlen=max_sen_len, padding='post')
pad_rev.shape   # note that we had 4749 data and we have padded each review to have  a lenght of 64 words.

(4749, 64)

In [45]:
pad_rev

array([[ 566,   10,  281, ...,    0,    0,    0],
       [  18,  178, 2211, ...,    0,    0,    0],
       [  31,  603,    1, ...,    0,    0,    0],
       ...,
       [ 693,  640,  151, ...,    0,    0,    0],
       [   1,    3, 7767, ...,    0,    0,    0],
       [  18,    2,  198, ...,    0,    0,    0]], dtype=int32)

**Classification**

In [46]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten

In [1]:
model = keras.Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=max_sen_len,embeddings_initializer=Constant(embed_matrix)))
model.add(layers.LSTM(64, input_shape=(None, 28)))
model.add(layers.BatchNormalization())
model.add(layers.Dense(3))
print(model.summary())

NameError: name 'keras' is not defined

In [48]:
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer="sgd",
    metrics=["accuracy"],
)

In [49]:
model.fit(pad_rev, y_train, batch_size=2, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x168294710>

In [50]:
4749/2


2374.5

**Testing**

**Preparing  Test Data for Embedding Layer**

In [51]:
# Tokenize the words in the testing text data
word_token_test =[word_tokenize(i) for i in x_test]

In [52]:
word_token_test[0]

['i',
 'sometimes',
 'feel',
 'shy',
 'about',
 'my',
 'musical',
 'taste',
 'because',
 'some',
 'of',
 'it',
 'wanders',
 'towards',
 'what',
 'some',
 'might',
 'call',
 'techno',
 'slander']

In [53]:
# Remove stopwords from each sublist in word_token_test
filtered_tokens_test = [[word for word in sublist if word not in stop_words] for sublist in word_token_test]
# Display the first sublist of filtered tokens after removing stopwords
print(filtered_tokens_test[0])

['sometimes', 'feel', 'shy', 'musical', 'taste', 'wanders', 'towards', 'might', 'call', 'techno', 'slander']


In [54]:
tok.fit_on_texts(filtered_tokens_test)
encd_rev_test = tok.texts_to_sequences(filtered_tokens_test)

In [55]:
encd_rev_test

[[75, 1, 191, 2594, 1372, 7846, 229, 141, 289, 7847, 7848],
 [7849,
  3383,
  781,
  205,
  594,
  7850,
  1394,
  90,
  68,
  8,
  94,
  1680,
  26,
  1100,
  7851,
  140,
  299,
  1717,
  2008,
  41,
  5,
  593],
 [58, 1, 3, 1383, 266, 425, 150, 1175, 15, 2086, 3810],
 [2, 227, 659, 505, 91, 7852],
 [333, 2, 548, 336, 2548, 7853, 89, 2373, 7854, 2558, 7855, 3828, 1, 296],
 [6, 2545, 7856, 105, 8, 1, 192, 279, 2463, 419, 626],
 [893, 1655, 1, 607, 367, 434, 7857, 585, 7858, 1655],
 [311, 1, 714],
 [1, 37, 38, 1191, 54, 141, 882, 1, 217, 1488, 706, 49, 477, 835],
 [573, 2, 5, 511, 44, 884],
 [18, 2, 157, 108, 7859, 1802, 18, 1608, 33],
 [1, 1059, 1644, 283, 179],
 [1, 631, 353, 883, 214],
 [1, 134],
 [393, 1256, 273, 70, 2, 60, 104],
 [7860, 7861, 221, 25, 63, 1648, 1346, 7862, 53, 71, 1798, 1648, 1, 153],
 [1, 13, 210, 141, 2, 132, 1789, 27],
 [4, 432, 7863, 7864, 1, 4, 7865, 23, 303, 40],
 [1, 1015, 507, 1837, 681],
 [2, 520, 832],
 [4, 64, 1162, 2, 108],
 [2,
  293,
  108,
  172,
  

In [56]:
from keras.preprocessing.sequence import pad_sequences
# now padding to have a maximum length of 64
pad_rev_test= pad_sequences(encd_rev_test, maxlen=max_sen_len, padding='post')
pad_rev_test.shape   # note that we had 10 testing data and we have padded each review to have  a lenght of 64 words.

(1188, 64)

In [57]:
pad_rev_test

array([[  75,    1,  191, ...,    0,    0,    0],
       [7849, 3383,  781, ...,    0,    0,    0],
       [  58,    1,    3, ...,    0,    0,    0],
       ...,
       [   1, 2485,   72, ...,    0,    0,    0],
       [   1,    3,  232, ...,    0,    0,    0],
       [  43,   47,    2, ...,    0,    0,    0]], dtype=int32)

In [58]:
test_predict=model.predict(pad_rev_test)
classe_test=np.argmax(test_predict,axis=1)




In [59]:
classe_test

array([0, 0, 0, ..., 0, 0, 0])

In [60]:
print('\nClassification Report\n')
print(classification_report(y_test, classe_test))


Classification Report

              precision    recall  f1-score   support

           0       0.34      1.00      0.50       400
           1       0.00      0.00      0.00       400
           2       0.00      0.00      0.00       388

    accuracy                           0.34      1188
   macro avg       0.11      0.33      0.17      1188
weighted avg       0.11      0.34      0.17      1188

