<a href="https://colab.research.google.com/github/elooo3/Masters-NLP---B620035/blob/main/anger%2C_joy%2C_fear%2C_sadness_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Request TensorFlow 2.x through the `%tensorflow_version` line magic
# (presumably only available on Colab - TODO confirm). Any exception raised
# while running the magic is deliberately ignored so the cell keeps going.
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [None]:
# Read the "isear" sheet of the workbook into a DataFrame.
dataset = pd.read_excel("Dataset.xlsx", sheet_name="isear")

In [None]:
# Collapse labels 5, 6 and 7 into the single class 0.
dataset["Label"] = dataset["Label"].replace(to_replace=[5, 6, 7], value=0)

In [None]:
# Show the frame as the cell output.
dataset

Unnamed: 0,Review,Emotion,Label
0,"During the period of falling in love, each tim...",joy,1
1,When I was involved in a traffic accident.,fear,2
2,When I was driving home after several days of...,anger,3
3,When I lost the person who meant the most to me.,sadness,4
4,The time I knocked a deer down - the sight of ...,disgust,0
...,...,...,...
7506,Two years back someone invited me to be the tu...,anger,3
7507,I had taken the responsibility to do something...,sadness,4
7508,I was at home and I heard a loud sound of spit...,disgust,0
7509,I did not do the homework that the teacher had...,shame,0


In [None]:
# Number of rows per label value.
dataset.Label.value_counts()

0    3176
1    1088
3    1084
2    1083
4    1080
Name: Label, dtype: int64

In [None]:
# Alias only: `train_data` refers to the same DataFrame object as `dataset`
# (no copy is made), so in-place changes to one are visible through the other.
train_data = dataset

In [None]:
# Show the frame as the cell output.
train_data

Unnamed: 0,Review,Emotion,Label
0,"During the period of falling in love, each tim...",joy,1
1,When I was involved in a traffic accident.,fear,2
2,When I was driving home after several days of...,anger,3
3,When I lost the person who meant the most to me.,sadness,4
4,The time I knocked a deer down - the sight of ...,disgust,0
...,...,...,...
7506,Two years back someone invited me to be the tu...,anger,3
7507,I had taken the responsibility to do something...,sadness,4
7508,I was at home and I heard a loud sound of spit...,disgust,0
7509,I did not do the homework that the teacher had...,shame,0


In [None]:
# Alias only: `data` refers to the same DataFrame object as `train_data`
# (no copy is made).
data = train_data

In [None]:
# Remove the textual "Emotion" column; the numeric "Label" column is what the
# rest of the notebook uses as the target.
# A new frame is derived from `train_data` instead of dropping in place:
# `data`, `train_data` and `dataset` were all the same object, so the in-place
# drop silently changed every one of them and raised KeyError when the cell was
# run a second time. This form can be re-run safely.
data = train_data.drop(columns=["Emotion"])

In [None]:
# Peek at the last five rows.
data.tail(5)

Unnamed: 0,Review,Label
7506,Two years back someone invited me to be the tu...,3
7507,I had taken the responsibility to do something...,4
7508,I was at home and I heard a loud sound of spit...,0
7509,I did not do the homework that the teacher had...,0
7510,I had shouted at my younger brother and he was...,0


In [None]:
# Target vector: the Label column as a NumPy array.
data_labels = data["Label"].to_numpy()

In [None]:
# Normalise every review:
#   1. replace each character that is not an ASCII letter or one of . ! ? '
#      (digits, commas, line breaks, ...) with a space;
#   2. lower-case the result.
# Iterating over the column itself avoids the hard-coded row count (7511) and
# the label-based lookup data['Review'][i], which only works while the index
# is exactly 0..N-1.
data_clean = [
    re.sub(r"[^a-zA-Z.!?']", ' ', review).lower()
    for review in data['Review']
]

In [None]:
# Preview a few cleaned sentences only; displaying the whole list writes every
# sentence into the notebook output.
data_clean[:5]

['during the period of falling in love  each time that we met and   especially when we had not met for a long time.',
 'when i was involved in a traffic accident.',
 'when i was driving home after  several days of hard work  there   was a motorist ahead of me who was driving at    km hour and   refused  despite his low speeed to let me overtake.',
 'when i lost the person who meant the most to me. ',
 "the time i knocked a deer down   the sight of the animal's   injuries and helplessness.  the realization that the animal was   so badly hurt that it had to be put down  and when the animal   screamed at the moment of death.",
 'when i did not speak the truth.',
 'when i caused problems for somebody because he could not keep the   appointed time and this led to various consequences.',
 'when i got a letter offering me the summer job that i had applied   for.  ',
 'when i was going home alone one night in paris and a man came up   behind me and asked me if i was not afraid to be out alone 

In [None]:
# Build a subword encoder from the cleaned corpus.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    data_clean,
    target_vocab_size=2**16,
)

# Encode each cleaned sentence into its list of subword ids.
data_inputs = list(map(tokenizer.encode, data_clean))

In [None]:
# Length of the longest encoded sentence; every sequence is padded up to it.
MAX_LEN = max(map(len, data_inputs))

# Pad at the end of each sequence ("post") with the id 0.
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    data_inputs,
    maxlen=MAX_LEN,
    padding="post",
    value=0,
)

In [None]:
# Hold out 1800 distinct rows for testing; everything else is used for training.
# The previous np.random.randint draw sampled WITH replacement, so the test set
# contained duplicate rows and fewer than 1800 rows were removed from the
# training set. A seeded draw without replacement gives an exact and
# reproducible split, and len(data_inputs) replaces the hard-coded 7511.
test_idx = np.random.default_rng(42).choice(len(data_inputs), size=1800, replace=False)
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)

In [None]:
class DCNN(tf.keras.Model):
    """Text classifier made of parallel 1-D convolutions over token embeddings.

    Token ids are embedded, sent through four Conv1D branches (kernel sizes
    2, 3, 4 and 5), each branch is global-max-pooled, and the pooled vectors
    are concatenated and passed through a dense layer, dropout and a final
    classification layer.

    Args:
        vocab_size: Number of entries in the embedding table.
        emb_dim: Size of each embedding vector.
        nb_filters: Number of filters in each convolution branch.
        FFN_units: Number of units in the hidden dense layer.
        nb_classes: Number of target classes. When it is 2 the output is a
            single sigmoid unit; otherwise it is a softmax layer.
        dropout_rate: Rate given to the dropout layer.
        training: Not used by the constructor; kept so existing calls that
            pass it keep working.
        name: Name given to the Keras model.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)

        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        # One convolution branch per n-gram width (2 to 5 tokens).
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.fivegram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=5,
                                      padding="valid",
                                      activation="relu")
        # The pooling layer has no trainable variables, so one instance is
        # shared by every branch.
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            # NOTE(review): the softmax has nb_classes + 1 units, one more
            # output than there are classes; with 0-based labels the last unit
            # is never a target. Left unchanged so the output shape stays the
            # same - confirm whether this is intentional.
            self.last_dense = layers.Dense(units=nb_classes + 1,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences fed to the embedding layer.
            training: Forwarded to the dropout layer so that dropout is only
                active while training.

        Returns:
            The output of the final dense layer.
        """
        x = self.embedding(inputs)
        branches = (self.bigram, self.trigram, self.fourgram, self.fivegram)
        pooled = [self.pool(conv(x)) for conv in branches]

        merged = tf.concat(pooled, axis=-1)  # (batch_size, 4 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        return self.last_dense(merged)

In [None]:
# Model hyper-parameters.
VOCAB_SIZE = tokenizer.vocab_size
EMB_DIM, NB_FILTERS, FFN_UNITS = 300, 60, 180
NB_CLASSES = len(set(train_labels))  # distinct labels present in the training split
DROPOUT_RATE = 0.1

# Training schedule.
BATCH_SIZE, NB_EPOCHS = 25, 10

In [None]:
# Number of distinct labels in the training split (the value used for NB_CLASSES).
len(set(train_labels))

5

In [None]:
# Build the model from the hyper-parameter constants.
Dcnn = DCNN(
    vocab_size=VOCAB_SIZE, emb_dim=EMB_DIM, nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS, nb_classes=NB_CLASSES, dropout_rate=DROPOUT_RATE,
)

In [None]:
# Two classes -> binary cross-entropy with plain accuracy; otherwise the sparse
# categorical loss and metric (labels are integer ids, not one-hot vectors).
Dcnn.compile(
    loss="binary_crossentropy" if NB_CLASSES == 2 else "sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"] if NB_CLASSES == 2 else ["sparse_categorical_accuracy"],
)

In [None]:
# Train on the training split; the returned History object is the cell output.
Dcnn.fit(x=train_inputs, y=train_labels, epochs=NB_EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f131fe02b10>

In [None]:
# Score the held-out rows; the result lists the loss followed by the compiled metric.
results = Dcnn.evaluate(x=test_inputs, y=test_labels, batch_size=BATCH_SIZE)
print(results)

[1.4262055158615112, 0.6549999713897705]


In [None]:
from sklearn.metrics import confusion_matrix

# Predicted class = index of the largest output per test row.
y_pred = Dcnn.predict(test_inputs)
y_pred_class = np.argmax(y_pred, axis=1)

# test_labels already holds one integer class id per row, so it is the ground
# truth as-is. The earlier np.argmax(test_labels, axis=0) reduced that 1-D
# vector to the position of its largest label, which is not a per-sample class
# (and was never used).
y_test_class = test_labels

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_test_class, y_pred_class)

array([[548,  44,  40,  86,  57],
       [ 31, 192,   7,  12,  15],
       [ 43,  13, 166,   6,  10],
       [112,  14,  11, 110,  19],
       [ 50,  23,  14,  14, 163]])

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_true=test_labels, y_pred=y_pred_class)
print(report)

              precision    recall  f1-score   support

           0       0.70      0.71      0.70       775
           1       0.67      0.75      0.71       257
           2       0.70      0.70      0.70       238
           3       0.48      0.41      0.45       266
           4       0.62      0.62      0.62       264

    accuracy                           0.66      1800
   macro avg       0.63      0.64      0.63      1800
weighted avg       0.65      0.66      0.65      1800



# Dealing with imbalance

In [None]:
# Undersampling method
from collections import Counter
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import ClusterCentroids

# Reduce every class to the size of the smallest one.
# voting="hard" makes ClusterCentroids keep the real sample nearest to each
# cluster centre. With the default voting on dense input the centres
# themselves are returned, i.e. averages of token ids, which are not valid
# encoded sentences. random_state fixes the clustering so the cell is
# reproducible.
# The unused `ns = NearMiss(0.8)` instance was removed: it was never fitted and
# recent imbalanced-learn versions reject the positional argument.
cc = ClusterCentroids(voting="hard", random_state=42)
X_ns, y_ns = cc.fit_resample(data_inputs, data_labels)

  self.estimator_.fit(X[y == target_class])
  self.estimator_.fit(X[y == target_class])


In [None]:
# Class sizes before and after resampling, one line each.
print(
    "The number of classes before fit {}".format(Counter(data_labels)),
    "The number of classes after fit {}".format(Counter(y_ns)),
    sep="\n",
)

The number of classes before fit Counter({0: 3176, 1: 1088, 3: 1084, 2: 1083, 4: 1080})
The number of classes after fit Counter({0: 1080, 1: 1080, 2: 1080, 3: 1080, 4: 1080})


In [None]:
# Train/test split of the resampled data: hold out 1350 distinct rows
# (25% of the 5400 resampled rows) for testing.
# The previous np.random.randint draw sampled WITH replacement, so the test set
# contained duplicate rows and fewer than 1350 rows were removed from the
# training set. A seeded draw without replacement gives an exact and
# reproducible split, and len(X_ns) replaces the hard-coded 5400.
test_idx = np.random.default_rng(42).choice(len(X_ns), size=1350, replace=False)
test_inputs = X_ns[test_idx]
test_labels = y_ns[test_idx]
train_inputs = np.delete(X_ns, test_idx, axis=0)
train_labels = np.delete(y_ns, test_idx)

In [None]:
# NOTE(review): DCNN is also defined in an earlier cell; this redefinition
# shadows it. Consider keeping a single definition.
class DCNN(tf.keras.Model):
    """Text classifier made of parallel 1-D convolutions over token embeddings.

    Token ids are embedded, sent through four Conv1D branches (kernel sizes
    2, 3, 4 and 5), each branch is global-max-pooled, and the pooled vectors
    are concatenated and passed through a dense layer, dropout and a final
    classification layer.

    Args:
        vocab_size: Number of entries in the embedding table.
        emb_dim: Size of each embedding vector.
        nb_filters: Number of filters in each convolution branch.
        FFN_units: Number of units in the hidden dense layer.
        nb_classes: Number of target classes. When it is 2 the output is a
            single sigmoid unit; otherwise it is a softmax layer.
        dropout_rate: Rate given to the dropout layer.
        training: Not used by the constructor; kept so existing calls that
            pass it keep working.
        name: Name given to the Keras model.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)

        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        # One convolution branch per n-gram width (2 to 5 tokens).
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.fivegram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=5,
                                      padding="valid",
                                      activation="relu")
        # The pooling layer has no trainable variables, so one instance is
        # shared by every branch.
        self.pool = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            # NOTE(review): the softmax has nb_classes + 1 units, one more
            # output than there are classes; with 0-based labels the last unit
            # is never a target. Left unchanged so the output shape stays the
            # same - confirm whether this is intentional.
            self.last_dense = layers.Dense(units=nb_classes + 1,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences fed to the embedding layer.
            training: Forwarded to the dropout layer so that dropout is only
                active while training.

        Returns:
            The output of the final dense layer.
        """
        x = self.embedding(inputs)
        branches = (self.bigram, self.trigram, self.fourgram, self.fivegram)
        pooled = [self.pool(conv(x)) for conv in branches]

        merged = tf.concat(pooled, axis=-1)  # (batch_size, 4 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        return self.last_dense(merged)

In [None]:
# Model hyper-parameters.
VOCAB_SIZE = tokenizer.vocab_size
EMB_DIM, NB_FILTERS, FFN_UNITS = 300, 60, 180
NB_CLASSES = len(set(train_labels))  # distinct labels present in the training split
DROPOUT_RATE = 0.1

# Training schedule.
BATCH_SIZE, NB_EPOCHS = 25, 10

In [None]:
# Number of distinct labels in the training split (the value used for NB_CLASSES).
len(set(train_labels))

5

In [None]:
# Build a fresh model from the hyper-parameter constants.
Dcnn = DCNN(
    vocab_size=VOCAB_SIZE, emb_dim=EMB_DIM, nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS, nb_classes=NB_CLASSES, dropout_rate=DROPOUT_RATE,
)

In [None]:
# Two classes -> binary cross-entropy with plain accuracy; otherwise the sparse
# categorical loss and metric (labels are integer ids, not one-hot vectors).
Dcnn.compile(
    loss="binary_crossentropy" if NB_CLASSES == 2 else "sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"] if NB_CLASSES == 2 else ["sparse_categorical_accuracy"],
)

In [None]:
# Train on the resampled training split; the returned History object is the cell output.
Dcnn.fit(x=train_inputs, y=train_labels, epochs=NB_EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f13041a37d0>

In [None]:
# Score the held-out rows; the result lists the loss followed by the compiled metric.
results = Dcnn.evaluate(x=test_inputs, y=test_labels, batch_size=BATCH_SIZE)
print(results)

[1.2731051445007324, 0.6903703808784485]


In [None]:
from sklearn.metrics import confusion_matrix

# Predicted class = index of the largest output per test row.
y_pred = Dcnn.predict(test_inputs)
y_pred_class = np.argmax(y_pred, axis=1)

# test_labels already holds one integer class id per row, so it is the ground
# truth as-is. The earlier np.argmax(test_labels, axis=0) reduced that 1-D
# vector to the position of its largest label, which is not a per-sample class
# (and was never used).
y_test_class = test_labels

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_test_class, y_pred_class)

array([[142,  14,  26,  59,  16],
       [  9, 235,  19,   9,  15],
       [ 11,  28, 198,  28,  12],
       [ 16,  20,  21, 191,  12],
       [  6,  33,  17,  47, 166]])

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_true=test_labels, y_pred=y_pred_class)
print(report)

              precision    recall  f1-score   support

           0       0.77      0.55      0.64       257
           1       0.71      0.82      0.76       287
           2       0.70      0.71      0.71       277
           3       0.57      0.73      0.64       260
           4       0.75      0.62      0.68       269

    accuracy                           0.69      1350
   macro avg       0.70      0.69      0.69      1350
weighted avg       0.70      0.69      0.69      1350

