# Installation and Imports

The first thing I'm goint to do is to install the packages that i I need

In [None]:
!pip install keras-self-attention
!pip install tensorflow_addons
!pip install wget



Then I Import all the main libraries

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import wget
import wget
import nltk
from tensorflow import keras
import tensorflow_addons as tfa
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('punkt')
nltk.download('stopwords')

# Import of the Datasets

In [None]:
test_set= pd.read_csv('test_ekmann.csv')
train_set= pd.read_csv('train_ekmann.csv')
val_set= pd.read_csv('val_ekmann.csv')

# Pre-Processing

In this case we have in input text sentences.

I adopted a basic strategy, so in order to clean as much as I could the dataset, I did the following operations:

- I edited everything in lower case
- I removed all the emoji 
- I removed the punctuation (except question marks and exclamation points)
- I replaced the question marks with the word "question" and the exclamation points with "esclamation"
- I removed all the numbers
- I removed all the stopwords 

Initially I also tried using stemming and lemming techniques but they worsened the performance of the model, so I removed them

## Preprocess of text

In [4]:
def preprocess_text(df:pd.DataFrame) -> pd.DataFrame:

  '''
  Preprocess function for the input text, it takes
  in input the original dataframe and manipulate it 
  into a cleaner dataframe

  input: original dataframe
  output: processed dataframe
  '''

  # edit in lower case
  df['Text']=df['Text'].str.lower()

  # remove emoji
  emoji_filter = lambda c: ord(c) < 256
  df['Text'] = df['Text'].apply(lambda s: ''.join(filter(emoji_filter, s)))

  # remove punctuation (except "?" and "!" )
  train_set['Text'] = train_set['Text'].str.replace(r'[^\w\s\?\!]+', '')

  # replace "?" with 'question' and "!" with 'esclamation'
  df['Text'] = df['Text'].str.replace('?',' question')
  df['Text'] = df['Text'].str.replace('!',' esclamation') 

  # remove numbers
  df['Text'] = df['Text'].str.replace('\d+', '')

  # remove stopwords
  stop_words = set(stopwords.words('english'))
  df['Text'] = df['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

  return df

In [None]:
train_df=preprocess_text(train_df)
val_df=preprocess_text(val_df)
test_df=preprocess_text(test_df)

## Preprocessing of labels

For the labels (the "Emotion" column of the dataframe) I just applied the LabelEncoder

In [None]:
def using_to_categorical(doc):
    label_encoder = LabelEncoder()
    data = label_encoder.fit_transform(doc)
    data = np.array(data)
    encoded = to_categorical(data)
    return encoded

In [None]:
y_train=using_to_categorical(train_set['Emotion'])
y_val=using_to_categorical(val_set['Emotion'])
y_test=using_to_categorical(test_set['Emotion'])

# Word Embedding

For my model I decided to use a pre-trained embedded layer

I used an unsupervised learning algorithm for obtaining vector representations for words called GloVe

This dataset contains English word vectors pre-trained and there are from 25 up to 200 dimensional pre trained word vectors. I decided to use the 200 dimentional word vector




First I have to use a tokenizer to the sentences in order to separate all the words and assign them a number

Then I use pad_sequences to make all sentences the same length

In [None]:
max_length=200 #I choose 200 because as we will see later I will use a dataset containing 200-dimention word vectors

train_sentences=np.asarray(train_set['Text'])
val_sentences=np.asarray(val_set['Text'])
test_sentences=np.asarray(test_set['Text'])

tokenizer=Tokenizer()
tokenizer.fit_on_texts(train_sentences)
vocab_size=len(tokenizer.word_index)+1

train_encoded_document=tokenizer.texts_to_sequences(train_sentences)
x_train=pad_sequences(train_encoded_document,maxlen=max_length,padding='pre')

val_encoded_document=tokenizer.texts_to_sequences(val_sentences)
x_val=pad_sequences(val_encoded_document,maxlen=max_length,padding='pre')


test_encoded_document=tokenizer.texts_to_sequences(test_sentences)
x_test=pad_sequences(test_encoded_document,maxlen=max_length,padding='pre')

Download of the dataset for the word embedding 

In [None]:
!wget https://nlp.stanford.edu/data/glove.twitter.27B.zip
l!unzip -q glove.twitter.27B.zip

--2022-06-30 12:21:32--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2022-06-30 12:21:33--  https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [application/zip]
Saving to: ‘glove.twitter.27B.zip’


2022-06-30 12:26:20 (5.05 MB/s) - ‘glove.twitter.27B.zip’ saved [1520408563/1520408563]



Load all the word vectors in it

In [None]:
embedding_index=dict()
f=open('glove.twitter.27B.200d.txt')
for line in f:
  values=line.split()
  word=values[0]
  coefs=np.asarray(values[1:],dtype='float32')
  embedding_index[word]=coefs
f.close()
print('Leaded word vectors = ', len(embedding_index))

Leaded word vectors =  1193514


Create the embedding_matrix that contains the pre-trained weigths for the embedded layer that I will use in my model

In [None]:
embedding_matrix=np.zeros((vocab_size,max_length))
for word,i in tokenizer.word_index.items():
  embedding_vector=embedding_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i]=embedding_vector

len(embedding_matrix),embedding_matrix.size

(27161, 5432200)

# Model

First I import the model and all the layers I'm going to use

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import MaxPool1D
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import GlobalMaxPool1D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from keras_self_attention import SeqSelfAttention

## Implementation of the Model

For the implementation of the neural nerwork I used 15 layers with a total number of about 6 milion of parameters

- First I added the **Input layer**

- After the input layer I inserted the pre-trained **Embedded layer**; in input of this layer I specified the size of the vocabolary, the length of the sentences and the embedding_matrix (containing the pre-trained weigths)

- Then I decided to insert two **Bidirectional layers**, as instance I used the LSTM layer(I also tried with GRU layer but LSTM worked better). I inserted two of these layer because I saw that it worked very well on my model

- I also added an **Attention Layer**, more precisely the SeqSelfAttention layer (I also tried the keras Attention layer but I had many problem using it and place it correctly in the model). After some researches I have read a paper ("Attention is all you need"[1]) emphasised the importance of the attention layer in a NLP network so I implemented it

- After the attention layer I added:
 - A **Convolutional Layer** (it creates a convolution kernel that is convolved with the layer input over a single spatial dimension to produce a tensor of outputs). It has been usefull to connect efficiently the attention layer with the Max Pooling Layer
 - A **Max Pooling Layer 1D** (in order to apply the Max pooling operation)
 - A **DropOut Layer** (helps to prevent overfitting)
 - A **Concatenate Layer** (I concatenated the Bidirectional and the Attention Layer in order to consider the whole context and calculate the relevance)
 - A **Global Max Pooling Layer**
- The configuration of the last 4 layers was inspired by a NLP network implemented by Polignano, M., Basile, P., de Gemmis, M., & Semeraro, G. in 2019 in their paper [2]. Expecially the Concatenate layer proved to be extremely high-performance in the model
- Finally I added some **Dense** layers that increased the performance. The number of dense layer and neuron for each layer has been choosen after several trials. With fewer parameters the model performed worse, while with higher parameters the model overfitted very soon. This is the combination that gave me the best results.


I did several attempt with many different layers and layer configuration. After all I reached the best score (I took as a reference point the f1_score) using the following configuration

In [None]:
input_layer = Input((max_length))

l_1 = Embedding( vocab_size , max_length , weights = [embedding_matrix] , input_length = max_length , trainable=False)(input_layer)
l_2 = Bidirectional(LSTM(32,return_sequences=True,dropout=0.3, activation='tanh'))(l_1)
l_3 = Bidirectional(LSTM(32,return_sequences=True,dropout=0.3, activation='tanh'))(l_2)
l_4 = SeqSelfAttention(attention_type=SeqSelfAttention.ATTENTION_TYPE_MUL,
                        kernel_regularizer=keras.regularizers.l2(1e-4),
                        bias_regularizer=keras.regularizers.l1(1e-4),
                        attention_regularizer_weight=1e-4,
                        attention_width=15,
                        name='Attention')(l_3)

l_5 = Conv1D(512, 3, activation='relu')(l_4)
l_6 = MaxPool1D()(l_5)
l_7 = Dropout(0.2)(l_6)
l_8 = Concatenate(axis=1)([l_3,l_4])
l_9 = GlobalMaxPool1D()(l_8)
l_10 = Dense(1024,activation='relu')(l_9)
l_11 = Dropout(0.2)(l_10)
l_12 = Dense(512,activation='relu')(l_11)
l_13 = Dropout(0.2)(l_12)
l_14 = Dense(128,activation='relu')(l_13)
l_15 = Dense(7,activation='softmax')(l_14)

model = Model(inputs = input_layer, outputs = l_15)

In [None]:
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_13 (InputLayer)          [(None, 200)]        0           []                               
                                                                                                  
 embedding_25 (Embedding)       (None, 200, 200)     5432200     ['input_13[0][0]']               
                                                                                                  
 bidirectional_51 (Bidirectiona  (None, 200, 64)     59648       ['embedding_25[0][0]']           
 l)                                                                                               
                                                                                                  
 bidirectional_52 (Bidirectiona  (None, 200, 64)     24832       ['bidirectional_51[0][0]'] 

## Model Compilation

For the compilation I tried to use the fine tuning technique on the pre-trained embedded layer, so I compiled and trained the model for 2 epochs (with the pre-trained embedded layer setted trainable), then I saved the weigths.Finally I setted the layer non-trainable, I loaded the new weights and I compiled and trained again. Unfortunately this thecnique worsened the performance of my model so I decided to remove it and I opted for a basic compilation

I used the Categorical Crossentropy as loss, because It is a classification task and as metrics the F1Score

In [None]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=tfa.metrics.F1Score(num_classes = 7, average='macro')
)

I used callbacks in order to stop the training when the model goes in overfitting

In [None]:
my_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=4),
    tf.keras.callbacks.ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5'),
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
]

In [None]:
model.fit(x_train,y_train,epochs=20,batch_size=32,validation_data=(x_val,y_val),callbacks=my_callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


<keras.callbacks.History at 0x7f7883dd5290>

## Evaluation of the Model

For the evaluation, as I sai before, I used as reference point the F1 Score

In [None]:
from sklearn import metrics

y_pred=np.argmax(model.predict(x_test),axis=1)
y_true=np.argmax(y_test,axis=1)

print(metrics.classification_report(y_pred,y_true))

              precision    recall  f1-score   support

           0       0.42      0.48      0.44       501
           1       0.42      0.61      0.50        80
           2       0.56      0.83      0.67        54
           3       0.78      0.77      0.78      2000
           4       0.66      0.59      0.63      1838
           5       0.51      0.62      0.56       293
           6       0.57      0.59      0.58       661

    accuracy                           0.65      5427
   macro avg       0.56      0.64      0.59      5427
weighted avg       0.66      0.65      0.65      5427



In [None]:
from sklearn.metrics import f1_score

print(f1_score(y_true, y_pred, average='macro'))

0.5929419705723287


## Final result
### With this configuration I reached a F1 Score equals to 0,5929

# References

[1]  Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, 30.

[2]  Polignano, M., Basile, P., de Gemmis, M., & Semeraro, G. (2019, June). A comparison of word-embeddings in emotion detection from text using bilstm, cnn and self-attention. In Adjunct Publication of the 27th Conference on User Modeling, Adaptation and Personalization (pp. 63-68).