### Data Downloading

Kaggle setup - note this is only applicable for Google Colab.
Skip this section if you wish to use data from your local machine.

In [None]:
!pip install -q kaggle

In [None]:
from google.colab import files

In [None]:
kaggle_tok = files.upload()

In [None]:
!mkdir -p ~/.kaggle

In [None]:
!cp kaggle.json ~/.kaggle/

In [None]:
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle datasets list

In [None]:
!kaggle datasets download -d kosweet/cleaned-emotion-extraction-dataset-from-twitter

In [None]:
!unzip cleaned-emotion-extraction-dataset-from-twitter.zip -d dataset

### Beginning of Notebook once downloading above complete

In [None]:
import numpy as np
import pandas as pd
import os
import time

import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Flatten, Dense, Input, Dropout, MaxPooling1D, Concatenate
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras import backend as K
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from datetime import datetime
import gensim

In [None]:
# Choose where the data is read from: 1 selects the local Windows folders,
# any other integer selects the Colab paths.
user_decision = int(input('Enter 1 to read from local machine or 2 from Kaggle'))
path, glove_path = (
    # (tweet CSV, folder that holds the GloVe download)
    ('C:/Users/cferr/Documents/4th Year/DL_Data/dataset(clean).csv',
     'C:/Users/cferr/Documents/4th Year/DL_Data/')
    if user_decision == 1
    else ('/content/dataset/dataset(clean).csv', '')
)

In [None]:
df = pd.read_csv(path, encoding = "ISO-8859-1")
pd.set_option('display.max_colwidth', None)

In [None]:
if(user_decision==2):
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove.6B.zip

## <font color=purple>Exploratory Data Analysis / Pre-Processing</font>

In [None]:
df.head()

In [None]:
df.tail()

One of the first things I'm noticing is that the 'Content' column already does some of the work of cleaning up the original tweet.<br>It removes accounts tagged with '@', it appears to remove all occurrences of commas and apostrophes, it removes attached links, and finally it converts emojis into single words, e.g. 'facewithtearsofjoy'.

In [None]:
# Null count per column, plus the same figure as a percentage of all rows,
# both ordered from most to least missing.
missing_data = df.isna().sum().sort_values(ascending=False)
percentage_missing = round(
    (df.isnull().sum() / df.isnull().count()).sort_values(ascending=False) * 100,
    2,
)
missing_info = pd.concat(
    {'Missing values': missing_data, 'Percentage': percentage_missing},
    axis=1,
)
missing_info

The above is a good sign: it indicates that we are dealing with a complete dataset and do not have to worry about handling empty columns or anything of the sort.

In [None]:
# Number of non-null entries in every column, per emotion class.
counts = df.groupby('Emotion').count()
counts

Let's visualise this a little better.

In [None]:
# Bar chart of the number of tweets in each emotion class.
# An explicit axes is used so the figure size is set once and the plot does not
# depend on whichever pyplot figure happens to be current.
fig, ax = plt.subplots(figsize=(10, 8))
counts['Content'].plot(kind='bar', ax=ax)
ax.set_title("Bar chart of different classes of tweets", fontsize=16)
ax.set_xlabel("Emotion")
ax.set_ylabel("Number of tweets")
ax.legend()
plt.show()

In [None]:
# Pie chart of the share of tweets in each emotion class.
# An explicit axes replaces the separately created figure whose size (8, 8)
# conflicted with the (10, 8) requested in the plot call.
fig, ax = plt.subplots(figsize=(10, 8))
counts['Content'].plot(kind='pie', ax=ax, autopct='%1.1f%%')
ax.set_title("Pie chart of different classes of tweets", fontsize=16)
ax.set_ylabel("")  # hide the column name pandas would otherwise print beside the pie
ax.legend()
plt.show()


So we can see clearly that there is a very even split between the different emotion classes

In [None]:
# Drop the raw tweet text; only the cleaned 'Content' column is used from here on.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['Original Content'], errors='ignore')

## Next Steps

In [None]:
df

In [None]:
lengths = df.Content.str.split().apply(len)

#### Split into testing and training sets

As per the assignment specifications the data should be shuffled with random seed=0, take last 20% of data for testing

In [None]:
train, test = train_test_split(df, test_size=0.2, random_state=0, shuffle=True)

In [None]:
# Sanity-check the size of each split.
print(f'Train dataset shape: {train.shape}')
print(f'Test dataset shape: {test.shape}')

In [None]:
t = time.time()
documents = [_text.split() for _text in df.Content] 
print(f'Time Taken: {round(time.time()-t)} seconds')

#### Tokenisation

In [None]:
tokenizer = Tokenizer()

In [None]:
tokenizer.fit_on_texts(train.Content)

word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
print("Vocabulary Size :", vocab_size)

In [None]:
# Turn each tweet into a list of vocabulary indices.
x_train = tokenizer.texts_to_sequences(train.Content)
x_test = tokenizer.texts_to_sequences(test.Content)
# The longest training sequence sets the common padded length.
pad_size = max(map(len, x_train))

In [None]:
pad_size

A neural network needs to receive inputs of a fixed length. Given that tweets can range from 1 to 280 characters, there is quite a bit of variation in the length of our tweets. We must therefore apply padding: the process of lengthening each tweet's vector representation by appending zeros. These zeros do not compromise the data itself but allow for a consistent input into the network.<br> Keras provides a useful tool for this called pad_sequences.

In [None]:
t = time.time()
# Right-pad (padding='post') both splits to the common length pad_size.
x_train, x_test = (
    pad_sequences(sequences, maxlen=pad_size, padding='post')
    for sequences in (x_train, x_test)
)

print("Training X Shape:", x_train.shape)
print("Testing X Shape:", x_test.shape)
print(f'Time Taken: {round(time.time()-t)} seconds')

In [None]:
# Emotion class names, taken from the index of the per-class count table.
labels = counts.index.unique().tolist()

In [None]:
labels

In [None]:
encoder = LabelEncoder()

# Learn the class -> integer mapping from the training labels only, then reuse that
# exact mapping for the test labels. Re-fitting on the test split could assign
# different integers if its set of classes differed from the training split.
y_train = encoder.fit_transform(train.Emotion.to_list())
y_test = encoder.transform(test.Emotion.to_list())

# Column vectors: one integer class id per sample.
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
np.unique(y_train)

### Using GloVe - Transfer Learning

In [None]:
def read_glove(glove_file):
  """Load a GloVe text file into a ``{word: vector}`` dictionary.

  Each non-blank line is expected to hold a token followed by its
  whitespace-separated vector components.

  Args:
    glove_file: path to the GloVe ``.txt`` file (read as UTF-8).

  Returns:
    dict mapping each word to a 1-D ``np.float64`` array of its components.
  """
  word_to_vec_map = {}
  with open(glove_file, 'r', encoding="utf8") as f:
    for line in f:
      parts = line.strip().split()
      if not parts:
        # Tolerate blank lines (e.g. a trailing newline) instead of raising IndexError.
        continue
      word_to_vec_map[parts[0]] = np.array(parts[1:], dtype=np.float64)

  return word_to_vec_map

In [None]:
word_map = read_glove(glove_path+'glove.6B.300d.txt')

### Embedding Layer

In [None]:
embedding_dim=300

In [None]:
# Row i holds the GloVe vector of the word with tokenizer index i;
# words GloVe does not know keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, row in tokenizer.word_index.items():
  vector = word_map.get(word)
  if vector is not None:
    embedding_matrix[row] = vector
print(embedding_matrix.shape)

Because we are dealing with public Twitter data there is a high likelihood of misspellings; therefore we can see from the below that 42% of our embedding matrix is going to remain filled with zeros (the cell below prints the share of rows that did receive a GloVe vector).

In [None]:
# Number of rows that received a GloVe vector (at least one non-zero entry),
# printed as a whole-number percentage of the vocabulary size.
nonzero_elements = np.count_nonzero(embedding_matrix.any(axis=1))
print(int(100 * nonzero_elements / vocab_size), '%')

In [None]:
embedding_matrix.shape[1]

In [None]:
vocab_size

In [None]:
x_train.shape

In [None]:
# Embedding layer initialised with the GloVe matrix and frozen (trainable=False),
# so the pre-trained vectors are not updated during training.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=pad_size,
    trainable=False,
)

In [None]:
# One padded sequence of pad_size token ids per sample.
# `shape` is given as a tuple: newer Keras releases reject a bare integer here.
inp = Input(shape=(pad_size,), dtype='int32')

In [None]:
embedded_sequences = embedding_layer(inp)

# Models

#### Shared Variables

In [None]:
BATCH_SIZE =128
EPOCHS =3

### CNN

A reproduction of Yoon Kim's CNN for sentence classification

In [None]:
K.clear_session()

In [None]:
# Parallel convolution branches with different kernel widths over the frozen
# embeddings; each branch is max-pooled, then all branches are joined and fed
# to a small dense classifier.
conv_list = []
filter_sizes = [3,8]

for filt in filter_sizes:
    # padding='same' keeps every branch the same length so they can be
    # concatenated along the channel axis below.
    x = Conv1D(128, filt, activation='relu',padding='same')(embedded_sequences)
    x = MaxPooling1D(pool_size=2)(x)
    conv_list.append(x)

x = Concatenate(axis=-1)(conv_list)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
# softmax (not sigmoid): the three classes are mutually exclusive and the model is
# trained with sparse_categorical_crossentropy, which expects a probability
# distribution over the classes.
output = Dense(3, activation='softmax')(x)

model=Model(inp,output)
model.summary()

In [None]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# The callback instance is bound to the name `ReduceLROnPlateau` because the fit cells
# pass `callbacks=[ReduceLROnPlateau]`. It is constructed through tf.keras.callbacks so
# that re-running this cell still works after the name has been rebound to the instance.
# min_lr is the floor for the reduced rate and must be below Adam's starting rate
# (0.001); the previous floor of 0.01 meant the rate could never be lowered.
ReduceLROnPlateau = tf.keras.callbacks.ReduceLROnPlateau(factor=0.1,
                                                         min_lr=1e-5,
                                                         monitor='val_loss',
                                                         verbose=1)

In [None]:
# Train with the shared BATCH_SIZE / EPOCHS settings so both models are trained under
# the same, centrally configured budget; 10% of the training data is held out for validation.
history_cnn = model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
                    validation_split=0.1, callbacks=[ReduceLROnPlateau])

In [None]:
model.save(datetime.now().strftime("%d_%m_%Y__%H_%M")+"_CNN"+".h5")

In [None]:
def plotting(history, name):
    """Plot training vs. validation accuracy and loss for one training run.

    Two figures are drawn: one for accuracy and one for loss, each with the
    training curve in blue and the validation curve in red.

    Args:
        history: object whose ``history`` attribute is a dict holding the
            'accuracy', 'val_accuracy', 'loss' and 'val_loss' series
            (e.g. the value returned by ``model.fit``).
        name: label appended to both figure titles.
    """
    metrics = history.history
    epochs = range(len(metrics['accuracy']))

    # (history key, short name used in the legend, long name used in the title)
    panels = (('accuracy', 'acc', 'accuracy'), ('loss', 'loss', 'loss'))
    for key, short_name, long_name in panels:
        # A fresh figure per metric, so nothing is drawn onto a figure that
        # happens to be open already.
        fig, ax = plt.subplots()
        ax.plot(epochs, metrics[key], 'b', label='Training ' + short_name)
        ax.plot(epochs, metrics['val_' + key], 'r', label='Validation ' + short_name)
        ax.set_title('Training and validation ' + long_name + ' for ' + name)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(long_name.capitalize())
        ax.legend()

    plt.show()

In [None]:
plotting(history_cnn, 'CNN Orig')

In [None]:
# Use the archived checkpoint when it is available; otherwise fall back to the CNN
# trained above. The model saved by this notebook gets a fresh timestamped name on
# every run, so this hard-coded file only exists on the machine that produced it.
model_cnn = (load_model('18_04_2021__14_29_CNN.h5')
             if os.path.exists('18_04_2021__14_29_CNN.h5') else model)

In [None]:
# evaluate() returns [loss, accuracy] for the metrics the model was compiled with.
score = model_cnn.evaluate(x_test, y_test, verbose=1)

print(f'Test loss: {score[0]}')
print(f'Test accuracy: {score[1]}')

# LSTM - CNN model

In [None]:
K.clear_session()

In [None]:
# Fresh input tensor for the second model, run through the shared frozen embedding layer.
# `shape` is given as a tuple: newer Keras releases reject a bare integer here.
inp = Input(shape=(pad_size,), dtype='int32')
embedded_sequences = embedding_layer(inp)

In [None]:
# The LSTM returns its full output sequence so the convolutions can slide over it.
lstm = LSTM(32,dropout=0.2, return_sequences=True, recurrent_dropout=0.2)(embedded_sequences)

conv_list =[]
filter_sizes=[3,8]

for filt in filter_sizes:
    # No padding argument is given here, so the branches produce sequences of
    # different lengths; they are therefore joined along axis 1 below.
    conv = Conv1D(128, filt, activation='relu')(lstm)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv_list.append(conv)

lstm_conv = Concatenate(axis=1)(conv_list)
lstm_conv = Flatten()(lstm_conv)
lstm_conv = Dense(128, activation='relu')(lstm_conv)
lstm_conv = Dropout(0.5)(lstm_conv)
# softmax (not sigmoid): the three classes are mutually exclusive and the model is
# trained with sparse_categorical_crossentropy, which expects a probability
# distribution over the classes.
output = Dense(3, activation='softmax')(lstm_conv)
model_soa=Model(inp,output)
model_soa.summary()

In [None]:
model_soa.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train with the shared BATCH_SIZE / EPOCHS settings so both models are trained under
# the same, centrally configured budget; 10% of the training data is held out for validation.
history_lstmcnn = model_soa.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
                    validation_split=0.1, callbacks=[ReduceLROnPlateau])

In [None]:
plotting(history_lstmcnn, 'LSTM-CNN Orig')

In [None]:
model_soa.save(datetime.now().strftime("%d_%m_%Y__%H_%M")+"_LSTM_CNN"+".h5")

In [None]:
# evaluate() returns [loss, accuracy] for the metrics the model was compiled with.
score = model_soa.evaluate(x_test, y_test, verbose=1)

print(f'Test loss: {score[0]}')
print(f'Test accuracy: {score[1]}')

## Evaluation

In [None]:
# Use the archived checkpoints when they are available; otherwise fall back to the
# models trained earlier in this notebook. The files saved by this notebook get fresh
# timestamped names on every run, so these hard-coded files only exist on the machine
# that produced them.
model_cnn = (load_model('18_04_2021__14_29_CNN.h5')
             if os.path.exists('18_04_2021__14_29_CNN.h5') else model)
model_lstm_cnn = (load_model('22_04_2021__16_01_LSTM_CNN.h5')
                  if os.path.exists('22_04_2021__16_01_LSTM_CNN.h5') else model_soa)

In [None]:
print('CNN With Original Pre-Process:')
score_cnn = model_cnn.evaluate(x_test, y_test, verbose = 1)
print('LSTM-CNN With Original Pre-Process:')
score_lstm_cnn = model_lstm_cnn.evaluate(x_test, y_test, verbose = 1) 

## Evaluation Comparison Across 4 models

In [None]:
df=pd.DataFrame()

In [None]:
df['CNN_ORIG'] = score_cnn

In [None]:
df['LSTM_CNN_ORIG'] = score_lstm_cnn

In [None]:
df.rename(index = {0:'Loss', 1:'Accuracy'}, inplace=True)

In [None]:
df

In [None]:
df.to_csv('Test_Results_orig.csv')

In [None]:
df_orig = pd.read_csv('Test_Results_orig.csv')
df_pre = pd.read_csv('Test_Results_pre.csv')

In [None]:
# Remove the index column written by to_csv; errors='ignore' keeps the cell re-runnable.
df_orig = df_orig.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Remove the index column written by to_csv; errors='ignore' keeps the cell re-runnable.
df_pre = df_pre.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# One row per (model, dataset) pair; row 1 of each results column is the accuracy.
# dtype=object keeps the accuracy values as numbers: without it NumPy coerces the
# whole mixed str/float array to strings, which cannot be plotted on a numeric axis.
groupedBarAccuracyData = np.array([["CNN", "Preprocessed", df_pre['CNN_Pre'][1]],
                                ["CNN","Original",df_orig['CNN_ORIG'][1]],
                                ["LSTM_CNN", "Original", df_orig['LSTM_CNN_ORIG'][1]],
                                ["LSTM_CNN", "Preprocessed", df_pre['LSTM_CNN_PRE'][1]]
                               ], dtype=object)

In [None]:
# One row per (model, dataset) pair; row 0 of each results column is the loss.
# dtype=object keeps the loss values as numbers: without it NumPy coerces the
# whole mixed str/float array to strings, which cannot be plotted on a numeric axis.
groupedBarLossData = np.array([["CNN", "Preprocessed", df_pre['CNN_Pre'][0]],
                                ["CNN","Original",df_orig['CNN_ORIG'][0]],
                                ["LSTM_CNN", "Original", df_orig['LSTM_CNN_ORIG'][0]],
                                ["LSTM_CNN", "Preprocessed", df_pre['LSTM_CNN_PRE'][0]]
                               ], dtype=object)

In [None]:
# Cast the metric column to float: the source array mixes strings and numbers, so the
# values may arrive as strings, and the bar plot needs a numeric y variable.
groupedBarAccuracyDataFrame = pd.DataFrame(
    groupedBarAccuracyData, columns = ["Model", "Data", "Accuracy"]
).astype({"Accuracy": float})

In [None]:
# Cast the metric column to float: the source array mixes strings and numbers, so the
# values may arrive as strings, and the bar plot needs a numeric y variable.
groupedBarLossDataFrame = pd.DataFrame(
    groupedBarLossData, columns = ["Model", "Data", "Loss"]
).astype({"Loss": float})

In [None]:
sns.set(style="whitegrid")

g = sns.catplot(x="Data", y="Loss", hue="Model", data=groupedBarLossDataFrame,
                height=6, kind="bar", palette="bright")
g.despine(left=True)
g.set_ylabels("Loss\n", fontsize = 16)
g.set_xlabels("Dataset", fontsize = 16)

In [None]:
groupedBarAccuracyDataFrame

In [None]:
sns.set(style="whitegrid")

g = sns.catplot(x="Data", y="Accuracy", hue="Model", data=groupedBarAccuracyDataFrame,
                height=6, kind="bar", palette="bright")
g.despine(left=True)
g.set_ylabels("Accuracy\n", fontsize = 16)
g.set_xlabels("Dataset", fontsize = 16)