### Data Downloading

Kaggle setup - note that this section only applies when running on Google Colab.
Skip it if you wish to use data from your local machine.

In [None]:
!pip install -q kaggle

In [None]:
from google.colab import files

In [None]:
# Open Colab's file-upload dialog; the Kaggle API token (kaggle.json) is expected
# here, since the following cells copy `kaggle.json` into ~/.kaggle.
# NOTE(review): `kaggle_tok` itself is not read again in the visible notebook.
kaggle_tok = files.upload()

In [None]:
# Create the Kaggle config directory; `-p` makes this a no-op if it already exists.
!mkdir -p ~/.kaggle

In [None]:
# Copy the token file from the working directory into the Kaggle config directory.
!cp kaggle.json ~/.kaggle/

In [None]:
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# List datasets through the Kaggle CLI - presumably a quick check that the token works.
! kaggle datasets list

In [None]:
# Download the Twitter emotion dataset archive (unzipped from the working directory below).
!kaggle datasets download -d kosweet/cleaned-emotion-extraction-dataset-from-twitter

In [None]:
!unzip cleaned-emotion-extraction-dataset-from-twitter.zip -d dataset

### Beginning of Notebook once downloading above complete

In [None]:
import numpy as np
import pandas as pd
import os
import time

import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Flatten, Dense, Input, Dropout, MaxPooling1D, Concatenate
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras import backend as K
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from datetime import datetime

import gensim

In [None]:
# Choose the data source: 1 = files on the local machine, 2 = files downloaded from
# Kaggle in the Colab cells above. Sets `path` (dataset CSV) and `glove_path`
# (directory prefix of the GloVe vectors; '' means the working directory).
user_decision = int(input('Enter 1 to read from local machine or 2 from Kaggle'))
if user_decision == 1:
    # NOTE(review): machine-specific absolute paths - edit these for your own machine.
    path = 'C:/Users/cferr/Documents/4th Year/DL_Data/dataset(clean).csv'
    # path to glove download file
    glove_path = 'C:/Users/cferr/Documents/4th Year/DL_Data/'
elif user_decision == 2:
    path = '/content/dataset/dataset(clean).csv'
    glove_path = ''
else:
    # Previously any other number silently selected the Kaggle paths while the GloVe
    # download cell (which checks `user_decision == 2`) was skipped; fail early instead.
    raise ValueError(f'Expected 1 or 2, got {user_decision}')

In [None]:
# Load the cleaned tweet dataset; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(path, encoding = "ISO-8859-1")
# Do not truncate long text columns when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

GloVe must be downloaded in every new Colab session; the download usually takes approximately 2 minutes and is followed by unzipping the archive.

In [None]:
if(user_decision==2):
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove.6B.zip

In [None]:
import nltk
# Fetch the NLTK resources used by the preprocessing below: the stop-word list and
# WordNet (used by WordNetLemmatizer).
# NOTE(review): some NLTK releases also need 'omw-1.4' for the lemmatizer - confirm.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# English stop words from NLTK; read as a module-level name inside `preprocess`.
stopwordsdf = stopwords.words('english')

In [None]:
from nltk.stem import WordNetLemmatizer
def preprocess(textdata):
    """Clean, filter and lemmatize an iterable of raw tweets.

    For each tweet: lower-case it, blank out @mentions, replace every character
    that is not a letter or digit with a space, collapse runs of three or more
    identical characters to two, then drop one-character tokens and stop words
    (module-level ``stopwordsdf``) and lemmatize what remains.

    Parameters
    ----------
    textdata : iterable
        Raw tweets. Entries that are not strings (e.g. NaN from missing CSV
        cells) produce an empty string instead of raising.

    Returns
    -------
    list of str
        One processed string per input entry, in order. Tokens are separated by
        a single space and each string keeps a trailing space after its last
        token (empty string when no token survives).
    """
    lemmatizer = WordNetLemmatizer()
    # Set membership is O(1); the NLTK stop-word list would be scanned per token.
    stop_words = set(stopwordsdf)

    # Compiled once, outside the loop. Raw strings avoid the invalid-escape
    # warning that '\s' triggers in a normal string literal.
    user_re = re.compile(r'@[^\s]+')
    non_alnum_re = re.compile(r"[^a-zA-Z0-9]")
    repeat_re = re.compile(r"(.)\1\1+")

    processedText = []
    for tweet in textdata:
        if not isinstance(tweet, str):
            # Missing value: keep the output aligned with the input rows.
            processedText.append('')
            continue

        tweet = tweet.lower()
        # Replace @USERNAME with ' '.
        tweet = user_re.sub(' ', tweet)
        # Replace everything that is not a letter or digit with ' '.
        tweet = non_alnum_re.sub(' ', tweet)
        # Replace 3 or more consecutive identical characters by 2.
        tweet = repeat_re.sub(r"\1\1", tweet)

        kept = [
            lemmatizer.lemmatize(word)
            for word in tweet.split()
            if len(word) > 1 and word not in stop_words
        ]
        # Same layout as before: every token is followed by one space.
        processedText.append(''.join(word + ' ' for word in kept))

    return processedText

In [None]:
# Run the text cleaning over the raw tweets and time it; the result is stored in a
# new column (note the column name is spelled 'pre_proceessed' throughout the notebook).
t = time.time()
df['pre_proceessed'] = preprocess(df['Original Content'])
print(f'Text Preprocessing complete.')
print(f'Time Taken: {round(time.time()-t)} seconds')

In [None]:
# Drop the raw text columns now that the processed column exists.
# NOTE(review): in-place drop - re-running this cell alone raises KeyError because
# the columns are already gone.
df.drop(['Original Content', 'Content'], axis=1,inplace=True)

In [None]:
# Preview the remaining columns.
df.head()

In [None]:
# 80/20 shuffled train/test split with a fixed seed for reproducibility.
train, test = train_test_split(df, test_size=0.2, random_state=0, shuffle=True)

In [None]:
# Build the vocabulary from the training split only, so no test text leaks into it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train.pre_proceessed)

word_index = tokenizer.word_index
# +1 leaves room for index 0, which pad_sequences uses as the padding value below.
vocab_size = len(word_index) + 1
print("Vocabulary Size :", vocab_size)

In [None]:
# Convert each processed tweet to a list of vocabulary indices.
x_train=tokenizer.texts_to_sequences(train.pre_proceessed)
x_test=tokenizer.texts_to_sequences(test.pre_proceessed)
# Padded length = longest training sequence (test sequences are padded to this too).
pad_size = max(len(x) for x in x_train)

In [None]:
# Display the padded sequence length.
pad_size

In [None]:
# Pad every sequence at the end ('post') to `pad_size` so inputs form rectangular arrays.
t = time.time()
x_train = pad_sequences(x_train,
                        maxlen = pad_size, padding='post')
x_test = pad_sequences(x_test,
                       maxlen =pad_size,padding='post')

print("Training X Shape:",x_train.shape)
print("Testing X Shape:",x_test.shape)
print(f'Time Taken: {round(time.time()-t)} seconds')

In [None]:
# Map the emotion strings to integer class ids.
encoder = LabelEncoder()

# Fit the label -> id mapping on the training labels only...
y_train = encoder.fit_transform(train.Emotion.to_list())
# ...and reuse that same mapping for the test labels. Calling fit_transform again
# would refit the encoder on the test split, which yields different ids whenever
# the test split does not contain exactly the same set of classes.
y_test = encoder.transform(test.Emotion.to_list())

# Column vectors of shape (n_samples, 1).
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
def read_glove(glove_file):
  """Load a GloVe text file into a ``{word: vector}`` dictionary.

  Each useful line is expected to hold a token followed by its
  whitespace-separated vector components.

  Parameters
  ----------
  glove_file : str
      Path to the GloVe ``.txt`` file (read as UTF-8).

  Returns
  -------
  dict
      Maps each word to a 1-D ``np.float64`` array. Blank lines and lines
      without any vector component are skipped instead of raising.
  """
  word_to_vec_map = {}
  with open(glove_file, 'r', encoding="utf8") as f:
    for line in f:
      parts = line.split()
      # Skip blank/malformed lines: previously a blank line raised IndexError
      # and a word-only line stored an empty vector.
      if len(parts) < 2:
        continue
      word_to_vec_map[parts[0]] = np.array(parts[1:], dtype=np.float64)

  return  word_to_vec_map

In [None]:
# Load the 300-dimensional GloVe 6B vectors into memory and time the load.
t = time.time()
word_map = read_glove(glove_path+'glove.6B.300d.txt')
print(f'Time Taken: {round(time.time()-t)} seconds')

In [None]:
# Must match the vector size of the GloVe file loaded above (glove.6B.300d.txt).
embedding_dim=300

In [None]:
# Row i holds the GloVe vector of the word whose tokenizer index is i; words that
# are missing from GloVe (and the unused row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
  vector = word_map.get(word)
  if vector is not None:
    embedding_matrix[i] = vector
print(embedding_matrix.shape)

In [None]:
# Coverage check: the inner count_nonzero gives non-zero entries per row, the outer
# one counts rows with at least one non-zero entry (i.e. words found in GloVe).
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
# Percentage of vocabulary rows that received a pretrained vector (truncated to int).
print(int(100*nonzero_elements / vocab_size),'%')  

We can see from the above that our embedding matrix is a lot more useful than the one in the other notebook, which contains emojis but has an embedding matrix that is 42% empty. It will be interesting to see whether a more useful embedding matrix is enough to overcome the drop in sentiment power that comes with the loss of emojis.

In [None]:
# Embedding layer initialised with the GloVe matrix and frozen (trainable=False),
# so the pretrained vectors are not updated during training.
# NOTE(review): `weights=` / `input_length=` are TF 2.x-era arguments - confirm
# against the installed Keras version.
embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                          embedding_dim,
                                          weights=[embedding_matrix],
                                          input_length=pad_size,
                                          trainable=False)

In [None]:
# Shared model input: one padded sequence of `pad_size` token ids per sample.
# `shape` is given as a tuple (a bare int is not a valid shape in every Keras version).
inp = Input(shape=(pad_size,), dtype='int32')
# Embedded input reused by both the CNN and the LSTM-CNN models below.
embedded_sequences = embedding_layer(inp)

In [None]:
# Checkpoint filename templates; Keras fills in {epoch:02d} when saving.
# Paths are relative to the working directory (like the `model.save(...)` calls
# below) instead of pointing at the filesystem root, which is often not writable.
checkpoint_filepath_cnn = 'cnnpp-weights-improvement-{epoch:02d}.hdf5'
checkpoint_filepath_lstmcnn = 'lstmcnnpp-weights-improvement-{epoch:02d}.hdf5'

## CNN

In [None]:
# Reset Keras' global state before building the CNN.
# NOTE(review): `embedding_layer` and `inp` were created before this call and are
# still used afterwards - confirm clearing the session here is intended.
K.clear_session()

In [None]:
# Number of training epochs for the CNN.
EPOCHS = 6

# Save weights only, and only when validation accuracy improves on the best so far.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath_cnn,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [None]:
# Multi-branch CNN: one Conv1D + max-pool branch per kernel width, all reading the
# same frozen GloVe embeddings.
conv_list = []
filter_sizes = [3,8]

for filt in filter_sizes:
    # padding='same' keeps the sequence length identical across branches so they
    # can be concatenated along the channel axis below.
    x = Conv1D(128, filt, activation='relu',padding='same')(embedded_sequences)
    x = MaxPooling1D(pool_size=2)(x)
    conv_list.append(x)

x = Concatenate(axis=-1)(conv_list)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
# The task is single-label, 3-class classification trained with
# sparse_categorical_crossentropy, so the output must be a probability
# distribution over the classes: softmax, not independent sigmoids.
output = Dense(3, activation='softmax')(x)

model=Model(inp,output)
model.summary()

The increased vocab size means a bigger input is fed to the network (approx. 3x bigger), and so there are approx. 3x more trainable params.

In [None]:
# Integer class ids as targets -> sparse categorical cross-entropy; track accuracy.
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train the CNN, holding out 10% of the training data for validation; the
# checkpoint callback keeps the best weights by validation accuracy.
history_cnn = model.fit(x_train, y_train, batch_size=128, epochs=EPOCHS,
                    validation_split=0.1, callbacks=[model_checkpoint_callback])

In [None]:
# Save the full model to a timestamped .h5 file (dd_mm_YYYY__HH_MM_CNN_PRE.h5) in
# the working directory.
model.save(datetime.now().strftime("%d_%m_%Y__%H_%M")+"_CNN_PRE"+".h5")

In [None]:
def plotting(history, name):
    """Draw training/validation accuracy and loss curves for one training run.

    Two figures are produced: accuracy on the current figure, loss on a new one.

    Parameters
    ----------
    history : object
        Must expose a ``history`` dict with the keys 'accuracy',
        'val_accuracy', 'loss' and 'val_loss' (as returned by ``model.fit``).
    name : str
        Model name appended to both figure titles.
    """
    logs = history.history
    train_acc, valid_acc = logs['accuracy'], logs['val_accuracy']
    train_loss, valid_loss = logs['loss'], logs['val_loss']
    epoch_axis = range(len(train_acc))

    panels = (
        (train_acc, valid_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy for '),
        (train_loss, valid_loss, 'Training loss', 'Validation loss',
         'Training and validation loss for '),
    )

    for position, (train_curve, valid_curve, train_label, valid_label, title) in enumerate(panels):
        # Every panel after the first gets its own figure.
        if position > 0:
            plt.figure()
        plt.plot(epoch_axis, train_curve, 'b', label=train_label)
        plt.plot(epoch_axis, valid_curve, 'r', label=valid_label)
        plt.title(title + name)
        plt.legend()

    plt.show()

In [None]:
# Learning curves for the CNN run.
plotting(history_cnn, 'CNN Pre')

In [None]:
# Evaluate the CNN on the held-out test split; with the compile settings above the
# result is [loss, accuracy].
score = model.evaluate(x_test, y_test, verbose = 1) 

print('Test loss:', score[0]) 
print('Test accuracy:', score[1])

## LSTM-CNN

In [None]:
# Reset Keras' global state before building the LSTM-CNN.
# NOTE(review): `inp` / `embedded_sequences` created earlier are reused below -
# confirm clearing the session here is intended.
K.clear_session()

In [None]:
# Number of training epochs for the LSTM-CNN (overrides the CNN value set earlier).
EPOCHS = 3

# Save weights only, and only when validation accuracy improves on the best so far.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath_lstmcnn,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [None]:
# LSTM over the frozen GloVe embeddings; return_sequences=True keeps one output per
# time step so the convolutions below can slide over the sequence.
lstm = LSTM(32,dropout=0.2, return_sequences=True, recurrent_dropout=0.2)(embedded_sequences)

conv_list =[]
filter_sizes=[3,8]

# One Conv1D + max-pool branch per kernel width on top of the LSTM outputs.
for filt in filter_sizes:
    conv = Conv1D(128, filt, activation='relu')(lstm)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv_list.append(conv)

# Branches are joined along axis 1 (the sequence axis); every branch has the same
# number of filters (128), so only their lengths may differ.
lstm_conv = Concatenate(axis=1)(conv_list) 
lstm_conv = Flatten()(lstm_conv)
lstm_conv = Dense(128, activation='relu')(lstm_conv)
lstm_conv = Dropout(0.5)(lstm_conv)
# Single-label, 3-class problem trained with sparse_categorical_crossentropy:
# the output must be a probability distribution, i.e. softmax rather than sigmoid.
output = Dense(3, activation='softmax')(lstm_conv)
model_soa=Model(inp,output)
model_soa.summary()

In [None]:
# Same optimizer, loss and metric as the CNN so the two models are comparable.
model_soa.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train the LSTM-CNN, holding out 10% of the training data for validation.
history = model_soa.fit(x_train, y_train, batch_size=128, epochs=EPOCHS,
                    validation_split=0.1, callbacks=[model_checkpoint_callback])

In [None]:
# Save the full model to a timestamped .h5 file (dd_mm_YYYY__HH_MM_LSTM_CNN_PRE.h5).
model_soa.save(datetime.now().strftime("%d_%m_%Y__%H_%M")+"_LSTM_CNN_PRE"+".h5")

In [None]:
# Learning curves for the LSTM-CNN run.
plotting(history, 'LSTM-CNN Pre')

In [None]:
# Evaluate the LSTM-CNN on the held-out test split; the result is [loss, accuracy].
score = model_soa.evaluate(x_test, y_test, verbose = 1) 

print('Test loss:', score[0]) 
print('Test accuracy:', score[1])

### Evaluate Models

In [None]:
# Load two previously saved models from the working directory.
# NOTE(review): these filenames are hard-coded timestamps from earlier runs; the
# save cells above write files named after the *current* time, so a fresh
# Restart & Run All will not produce these exact files - update the names to match.
model_cnn_pre =load_model('24_04_2021__20_47_CNN_PRE.h5')
model_lstmcnn_pre=load_model('25_04_2021__21_01_LSTM_CNN_PRE.h5')

In [None]:
# Evaluate both loaded models on the same test split.
print('CNN With Custom Pre-Process:')
score_cnn_pre = model_cnn_pre.evaluate(x_test, y_test, verbose = 1)
print('LSTM-CNN With Custom Pre-Process:')
score_lstmcnn_pre = model_lstmcnn_pre.evaluate(x_test, y_test, verbose = 1)

#### Creating dataframe for barchart in other notebook

In [None]:
# Start an empty results table.
# NOTE(review): this rebinds `df`, which held the tweet dataset earlier in the notebook.
df =pd.DataFrame()

In [None]:
# One column per model, one row per value returned by `evaluate`.
df['CNN_Pre'] = score_cnn_pre
df['LSTM_CNN_PRE'] = score_lstmcnn_pre

In [None]:
# Label the rows; assumes the loaded models return [loss, accuracy] from
# `evaluate`, as the models compiled in this notebook do - TODO confirm for the saved files.
df.rename(index = {0:'Loss', 1:'Accuracy'}, inplace=True)

In [None]:
# Persist the scores to CSV in the working directory.
df.to_csv('Test_Results_pre.csv')

## Interesting Results with model

In [None]:
# Load the saved LSTM-CNN for the demo predictions below.
# NOTE(review): this rebinds `model`, which referred to the CNN built earlier, and
# relies on a hard-coded filename from an earlier run.
model =load_model('25_04_2021__21_01_LSTM_CNN_PRE.h5',compile=True)

In [None]:
# Class id -> label used when printing predictions.
# NOTE(review): hard-coded; presumably mirrors the LabelEncoder ordering from
# training - verify against `encoder.classes_`.
labels_dict = {0:'angry',1:'dissapointed',2:'happy'}

In [None]:
# Two sentences that differ only by the appended name, to compare the predictions.
sample_string = ['Haha life is so fun and enjoyable!', 'Haha life is so fun and enjoyable Donald Trump!']

In [None]:
def prediction_out(sample_string, pad_size):
    """Print the predicted emotion label for each input text.

    Uses the module-level ``tokenizer``, ``model`` and ``labels_dict``. The texts
    are tokenized as given (no call to ``preprocess`` is made here).

    Parameters
    ----------
    sample_string : str or iterable of str
        Text(s) to classify. A single string is treated as one text rather than
        being iterated character by character.
    pad_size : int
        Length every token sequence is padded ('post') to before prediction.

    Returns
    -------
    None
        One label is printed per input text; nothing is printed for empty input.
    """
    # A bare string would otherwise be tokenized one character at a time.
    texts = [sample_string] if isinstance(sample_string, str) else list(sample_string)
    if not texts:
        # Nothing to classify; avoid calling the model on an empty batch.
        return

    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences,
                           maxlen = pad_size, padding='post')
    predictions = model.predict(padded)
    # Highest-scoring class per row.
    classes = np.argmax(predictions, axis = 1)
    for elem in classes:
        print(labels_dict[elem])

In [None]:
# Print the predicted label for each sample sentence.
prediction_out(sample_string, pad_size)