In [1]:
# Import libraries and modules
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
import keras
import string
import nltk
from autocorrect import Speller
import re
import datetime, os


In [95]:
# Read the dataset CSV, print how many rows it has, and preview the first rows.
data = pd.read_csv(
    'datasets/dataset_complete_balanced.csv',
    encoding='utf8',
)
print(data.shape[0])
data.head()

19107


Unnamed: 0,emotion,text
0,anger,he looks offended i tell perceived man man anger
1,anger,we school room audiovisual classes we watchin...
2,anger,oh jim i told no pets it make mess house
3,anger,getting terrible meals paying heaps
4,anger,when drinking glass water i found cockroach wa...


In [96]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# Shuffle every row with a fixed seed and keep only the two columns used below.
data = data.sample(frac=1, random_state=42)[['text', 'emotion']]

# Hold on to the original string labels before the column is replaced by
# integer codes; they are needed later to map codes back to names.
labels = data['emotion']
data['emotion'] = LabelEncoder().fit_transform(labels)
data.head()

Unnamed: 0,text,emotion
11632,you late,6
9380,hey mary,4
4518,the house fire i saved dog i afraid fire verg...,2
6388,making sexual pass cousin i close personal re...,3
1443,us banks also lost heavily furious bundesbank ...,0


In [97]:
# Integer codes (after encoding) and the matching original string labels,
# aligned row by row.
keys = data['emotion']
values = labels

# Build the code -> label lookup; repeated codes simply overwrite the same entry.
emotion_label_dict = {code: name for code, name in zip(keys, values)}
emotion_label_dict

{6: 'neutral',
 4: 'happy',
 2: 'fear',
 3: 'guilt',
 0: 'anger',
 1: 'disgust',
 7: 'sad',
 8: 'shame',
 5: 'joy',
 9: 'suprise'}

In [4]:
# Hold out 20% of all rows for testing; the remainder is the training pool.
test_dataframe = data.sample(frac=0.2, random_state=1337)
train_dataframe = data.drop(index=test_dataframe.index)

# Carve a validation set out of the training pool (25% of what is left).
val_dataframe = train_dataframe.sample(frac=0.25, random_state=1337)
train_dataframe = train_dataframe.drop(index=val_dataframe.index)

print(
    "Using %d samples for training and %d for validation and %d for test"
    % tuple(len(part) for part in (train_dataframe, val_dataframe, test_dataframe))
)

Using 11464 samples for training and 3822 for validation and 3821 for test


In [5]:
def dataframe_to_dataset(dataframe, label_column="emotion"):
    """Convert a DataFrame into a shuffled ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        dataframe: Frame holding the feature columns plus the label column.
            It is copied, so the caller's frame is not modified.
        label_column: Name of the column to use as the target. Defaults to
            ``"emotion"``, the label column of this notebook's data.

    Returns:
        A dataset whose elements are ``(dict_of_feature_columns, label)``,
        shuffled with a buffer covering every row.

    Raises:
        KeyError: If ``label_column`` is not a column of ``dataframe``.
    """
    # Work on the frame that was passed in (not the global `data`), otherwise
    # every split would contain the full dataset.
    features = dataframe.copy()
    labels = features.pop(label_column)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # The shuffle buffer must be positive, so guard against an empty frame.
    ds = ds.shuffle(buffer_size=max(len(features), 1))
    return ds


# Turn each split's DataFrame into its own dataset.
train_ds, val_ds, test_ds = map(
    dataframe_to_dataset,
    (train_dataframe, val_dataframe, test_dataframe),
)

In [6]:
# Show one (features, label) element to sanity-check the dataset structure.
for features, target in train_ds.take(1):
    print("Input:", features)
    print("Target:", target)

Input: {'text': <tf.Tensor: shape=(), dtype=string, numpy=b'since war began soviet military commentators including senior serving officers expressing dismay seeing iraq army whose tanks aircraft supplied pounded relentlessly effectively fear '>}
Target: tf.Tensor(2, shape=(), dtype=int32)


In [7]:
# Shuffle each split with a 10000-element buffer, then group it into batches of 1024.
train_ds, val_ds, test_ds = (
    split.shuffle(10000).batch(1024)
    for split in (train_ds, val_ds, test_ds)
)

In [8]:

# Classifier: a TF Hub text-embedding layer (string in, 128-dim vector out)
# followed by a funnel of ReLU dense layers and a 10-way softmax output.
model_new = keras.Sequential(
    [
        hub.KerasLayer(
            "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128-with-normalization/1",
            dtype=tf.string,
            input_shape=[],
            output_shape=[128],
        )
    ]
    # Hidden stack, widest first.
    + [
        keras.layers.Dense(width, activation='relu')
        for width in (256, 128, 128, 64, 64, 32, 32, 16)
    ]
    # One output unit per class.
    + [keras.layers.Dense(10, activation='softmax')]
)

model_new.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 128)               124642688 
_________________________________________________________________
dense (Dense)                (None, 256)               33024     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 32)                2

In [9]:
%load_ext tensorboard
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

tensorboard_cb = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)


In [10]:
from keras import callbacks


# Exponential learning-rate schedule
def scheduler(epoch, lr):
    """Hold the learning rate for the first five epochs, then decay it exponentially.

    Args:
        epoch: Index of the epoch about to start (the hold applies while it is < 5).
        lr: Learning rate currently in effect.

    Returns:
        float: ``lr`` unchanged while ``epoch < 5``; otherwise ``lr * exp(-0.1)``.
    """
    if epoch < 5:
        return float(lr)
    # Return a plain Python float rather than a tensor: a float is accepted by
    # every Keras version as the schedule function's output.
    return float(lr * np.exp(-0.1))

# Early stopping: watch validation accuracy (higher is better), give up after
# 10 epochs without improvement, and restore the best weights seen.
earlystopping = callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# Learning-rate callback driven by the `scheduler` function; verbose=1 logs
# each update.
lr_scheduler = callbacks.LearningRateScheduler(schedule=scheduler, verbose=1)

In [11]:
model_new.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(lr=0.006),
              metrics=['accuracy'])

history = model_new.fit(train_ds, epochs=100, validation_data=val_ds, callbacks = [tensorboard_cb])

%tensorboard --logdir logs

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Reusing TensorBoard on port 6006 (pid 17832), started 21:16:39 ago. (Use '!kill 17832' to kill it.)

In [12]:
# Save the model
#model_new.save('emotion_analysis')

In [13]:
# Evaluate the trained model on the batched test dataset; `results` holds
# whatever `evaluate` returns for the compiled loss and metrics.
results = model_new.evaluate(x=test_ds)




In [14]:
# Preview the first five rows of the held-out test frame.
test_dataframe.head(5)

Unnamed: 0,text,emotion
13039,what makes say,7
7856,sounds great much like something thanks,4
15130,i like horror movies i like scared,7
3269,i saw program t v saw skull operation,1
14892,why take days,7


In [101]:
def get_index_of_max(arr):
    """Return the index of the largest value in ``arr``.

    ``np.argmax`` is called without an axis, so for multi-dimensional input
    the index refers to the flattened array.
    """
    return np.argmax(arr)

def get_prediction(string):
    """Return the predicted class index for a single piece of text.

    Args:
        string: The text to classify.

    Returns:
        Index of the highest-scoring entry in the model's output for this text.
    """
    # The text is wrapped in a one-element list before being passed to predict.
    return get_index_of_max(model_new.predict([string]))

    

In [102]:
# Quick smoke test of the prediction helper on a hand-written sentence.
get_prediction(string='I am so happy.')

4

In [103]:
# Classification report for test data
from sklearn.metrics import classification_report

# Predict each test text individually (one model call per row), then translate
# every predicted code back to its emotion name; unknown codes become None.
test_dataframe['preds'] = test_dataframe['text'].apply(get_prediction)
test_dataframe['label'] = test_dataframe['preds'].apply(emotion_label_dict.get)

# Compare the true integer codes against the predicted ones.
print(
    classification_report(
        y_true=test_dataframe['emotion'],
        y_pred=test_dataframe['preds'],
    )
)


              precision    recall  f1-score   support

           0       0.96      0.91      0.93       494
           1       0.93      0.94      0.93       326
           2       0.98      0.96      0.97       305
           3       0.91      0.91      0.91       211
           4       0.90      0.97      0.93       547
           5       0.99      0.97      0.98       212
           6       0.88      0.89      0.89       466
           7       0.98      0.91      0.94       561
           8       0.94      0.96      0.95       263
           9       0.91      0.97      0.94       436

    accuracy                           0.93      3821
   macro avg       0.94      0.94      0.94      3821
weighted avg       0.94      0.93      0.93      3821



In [104]:
# Preview the first five test rows, now including the `preds` and `label` columns.
test_dataframe.head(5)

Unnamed: 0,text,emotion,preds,label
13039,what makes say,7,6,neutral
7856,sounds great much like something thanks,4,4,happy
15130,i like horror movies i like scared,7,6,neutral
3269,i saw program t v saw skull operation,1,1,disgust
14892,why take days,7,6,neutral
