In [1]:
# Import libraries and modules
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
import keras
import string
import nltk
from autocorrect import Speller
import re
import datetime, os


In [2]:
# Load the balanced emotion dataset, report its row count, and preview it.
csv_path = 'datasets/dataset_complete_balanced.csv'
data = pd.read_csv(csv_path, encoding='utf8')
print(data.shape[0])
data.head()

19107


Unnamed: 0,emotion,text
0,anger,he looks offended i tell perceived man man anger
1,anger,we school room audiovisual classes we watchin...
2,anger,oh jim i told no pets it make mess house
3,anger,getting terrible meals paying heaps
4,anger,when drinking glass water i found cockroach wa...


In [3]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# Keep the fitted encoder around: `label_encoder.classes_` and
# `label_encoder.inverse_transform` are needed to map the integer class ids
# (e.g. model predictions) back to emotion names.
label_encoder = LabelEncoder()

# Shuffle reproducibly, keep only the input and target columns, and replace
# the emotion strings with integer class ids. Building the frame with
# `assign` avoids writing into a column subset, which can raise
# SettingWithCopyWarning.
data = (
    data.sample(frac=1, random_state=42)[['text', 'emotion']]
    .assign(emotion=lambda frame: label_encoder.fit_transform(frame['emotion']))
)
data.head()

Unnamed: 0,text,emotion
11632,you late,6
9380,hey mary,4
4518,the house fire i saved dog i afraid fire verg...,2
6388,making sexual pass cousin i close personal re...,3
1443,us banks also lost heavily furious bundesbank ...,0


In [4]:
# Three-way split: 20% of all rows form the test set, then a quarter of the
# remaining rows (i.e. 20% of the total) forms the validation set.
test_dataframe = data.sample(frac=0.2, random_state=1337)
non_test_dataframe = data.drop(index=test_dataframe.index)

val_dataframe = non_test_dataframe.sample(frac=0.25, random_state=1337)
train_dataframe = non_test_dataframe.drop(index=val_dataframe.index)

split_sizes = (len(train_dataframe), len(val_dataframe), len(test_dataframe))
print("Using %d samples for training and %d for validation and %d for test" % split_sizes)

Using 11464 samples for training and 3822 for validation and 3821 for test


In [5]:
def dataframe_to_dataset(dataframe):
    """Convert a labelled DataFrame into a shuffled ``tf.data.Dataset``.

    Args:
        dataframe: pandas DataFrame with an ``"emotion"`` label column; every
            remaining column is used as a named feature.

    Returns:
        A dataset of ``(features_dict, label)`` pairs, shuffled with a buffer
        that spans every row of ``dataframe``.

    Raises:
        KeyError: if ``dataframe`` has no ``"emotion"`` column.
    """
    # Copy the *argument*, not the notebook-level `data` frame: each split
    # must yield only its own rows, otherwise train/val/test all contain the
    # whole corpus. The copy also keeps the caller's frame intact, since
    # `pop` removes the label column in place.
    frame = dataframe.copy()
    labels = frame.pop("emotion")
    ds = tf.data.Dataset.from_tensor_slices((dict(frame), labels))
    return ds.shuffle(buffer_size=len(frame))


# Convert each split's DataFrame with dataframe_to_dataset, in
# train / validation / test order.
train_ds, val_ds, test_ds = (
    dataframe_to_dataset(split)
    for split in (train_dataframe, val_dataframe, test_dataframe)
)

In [6]:
# Pull a single element from the training dataset to inspect the feature
# dict and its label.
for x, y in train_ds.take(1):
    print("Input:", x)
    print("Target:", y)

Input: {'text': <tf.Tensor: shape=(), dtype=string, numpy=b'you look bit dull today what '>}
Target: tf.Tensor(7, shape=(), dtype=int32)


In [7]:
# Group every split into mini-batches of the same size.
# NOTE: re-running this cell batches the already-batched datasets again;
# rebuild the datasets first if it has to be re-executed.
BATCH_SIZE = 32
train_ds = train_ds.batch(BATCH_SIZE)
val_ds = val_ds.batch(BATCH_SIZE)
test_ds = test_ds.batch(BATCH_SIZE)

In [8]:
# TF Hub text-embedding modules.
# NOTE(review): `embedding` (50-dim NNLM) is not used by the model below; it
# is kept only in case another cell refers to it.
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
embedding_128 = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"

# Sentence encoder: takes scalar string inputs and emits 128-dim vectors.
# Its weights stay frozen (hub.KerasLayer's default, made explicit here), so
# only the dense head is trained.
hub_layer = hub.KerasLayer(
    embedding_128,
    dtype=tf.string,
    input_shape=[],
    output_shape=[128],
    trainable=False,
)

# Use tf.keras throughout: hub.KerasLayer is a tf.keras layer and the model is
# compiled with a tf.keras optimizer, so the standalone `keras` package should
# not be mixed in.
model_new = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    # NOTE(review): 10 output units — confirm this matches the number of
    # encoded emotion classes.
    tf.keras.layers.Dense(10, activation='softmax'),
])

model_new.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 128)               124642688 
_________________________________________________________________
dense (Dense)                (None, 128)               16512     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 10)                330       
Total params: 124,669,866
Trainable params: 27,178
Non-trainable params: 124,642,688
_________________________________________________________________


In [9]:
# Enable the TensorBoard notebook extension (provides the %tensorboard magic).
%load_ext tensorboard


In [10]:

# Compile with sparse categorical cross-entropy and track accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model_new.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Each run logs to its own timestamped sub-directory of "logs".
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", run_stamp)
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=1)

history = model_new.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[tensorboard_cb],
)

# Open TensorBoard on the parent "logs" directory, under which each run
# writes its own timestamped sub-directory.
%tensorboard --logdir logs

Epoch 1/10
