In [72]:
import numpy as np
import pandas as pd
import emoji

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Embedding
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

In [73]:
# Read the labelled sentences. The CSV has no header row, so pandas assigns
# integer column labels (0 and 1), which the cells below index by.
data = pd.read_csv("emoji_data.csv", header=None)
data.head(n=5)

Unnamed: 0,0,1
0,French macaroon is so tasty,4
1,work is horrible,3
2,I am upset,3
3,throw the ball,1
4,Good joke,2


In [74]:
# Class label (0-4) -> emoji shortcode, in label order.
emoji_dictionary = dict(enumerate([
    ":red_heart:",
    ":baseball:",
    ":grinning_face_with_big_eyes:",
    ":disappointed_face:",
    ":fork_and_knife_with_plate:",
]))


def label_to_emoji(label):
    """Return the emoji character for a class label.

    Looks up the shortcode stored under ``label`` in ``emoji_dictionary``
    and hands it to ``emoji.emojize``.

    Raises:
        KeyError: if ``label`` is not a key of ``emoji_dictionary``.
    """
    shortcode = emoji_dictionary[label]
    return emoji.emojize(shortcode)

In [75]:
# Column 0 is used as the model input (sentences), column 1 as the target
# (integer class labels); both are pulled out as NumPy arrays.
X, Y = data[0].values, data[1].values

Y

array([4, 3, 3, 1, 2, 1, 4, 3, 4, 1, 3, 3, 2, 2, 4, 3, 2, 3, 3, 1, 3, 2,
       2, 2, 0, 1, 0, 4, 2, 0, 2, 0, 0, 3, 4, 0, 2, 1, 3, 1, 0, 4, 0, 3,
       0, 4, 2, 3, 4, 2, 2, 3, 0, 2, 2, 3, 2, 3, 2, 2, 3, 3, 0, 2, 3, 0,
       2, 0, 0, 2, 3, 2, 4, 1, 3, 3, 0, 0, 3, 2, 0, 3, 0, 2, 2, 4, 2, 2,
       0, 0, 2, 3, 0, 4, 2, 1, 2, 3, 3, 2, 3, 0, 3, 0, 2, 0, 2, 3, 4, 3,
       1, 3, 4, 3, 2, 3, 3, 3, 1, 4, 4, 2, 2, 1, 1, 2, 3, 2, 3, 4, 2, 3,
       0, 2, 0, 0, 4, 3, 4, 2, 3, 2, 3, 4, 2, 1, 2, 4, 3, 1, 3, 2, 3, 2,
       2, 3, 3, 2, 4, 0, 0, 0, 3, 0, 0, 1, 1, 2, 2, 2, 0, 3, 2, 3, 3, 1,
       2, 2, 4, 2, 3, 1, 2], dtype=int64)

Embeddings

In [76]:
# Load every line of the GloVe vector file into a list. Iterating the file
# object yields the same newline-terminated lines as `readlines()`.
with open('data/glove_dataset/glove.6B.100d.txt', mode='r', encoding='utf8') as file:
    content = list(file)

In [77]:
# Each line is split on whitespace: the first field becomes the dictionary
# key (the word) and the remaining fields are parsed into a float vector.
embeddings = {
    fields[0]: np.array(fields[1:], dtype=float)
    for fields in map(str.split, content)
}

Convert input text into tokens

In [78]:
# Fit a tokenizer on the training sentences and keep its word -> integer
# index mapping for building the embedding matrix later.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=X)
word_to_index = tokenizer.word_index


In [79]:
# Encode every training sentence as a list of integer word indices.
Xtokens = tokenizer.texts_to_sequences(texts=X)


In [80]:
def get_maxlen(data):
    """Return the length of the longest item in ``data``.

    Args:
        data: an iterable of sized objects (e.g. a list of token lists).

    Returns:
        The largest ``len(item)`` found, or 0 when ``data`` is empty.
    """
    return max((len(item) for item in data), default=0)
# Longest tokenised training sentence; used as the fixed sequence length.
maxlen = get_maxlen(data=Xtokens)

In [81]:
# Bring every sequence to length `maxlen`; padding (and any truncation) is
# applied at the end of the sequence.
Xtrain = pad_sequences(
    sequences=Xtokens,
    maxlen=maxlen,
    padding='post',
    truncating='post',
)

In [82]:
# One-hot encode the integer labels. The number of classes is pinned to the
# size of `emoji_dictionary` rather than inferred from max(Y) + 1, so the
# target width always matches the 5-unit softmax output even if the highest
# label happens to be absent from the data.
Ytrain = to_categorical(Y, num_classes=len(emoji_dictionary))

Model

In [83]:
# Width of each word vector. Must match the dimensionality of the GloVe file
# loaded above (glove.6B.100d -> 100), since rows of the embedding matrix are
# filled directly from those vectors.
embed_size = 100

In [84]:
# Row i of the matrix holds the GloVe vector of the word whose tokenizer
# index is i. Row 0 stays all-zero: tokenizer indices start at 1 and 0 is the
# padding value.
embedding_matrix = np.zeros((len(word_to_index) + 1, embed_size))

for word, i in word_to_index.items():
    # Use .get so a vocabulary word that is missing from GloVe (typo, rare
    # token, ...) keeps its zero row instead of aborting with a KeyError.
    embed_vector = embeddings.get(word)
    if embed_vector is not None:
        embedding_matrix[i] = embed_vector

In [85]:
# Two stacked LSTMs over frozen, pre-trained word vectors, followed by a
# 5-way softmax (one unit per emoji class).
model = Sequential([
    Embedding(input_dim = len(word_to_index)+1,
              output_dim = embed_size,
              input_length=maxlen,
              weights = [embedding_matrix],
              # Sequences are post-padded with 0. Masking index 0 makes the
              # LSTMs skip the padding steps, so the final state comes from
              # the last real word rather than from trailing padding.
              mask_zero = True,
              # Keep the pre-trained vectors fixed during training.
              trainable = False
              ),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(units = 16, return_sequences=True),
    LSTM(units = 4),
    Dense(5, activation='softmax')
])

# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [90]:
# Train for at most 1000 epochs, but stop automatically once the training
# loss has not improved for 20 consecutive epochs, so the run no longer needs
# to be interrupted by hand. No validation data is passed to fit, so the
# training loss is the monitored quantity. The best weights seen are restored.
early_stopping = EarlyStopping(monitor='loss', patience=20, restore_best_weights=True)

model.fit(Xtrain, Ytrain, epochs=1000, callbacks=[early_stopping])

Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoc

KeyboardInterrupt: 

In [91]:
# Unseen sentences to classify.
test = ["I am trying", "I want to cry", "This is just sad"]

# Apply the same preprocessing as for training: encode with the fitted
# tokenizer, then pad/truncate at the end to `maxlen`.
test_seq = tokenizer.texts_to_sequences(test)
Xtest = pad_sequences(test_seq, maxlen=maxlen, padding='post', truncating='post')

# The predicted class is the index of the largest softmax output per row.
y_pred = model.predict(Xtest).argmax(axis=1)
y_pred



array([3, 2, 2], dtype=int64)

In [97]:
# Show each test sentence next to the emoji of its predicted class.
for sentence, label in zip(test, y_pred):
    print(sentence, label_to_emoji(label))

I am trying 😞
I want to cry 😃
This is just sad 😃
