In [96]:
import numpy as np
import pandas as pd
import emoji

from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Embedding
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split  # Import for splitting the data
from gensim.models import KeyedVectors

In [257]:
data = pd.read_csv('data/emoji_data/emoji_data.csv', header=None)
data.head()

Unnamed: 0,0,1
0,When your alarm goes off for the fifth time,9
1,That moment when someone eats the last slice o...,7
2,When you finally finish a project,8
3,Me trying to understand the group chat,11
4,When the music is too loud at the party,18


Emoji Dictionary

In [258]:
emoji_dictionary = {
    0: ":red_heart:",  # Love
    1: ":face_with_tears_of_joy:",  # Laughter
    2: ":smiling_face_with_heart-eyes:",  # Adoration
    3: ":loudly_crying_face:",  # Sadness
    4: ":fire:",  # Excitement
    5: ":thumbs_up:",  # Approval
    6: ":folded_hands:",  # Gratitude
    7: ":angry_face:",  # Anger
    8: ":sparkles:",  # Happiness
    9: ":weary_face:",  # Exhaustion
    10: ":astonished_face:",  # Surprise
    11: ":confused_face:",  # Confusion
    12: ":tropical_drink:",  # Celebration
    13: ":broken_heart:",  # Heartbreak
    14: ":thinking_face:",  # Contemplation
    15: ":sleeping_face:",  # Sleepiness
    16: ":victory_hand:",  # Success
    17: ":thumbs_down:",  # Disapproval
    18: ":grimacing_face:",  # Discomfort
    19: ":smiling_face_with_halo:",  # Innocence
}


In [259]:

def label_to_emoji(label):
    return emoji.emojize(emoji_dictionary[label])

In [261]:
X = data[0].values
Y = data[1].values

X

array(['When your alarm goes off for the fifth time ',
       'That moment when someone eats the last slice of pizza ',
       'When you finally finish a project ',
       'Me trying to understand the group chat ',
       'When the music is too loud at the party ',
       'That feeling when you see your crush ',
       'When dessert ruins your healthy eating plans ',
       'Waking up on Monday morning ',
       "Pretending to laugh at your boss's joke ",
       'Pretending to listen to a long story ',
       "Finding out there's free food at work ", 'Stuck in traffic ',
       'Treating yourself even when broke ',
       'Trying to adult but failing ',
       'Trying to stay awake during a boring lecture ',
       "Realizing it's only Wednesday ",
       'Trying to be positive but life keeps testing you ',
       'Making plans for the weekend but ending up in bed ',
       'Being the only one not having fun at a party ',
       "Accidentally liking someone's old post ",
       'Figuri

Embeddings

In [262]:
# with open('data/glove_dataset/glove.6B.100d.txt','r', encoding='utf8') as file:
#     content = file.readlines()

In [148]:
fasttext_model_path = "data/fast_text/crawl_dataset/crawl-300d-2M-subword.vec"
fasttext_model = KeyedVectors.load_word2vec_format(fasttext_model_path)


KeyboardInterrupt: 

In [263]:
# Convert FastText embeddings to a dictionary
embeddings = {}
for word in fasttext_model.index_to_key:
    embeddings[word] = fasttext_model.get_vector(word)


In [264]:
# embeddings = {}

# for line in content:
#     line = line.split()
#     embeddings[line[0]] = np.array(line[1:], dtype=float)

convert input text into tokens

In [265]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
word_to_index = tokenizer.word_index


In [266]:
Xtokens = tokenizer.texts_to_sequences(X)


In [267]:
def get_maxlen(data):
    maxlen = 0
    for sent in data:
        maxlen = max(maxlen, len(sent))
    
    return maxlen
maxlen = get_maxlen(Xtokens)

maxlen

11

In [268]:
Xtrain = pad_sequences(Xtokens, maxlen=maxlen, padding='post', truncating='post')


In [269]:
Ytrain = to_categorical(Y)


In [270]:
X_train, X_test, Y_train, Y_test = train_test_split(Xtrain, Ytrain, test_size=0.2, random_state=42)

print(len(X_test))
print(len(X_train))
print(len(Y_test))
print(len(Y_train))

32
128
32
128


Model

In [271]:
embed_size = 300

In [272]:
embedding_matrix = np.zeros((len(word_to_index) + 1, embed_size))

for word, i in word_to_index.items():
    if word in embeddings:
        embed_vector = embeddings[word]
        embedding_matrix[i] = embed_vector
    else:
        # Handle out-of-vocabulary words or phrases by aggregating subword embeddings
        phrase_embed_sum = None
        for subword in word.split():
            if subword in embeddings:
                if phrase_embed_sum is None:
                    phrase_embed_sum = embeddings[subword]
                else:
                    phrase_embed_sum += embeddings[subword]
        if phrase_embed_sum is not None:
            # Take the average of subword embeddings
            embedding_matrix[i] = phrase_embed_sum / len(word.split())

In [279]:
model = Sequential([
    Embedding(input_dim=len(word_to_index) + 1,
              output_dim=embed_size,
              input_length=maxlen,
              weights=[embedding_matrix],
              trainable=False),
    LSTM(units=50, return_sequences=True),
    LSTM(units=50),
    Dense(20, activation='softmax')  # Set output dimensionality to 20
])


model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [280]:
model.fit(Xtrain, Ytrain, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1e3d2b43460>

In [281]:
loss, accuracy = model.evaluate(X_test, Y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 0.15355908870697021
Test Accuracy: 0.96875


In [282]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Convert predicted and actual labels from one-hot encoded format to integer labels
y_pred_labels = np.argmax(y_pred, axis=1)
Y_test_labels = np.argmax(Y_test, axis=1)

# Compare predicted labels with actual labels
correct_predictions = np.sum(y_pred_labels == Y_test_labels)
total_predictions = len(Y_test_labels)
accuracy = correct_predictions / total_predictions

# Print accuracy
print("Accuracy:", accuracy)

# Display predicted labels and actual labels
for i in range(len(y_pred_labels)):
    print("Predicted:", label_to_emoji(y_pred_labels[i]), "Actual:", label_to_emoji(Y_test_labels[i]))


Accuracy: 0.96875
Predicted: ✌️ Actual: ✌️
Predicted: 😕 Actual: 😕
Predicted: 👍 Actual: 👍
Predicted: 😩 Actual: 😩
Predicted: 😂 Actual: 😂
Predicted: 😬 Actual: 😬
Predicted: 😴 Actual: 😴
Predicted: 😠 Actual: 😕
Predicted: 🤔 Actual: 🤔
Predicted: 🙏 Actual: 🙏
Predicted: 😲 Actual: 😲
Predicted: ✌️ Actual: ✌️
Predicted: 😩 Actual: 😩
Predicted: 😠 Actual: 😠
Predicted: 😭 Actual: 😭
Predicted: 😬 Actual: 😬
Predicted: 😕 Actual: 😕
Predicted: 🍹 Actual: 🍹
Predicted: 😠 Actual: 😠
Predicted: 😬 Actual: 😬
Predicted: 😠 Actual: 😠
Predicted: 😩 Actual: 😩
Predicted: ✨ Actual: ✨
Predicted: 😬 Actual: 😬
Predicted: 😩 Actual: 😩
Predicted: ✌️ Actual: ✌️
Predicted: 😩 Actual: 😩
Predicted: 😬 Actual: 😬
Predicted: 🤔 Actual: 🤔
Predicted: 🤔 Actual: 🤔
Predicted: 😇 Actual: 😇
Predicted: 🙏 Actual: 🙏


In [283]:
# Make predictions
test = ["I am trying", "I want to cry", "This is just sad"]
test_seq = tokenizer.texts_to_sequences(test)
Xtest = pad_sequences(test_seq, maxlen=maxlen, padding='post', truncating='post')
y_pred = model.predict(Xtest)
y_pred = np.argmax(y_pred, axis=1)



In [288]:
for i in range(len(test)):
    print(test[i], label_to_emoji(y_pred[i]))

I am trying 😭
I want to cry 😭
This is just sad 😩
