In [26]:
import numpy as np
import pandas as pd
import emoji
import datetime

from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Embedding
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split  # Import for splitting the data
from gensim.models import KeyedVectors

In [257]:
# Load the emoji dataset; header=None means the CSV's first row is treated as data
# and the columns are addressed by position (0 and 1).
# NOTE(review): the identical load is repeated in the "Reading data" cell further
# down, and this cell's execution count is out of sequence — confirm which one is needed.
data = pd.read_csv('data/emoji_data/emoji_data.csv', header=None)
data.head()

Unnamed: 0,0,1
0,When your alarm goes off for the fifth time,9
1,That moment when someone eats the last slice o...,7
2,When you finally finish a project,8
3,Me trying to understand the group chat,11
4,When the music is too loud at the party,18


Emoji Dictionary

In [None]:
# Maps an integer class label to the CLDR short name of the emoji that represents it.
# NOTE(review): only labels 0-9 are mapped here, while the dataset preview shows
# labels such as 11 and 18 and the model's output layer has 20 units — labels >= 10
# have no entry; confirm the intended mapping for them.
emoji_dictionary = {
    0: ":red_heart:",  # Love
    1: ":face_with_tears_of_joy:",  # Laughter
    2: ":grinning_face_with_big_eyes:", # Happiness
    3: ":loudly_crying_face:",  # Sadness
    4: ":smiling_face_with_heart-eyes:",  # Adoration
    5: ":fire:",  # Excitement
    6: ":thumbs_up:",  # Approval
    7: ":folded_hands:",  # Gratitude
    8: ":angry_face:",  # Anger
    9: ":thinking_face:",  # Contemplation
}


Function to convert a class label into its emoji (via the CLDR short name)

In [36]:
def label_to_emoji(label):
    """Return the emoji character for an integer class label.

    The label is looked up in ``emoji_dictionary`` to get a CLDR short name,
    which ``emoji.emojize`` turns into the actual emoji.

    Args:
        label: Integer class label (plain ``int`` or NumPy integer).

    Returns:
        The emoji string for a mapped label, or a readable placeholder such as
        ``"<unmapped label 11>"`` when the label has no dictionary entry.
    """
    cldr_name = emoji_dictionary.get(label)
    if cldr_name is None:
        # The dataset and the model use more classes than the dictionary maps;
        # a placeholder keeps the display loops running instead of raising KeyError.
        return f"<unmapped label {label}>"
    return emoji.emojize(cldr_name)

Reading data

In [37]:
# Read the dataset without a header row, so columns are addressed by position.
# Per the preview below, column 0 holds the sentence and column 1 its integer label.
data = pd.read_csv('data/emoji_data/emoji_data.csv', header=None)
data.head()

Unnamed: 0,0,1
0,When your alarm goes off for the fifth time,9
1,That moment when someone eats the last slice o...,7
2,When you finally finish a project,8
3,Me trying to understand the group chat,11
4,When the music is too loud at the party,18


In [None]:
# Column 0 (sentences) becomes the model input, column 1 (integer labels) the target.
X = data[0].values
Y = data[1].values


Embeddings

With glove dataset

In [None]:
# with open('data/glove_dataset/glove.6B.100d.txt','r', encoding='utf8') as file:
#     content = file.readlines()

In [None]:
# embeddings = {}

# for line in content:
#     line = line.split()
#     embeddings[line[0]] = np.array(line[1:], dtype=float)

With fastText Common Crawl vectors (crawl-300d-2M-subword)

In [None]:
# Load the pretrained fastText vectors from the .vec file into a gensim KeyedVectors object.
# NOTE(review): the file name suggests ~2M 300-d vectors, so this load is likely slow
# and memory-heavy — confirm before re-running.
fasttext_model_path = "data/fast_text/crawl_dataset/crawl-300d-2M-subword.vec"
fasttext_model = KeyedVectors.load_word2vec_format(fasttext_model_path)


In [None]:
# Build a plain {word: vector} lookup from every key in the loaded fastText model
embeddings = {
    token: fasttext_model.get_vector(token)
    for token in fasttext_model.index_to_key
}


Convert input text into tokens

In [None]:
# Fit a Keras Tokenizer on the raw sentences and keep its word -> integer index mapping,
# which is reused later to size and fill the embedding matrix.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
word_to_index = tokenizer.word_index


In [None]:
# Encode each sentence as a list of integer token indices using the fitted tokenizer.
Xtokens = tokenizer.texts_to_sequences(X)


In [None]:
def get_maxlen(data):
    """Return the length of the longest sequence in ``data``.

    Args:
        data: Iterable of sized sequences (e.g. lists of token indices).

    Returns:
        The largest ``len(sequence)`` found, or 0 when ``data`` is empty.
    """
    return max((len(sequence) for sequence in data), default=0)
# Longest tokenized sentence; used as the fixed sequence length for padding and the model input.
maxlen = get_maxlen(Xtokens)

maxlen

In [None]:
# Pad (or truncate) every token sequence at its end so all rows have length maxlen.
Xtrain = pad_sequences(Xtokens, maxlen=maxlen, padding='post', truncating='post')


In [None]:
# One-hot encode the integer labels.
# NOTE(review): no num_classes is passed, so the width is presumably inferred from Y,
# while the model's output layer is hard-coded to 20 units — confirm the two agree.
Ytrain = to_categorical(Y)

Split data into train and test set

In [None]:
# Hold out 20% of the encoded data for testing; random_state is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(Xtrain, Ytrain, test_size=0.2, random_state=42)

Model

In [None]:
# Width of each word vector; 300 matches the "300d" fastText file loaded earlier in the notebook.
embed_size = 300

In [None]:
# Row i holds the pretrained vector of the token whose tokenizer index is i.
# Rows that never get a vector (including row 0, presumably the padding index —
# confirm) stay all-zero.
embedding_matrix = np.zeros((len(word_to_index) + 1, embed_size))

for word, i in word_to_index.items():
    if word in embeddings:
        embedding_matrix[i] = embeddings[word]
        continue

    # Out-of-vocabulary entry: fall back to its whitespace-separated parts.
    parts = word.split()
    found_vectors = [embeddings[part] for part in parts if part in embeddings]
    if found_vectors:
        # np.sum allocates a fresh array, so the vectors stored in `embeddings`
        # are never modified. The previous `+=` accumulated in place into the
        # first stored vector, corrupting it (or failing on a read-only array).
        # The divisor is the total number of parts, so parts without a vector
        # count as zero vectors, exactly as before.
        embedding_matrix[i] = np.sum(found_vectors, axis=0) / len(parts)

In general, more layers tend to work better, so the model below stacks several LSTM layers.

In [None]:
# Stacked-LSTM classifier on top of the frozen pretrained embedding matrix
model = Sequential()
model.add(
    Embedding(
        input_dim=len(word_to_index) + 1,
        output_dim=embed_size,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,
    )
)
# Intermediate LSTM layers return full sequences so the next LSTM can consume them
for hidden_units in (256, 16, 4):
    model.add(LSTM(units=hidden_units, return_sequences=True))
# The last LSTM returns only its final state, which feeds the classifier head
model.add(LSTM(units=2))
# Output dimensionality is 20 (one unit per class)
model.add(Dense(20, activation='softmax'))


model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Code to generate file name for saving model

In [None]:
def returnfilename(prefix="model"):
    """Build a timestamped file name.

    Args:
        prefix: Text placed directly in front of the timestamp (no separator is added).

    Returns:
        ``prefix`` followed by the current local time as ``YYYY-MM-DD_HH-MM-SS``.
    """
    # datetime's format spec is forwarded to strftime
    return f"{prefix}{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}"


In [None]:
# Fit on the training split only. Fitting on the full Xtrain/Ytrain would also
# train on the rows held out as X_test/Y_test, making the test metrics meaningless.
model.fit(X_train, Y_train, epochs=50)

In [None]:
# Timestamped name intended for saving the trained model.
# NOTE(review): no visible cell uses `filename` afterwards — confirm whether a save step is missing.
filename = returnfilename()

In [None]:
# Score the trained model on the held-out split; with metrics=['accuracy'] at compile time,
# evaluate returns the loss followed by the accuracy.
loss, accuracy = model.evaluate(X_test, Y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

In [None]:
# Class probabilities for every held-out example
y_pred = model.predict(X_test)

# Collapse probability rows and one-hot rows into integer class labels
y_pred_labels = np.argmax(y_pred, axis=1)
Y_test_labels = np.argmax(Y_test, axis=1)

# Fraction of held-out examples whose predicted label matches the true one
total_predictions = len(Y_test_labels)
correct_predictions = np.sum(y_pred_labels == Y_test_labels)
accuracy = correct_predictions / total_predictions

print("Accuracy:", accuracy)

# Show only the misclassified examples, as predicted vs. actual emoji
for predicted, actual in zip(y_pred_labels, Y_test_labels):
    if predicted == actual:
        continue
    print("Predicted:", label_to_emoji(predicted), "Actual:", label_to_emoji(actual))


In [None]:
# Sanity-check the model on a few hand-written sentences
test = ["I am trying", "I want to cry", "This is just sad"]
# Encode and pad exactly like the training data
test_seq = tokenizer.texts_to_sequences(test)
Xtest = pad_sequences(test_seq, truncating='post', padding='post', maxlen=maxlen)
# Keep only the most probable class per sentence
y_pred = np.argmax(model.predict(Xtest), axis=1)

In [None]:
# Print each hand-written sentence next to its predicted emoji
for sentence, predicted_label in zip(test, y_pred):
    print(sentence, label_to_emoji(predicted_label))