In [1]:
import numpy as np
import pandas as pd
import emoji
import datetime

from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Embedding
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split  # Import for splitting the data
from gensim.models import KeyedVectors

2024-02-24 21:59:03.211105: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-24 21:59:03.616830: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-24 21:59:03.616985: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-24 21:59:03.665365: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-24 21:59:03.767102: I tensorflow/core/platform/cpu_feature_guar

Emoji Dictionary

In [2]:
emoji_dictionary = {
    0: ":red_heart:",  # Love
    1: ":face_with_tears_of_joy:",  # Laughter
    2: ":smiling_face_with_heart-eyes:",  # Adoration
    3: ":loudly_crying_face:",  # Sadness
    4: ":fire:",  # Excitement
    5: ":thumbs_up:",  # Approval
    6: ":folded_hands:",  # Gratitude
    7: ":angry_face:",  # Anger
    8: ":sparkles:",  # Happiness
    9: ":weary_face:",  # Exhaustion
    10: ":astonished_face:",  # Surprise
    11: ":confused_face:",  # Confusion
    12: ":tropical_drink:",  # CelebrationS
    13: ":broken_heart:",  # Heartbreak
    14: ":thinking_face:",  # Contemplation
    15: ":sleeping_face:",  # Sleepiness
    16: ":victory_hand:",  # Success
    17: ":thumbs_down:",  # Disapproval
    18: ":grimacing_face:",  # Discomfort
    19: ":smiling_face_with_halo:",  # Innocence
}


Function to get labels from CLDR names

In [3]:
def label_to_emoji(label):
    return emoji.emojize(emoji_dictionary[label])

Reading data

In [4]:
data = pd.read_csv('data/emoji_data/emoji_data.csv', header=None)
data.head()

Unnamed: 0,0,1
0,When your alarm goes off for the fifth time,9
1,That moment when someone eats the last slice o...,7
2,When you finally finish a project,8
3,Me trying to understand the group chat,11
4,When the music is too loud at the party,18


In [5]:
X = data[0].values
Y = data[1].values


Embeddings

With glove dataset

In [6]:
# with open('data/glove_dataset/glove.6B.100d.txt','r', encoding='utf8') as file:
#     content = file.readlines()

In [7]:
# embeddings = {}

# for line in content:
#     line = line.split()
#     embeddings[line[0]] = np.array(line[1:], dtype=float)

With crawl dataset

In [8]:
fasttext_model_path = "data/fast_text/crawl_dataset/crawl-300d-2M-subword.vec"
fasttext_model = KeyedVectors.load_word2vec_format(fasttext_model_path)


In [9]:
# Convert FastText embeddings to a dictionary
embeddings = {}
for word in fasttext_model.index_to_key:
    embeddings[word] = fasttext_model.get_vector(word)


Convert input text into tokens

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
word_to_index = tokenizer.word_index


In [11]:
Xtokens = tokenizer.texts_to_sequences(X)


In [12]:
def get_maxlen(data):
    maxlen = 0
    for sent in data:
        maxlen = max(maxlen, len(sent))
    
    return maxlen
maxlen = get_maxlen(Xtokens)

maxlen

11

In [13]:
Xtrain = pad_sequences(Xtokens, maxlen=maxlen, padding='post', truncating='post')


In [14]:
Ytrain = to_categorical(Y)

Split data into train and test set

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(Xtrain, Ytrain, test_size=0.2, random_state=42)

Model

In [16]:
embed_size = 300

In [17]:
embedding_matrix = np.zeros((len(word_to_index) + 1, embed_size))

for word, i in word_to_index.items():
    if word in embeddings:
        embed_vector = embeddings[word]
        embedding_matrix[i] = embed_vector
    else:
        # Handle out-of-vocabulary words or phrases by aggregating subword embeddings
        phrase_embed_sum = None
        for subword in word.split():
            if subword in embeddings:
                if phrase_embed_sum is None:
                    phrase_embed_sum = embeddings[subword]
                else:
                    phrase_embed_sum += embeddings[subword]
        if phrase_embed_sum is not None:
            # Take the average of subword embeddings
            embedding_matrix[i] = phrase_embed_sum / len(word.split())

Code to generate file name for saving model

In [18]:
def returnfilename(prefix="model"):
    current_datetime = datetime.datetime.now()
    formatted_datetime = current_datetime.strftime('%Y-%m-%d_%H-%M-%S')
    filename = f"{prefix}{formatted_datetime}_epochs_{epochs}_layers_{layer1}_{layer2}_{layer3}_{layer4}"
    return filename


In general more layers >>

In [19]:
epochs = 50
#units
layer1 = 256
layer2 = 16
layer3 = 4
layer4 = 2

In [20]:
model = Sequential([
    Embedding(input_dim=len(word_to_index) + 1,
              output_dim=embed_size,
              input_length=maxlen,
              weights=[embedding_matrix],
              trainable=False),
    LSTM(units=layer1, return_sequences=True),
    LSTM(units=layer2, return_sequences=True),
    LSTM(units=layer3, return_sequences=True),
    LSTM(units=layer4),
    Dense(20, activation='softmax')  # Set output dimensionality to 20
])


model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

2024-02-24 22:04:34.489884: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-24 22:04:34.689533: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-24 22:04:34.690004: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [21]:
model.fit(Xtrain, Ytrain, epochs=epochs)

Epoch 1/50


2024-02-24 22:04:42.125167: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-02-24 22:04:42.378367: I external/local_xla/xla/service/service.cc:168] XLA service 0x7ff36e540400 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-24 22:04:42.378399: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2024-02-24 22:04:42.399513: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1708808682.580634    5496 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7ff487f30400>

In [22]:
filename = returnfilename()
model.save(f"models/{filename}.hdf5")

  saving_api.save_model(


In [23]:
loss, accuracy = model.evaluate(X_test, Y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 2.3576197624206543
Test Accuracy: 0.34375


In [24]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Convert predicted and actual labels from one-hot encoded format to integer labels
y_pred_labels = np.argmax(y_pred, axis=1)
Y_test_labels = np.argmax(Y_test, axis=1)

# Compare predicted labels with actual labels
correct_predictions = np.sum(y_pred_labels == Y_test_labels)
total_predictions = len(Y_test_labels)
accuracy = correct_predictions / total_predictions

# Print accuracy
print("Accuracy:", accuracy)

# Display predicted labels and actual labels
for i in range(len(y_pred_labels)):
    if y_pred_labels[i]!=Y_test_labels[i]:
        print("Predicted:", label_to_emoji(y_pred_labels[i]), "Actual:", label_to_emoji(Y_test_labels[i]))


Accuracy: 0.34375
Predicted: 😠 Actual: ✌️
Predicted: 🤔 Actual: 😕
Predicted: 🤔 Actual: 👍
Predicted: 😬 Actual: 😩
Predicted: 😬 Actual: 😂
Predicted: 😠 Actual: 😴
Predicted: 😠 Actual: 😕
Predicted: 😠 Actual: 🙏
Predicted: 🤔 Actual: 😲
Predicted: 😠 Actual: ✌️
Predicted: 🤔 Actual: 😩
Predicted: 😠 Actual: 😭
Predicted: 🤔 Actual: 😕
Predicted: 😬 Actual: 🍹
Predicted: 😬 Actual: 😩
Predicted: 😬 Actual: ✨
Predicted: 😬 Actual: 😩
Predicted: 😠 Actual: ✌️
Predicted: 😬 Actual: 😩
Predicted: 😬 Actual: 😇
Predicted: 😠 Actual: 🙏


In [25]:
# Make predictions
test = ["I am trying", "I want to cry", "This is just sad"]
test_seq = tokenizer.texts_to_sequences(test)
Xtest = pad_sequences(test_seq, maxlen=maxlen, padding='post', truncating='post')
y_pred = model.predict(Xtest)
y_pred = np.argmax(y_pred, axis=1)



In [26]:
for i in range(len(test)):
    print(test[i], label_to_emoji(y_pred[i]))

I am trying 😠
I want to cry 😠
This is just sad 😬
