# Libraries and Files


In [2]:
import pandas as pd
import numpy as np
import re
import torch
import ast
import pickle


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

elmo = hub.load("https://tfhub.dev/google/elmo/3").signatures["default"]

In [4]:
df=pd.read_csv("../Data/Datasets/emojify_cleaned_10k_labelled.csv")

In [5]:
df.head(20)

Unnamed: 0,Tokens,Sentiment_score,Sentiment_emotion,Part_of_speech
0,"['if', 'not', 'later', ',', 'when', '?', '🍑']",1,Joy,NOUN
1,"['🔗', '|', 'the', 'izombie', '4x08', 'page', '...",0,Surprise,NOUN
2,"['hearing', 'bts', 'at', 'work', 'still', 'ama...",1,Love,VERB
3,"['ayyy', 'this', 'is', 'lit', '🔥']",1,Joy,ADJ
4,"['well', 'richard', 'i', 'jumped', 'in', 'the'...",-1,Anger,ADJ
5,"[""don't"", 'worry', 'benny', 'was', 'equally', ...",-1,Disgust,ADJ
6,"['it', 'be', 'so', 'funny', 'when', 'the', 'sa...",-1,Sadness,VERB
7,"['dead', 'lost', 'blind', 'and', 'deaf', 'paga...",-1,Disgust,ADJ
8,"['jimin', 'chimmy', '😍']",1,Love,PROPN
9,"['oh', 'god', 'i', 'have', 'work', 'tmr', 'mor...",-1,Sadness,VERB


## Getting Embeddings

In [87]:
df['Sentence'] = df['Tokens'].apply(lambda x: ' '.join(eval(x)))

df.head(20)

Unnamed: 0,Tokens,Sentiment_score,Sentiment_emotion,Part_of_speech,Sentence
0,"['if', 'not', 'later', ',', 'when', '?', '🍑']",1,Joy,NOUN,"if not later , when ? 🍑"
1,"['🔗', '|', 'the', 'izombie', '4x08', 'page', '...",0,Surprise,NOUN,🔗 | the izombie 4x08 page is now up ! containi...
2,"['hearing', 'bts', 'at', 'work', 'still', 'ama...",1,Love,VERB,hearing bts at work still amazes me 😍
3,"['ayyy', 'this', 'is', 'lit', '🔥']",1,Joy,ADJ,ayyy this is lit 🔥
4,"['well', 'richard', 'i', 'jumped', 'in', 'the'...",-1,Anger,ADJ,well richard i jumped in the shower and saved ...
5,"[""don't"", 'worry', 'benny', 'was', 'equally', ...",-1,Disgust,ADJ,don't worry benny was equally unimpressed with...
6,"['it', 'be', 'so', 'funny', 'when', 'the', 'sa...",-1,Sadness,VERB,it be so funny when the same niggas be in you ...
7,"['dead', 'lost', 'blind', 'and', 'deaf', 'paga...",-1,Disgust,ADJ,dead lost blind and deaf pagans all have some ...
8,"['jimin', 'chimmy', '😍']",1,Love,PROPN,jimin chimmy 😍
9,"['oh', 'god', 'i', 'have', 'work', 'tmr', 'mor...",-1,Sadness,VERB,oh god i have work tmr morning 😭


In [None]:
# Helper function to check if a token is an emoji
def is_emoji(token):
    emoji_pattern = re.compile(
        "["  # Emoji ranges
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map
        "\U0001F700-\U0001F77F"  # Alchemical Symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed characters
        "\U000000A9-\U000000AE"  # © (Copyright) and ® (Registered)
        "\U0000203C-\U0000203D"  # Exclamation/question marks
        "\U00002194-\U00002199"  # Arrows
        "\U00002300-\U000023FF"  # Miscellaneous Technical
        "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
        "]+", flags=re.UNICODE
    )
    return bool(emoji_pattern.match(token))

# Function to find emoji indices in a tokenized tweet
def find_emoji_indices(tokenized_tweet):
    emoji_indices = [index for index, token in enumerate(tokenized_tweet) if is_emoji(token)]
    return emoji_indices


In [89]:
# Function to extract ELMo embeddings for emojis in a tweet
def get_emoji_embeddings(row):
    tokens = eval(row["Tokens"])  # Convert string representation to a list
    sentence = " ".join(tokens)  # Reconstruct the sentence
    emoji_index = find_emoji_indices(tokens)[0]  # Find the emoji index in the tokenized tweet

    if not emoji_index:  # Skip if no emojis are found
        return None

    # Get ELMo embeddings for the sentence
    elmo_output = elmo(tf.constant([sentence]))["elmo"].numpy()

    # Extract embeddings for each emoji
    emoji_embeddings = elmo_output[0, emoji_index, :]
    return emoji_embeddings

# Apply the function to each row in the dataframe
embeddings_with_indices = {}
for idx, row in df.iterrows():
    try:
        print('index', idx)
        emoji_embeddings = get_emoji_embeddings(row)
        if emoji_embeddings is not None:
            embeddings_with_indices[idx]=emoji_embeddings
    except Exception as e:
        print(f"Error processing row {idx}: {e}")

# Save results into a pickle file
with open("emoji_10k_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings_with_indices, f)

print("Embeddings successfully saved to emoji_10k_embeddings.pkl")

index 0
index 1
index 2
index 3
index 4
index 5
index 6
index 7
index 8
index 9
index 10
index 11
index 12
index 13
index 14
index 15
index 16
index 17
index 18
index 19
index 20
index 21
index 22
index 23
index 24
index 25
index 26
index 27
index 28
index 29
index 30
index 31
index 32
index 33
index 34
index 35
index 36
index 37
index 38
index 39
index 40
index 41
index 42
index 43
index 44
index 45
index 46
index 47
index 48
index 49
index 50
index 51
index 52
index 53
index 54
index 55
index 56
index 57
index 58
index 59
index 60
index 61
index 62
index 63
index 64
index 65
index 66
index 67
index 68
index 69
index 70
index 71
index 72
index 73
index 74
index 75
index 76
index 77
index 78
index 79
index 80
index 81
index 82
index 83
index 84
index 85
index 86
index 87
index 88
index 89
index 90
index 91
index 92
index 93
index 94
index 95
index 96
index 97
index 98
index 99
index 100
index 101
index 102
index 103
index 104
index 105
index 106
index 107
index 108
index 109
index 110


In [None]:
# Load the pickle file
with open("emoji_10k_embeddings.pkl", "rb") as f:
    emoji_data = pickle.load(f)


9441
