# Libraries

In [31]:
import pandas as pd
from collections import Counter
import re
pd.set_option('display.max_colwidth', None)
import emoji
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
import ast


# Getting Overall Counts/Statistics

In [32]:
#Recognize emojis
emoji_pattern = re.compile(
        "["  # Emoji ranges
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map
        "\U0001F700-\U0001F77F"  # Alchemical Symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed characters
        "\U000000A9-\U000000AE"  # © (Copyright) and ® (Registered)
        "\U0000203C-\U0000203D"  # Exclamation/question marks
        "\U00002194-\U00002199"  # Arrows
        "\U00002300-\U000023FF"  # Miscellaneous Technical
        "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
        "]+", flags=re.UNICODE
    )

In [33]:
original = pd.read_csv("../Data/Datasets/emojify_cleaned_10k_labelled.csv")
original.iloc[[12, 14, 15, 16, 17, 19]]


Unnamed: 0,Tokens,Sentiment_score,Sentiment_emotion,Part_of_speech
12,"['ten', 'talent', '🐊']",0,Surprise,NOUN
14,"['handsome', 'liam', '😍']",1,Love,ADJ
15,"['so', 'glad', 'i', 'seen', 'you', 'two', 'last', 'night', '💕']",1,Love,NOUN
16,"['go', 'listen', 'to', 'this', '!', '!', '🔥', '.']",1,Joy,NOUN
17,"['something', 'about', 'emma', '👎', 'the', 'hair']",-1,Disgust,ADJ
19,"['😭', 'i', 'adore', 'you', 'thank', 'you', 'lexi']",-1,Sadness,ADJ


In [34]:
original['Tokens_list'] = original['Tokens'].apply(ast.literal_eval)
lengths = original['Tokens_list'].apply(len)
stats = {
    "Min": [lengths.min()],
    "Max": [lengths.max()],
    "Mean": [lengths.mean()],
    "StdDev": [lengths.std()],
    "Count": [lengths.count()]
}
stats_df = pd.DataFrame(stats)
stats_df
# import ace_tools as tools; tools.display_dataframe_to_user(name="Token Length Statistics", dataframe=stats_df)

Unnamed: 0,Min,Max,Mean,StdDev,Count
0,1,39,12.3012,7.375332,10000


In [42]:
# Overall Emoji Counts (across all rows in df)
original = pd.read_csv("../Data/Datasets/emojify_cleaned_10k_labelled.csv")
original['idx'] = original.index

# Extract emojis for each row and create an Emoji column
def extract_emojis(tokens):
    emojis = [token for token in eval(tokens) if emoji_pattern.match(token)]
    return ", ".join(emojis)  # Join emojis as a comma-separated string

original['Emoji'] = original['Tokens'].apply(extract_emojis)

original.head()


Unnamed: 0,Tokens,Sentiment_score,Sentiment_emotion,Part_of_speech,idx,Emoji
0,"['if', 'not', 'later', ',', 'when', '?', '🍑']",1,Joy,NOUN,0,🍑
1,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']",0,Surprise,NOUN,1,🔗
2,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']",1,Love,VERB,2,😍
3,"['ayyy', 'this', 'is', 'lit', '🔥']",1,Joy,ADJ,3,🔥
4,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']",-1,Anger,ADJ,4,😡


In [43]:
# Dictionary to store emojis and their row indexes
emoji_row_indexes = {}

# Iterate through rows to find emojis and track their indexes
overall_emoji_occurrences = []  # Flattened list of all emoji occurrences
for index, row in original.iterrows():
    tokens = eval(row['Tokens'])  # Convert string to list
    for token in tokens:
        if emoji_pattern.match(token):
            overall_emoji_occurrences.append(token)  # Track occurrence
            if token not in emoji_row_indexes:
                emoji_row_indexes[token] = []  # Initialize list if emoji is new
            emoji_row_indexes[token].append(index)  # Add row index

# Create the overall emoji counts DataFrame
overall_emoji_counts = pd.DataFrame.from_dict(
    Counter(overall_emoji_occurrences), orient='index', columns=['Overall_Frequency']
)
overall_emoji_counts.index.name = 'Emoji'
overall_emoji_counts = overall_emoji_counts.reset_index().sort_values(by='Overall_Frequency', ascending=False)

# Add the row indexes to the DataFrame for the top emojis
overall_emoji_counts['Row_Indexes'] = overall_emoji_counts['Emoji'].map(emoji_row_indexes)




In [44]:
overall_emoji_counts

Unnamed: 0,Emoji,Overall_Frequency,Row_Indexes
16,😂,1049,"[24, 36, 40, 45, 50, 51, 84, 88, 95, 101, 109, 119, 145, 158, 161, 164, 168, 175, 178, 201, 203, 212, 231, 242, 257, 261, 270, 275, 281, 291, 297, 300, 307, 315, 324, 332, 336, 338, 347, 354, 369, 384, 407, 415, 421, 423, 426, 447, 452, 483, 491, 492, 526, 528, 529, 543, 570, 583, 584, 593, 598, 616, 623, 625, 655, 677, 679, 680, 698, 706, 708, 721, 733, 734, 735, 755, 762, 768, 796, 797, 802, 807, 817, 824, 826, 827, 831, 850, 852, 855, 857, 858, 868, 875, 879, 898, 906, 908, 930, 939, ...]"
18,️,850,"[25, 64, 78, 94, 105, 106, 112, 115, 144, 172, 186, 224, 239, 273, 288, 301, 311, 316, 317, 319, 322, 327, 341, 358, 360, 368, 396, 406, 431, 438, 472, 479, 484, 493, 499, 504, 520, 524, 553, 562, 567, 572, 599, 612, 620, 638, 657, 669, 699, 739, 740, 741, 748, 772, 776, 783, 785, 804, 840, 861, 865, 867, 876, 880, 886, 889, 896, 911, 921, 925, 941, 967, 977, 1011, 1027, 1032, 1054, 1061, 1080, 1095, 1104, 1108, 1111, 1114, 1119, 1145, 1148, 1154, 1175, 1178, 1192, 1194, 1213, 1230, 1248, 1263, 1266, 1328, 1330, 1349, ...]"
39,❤,534,"[53, 77, 78, 140, 144, 186, 217, 252, 288, 299, 301, 311, 316, 319, 327, 328, 358, 360, 364, 368, 406, 431, 438, 484, 499, 502, 524, 612, 620, 638, 657, 668, 699, 737, 739, 741, 748, 759, 769, 776, 785, 803, 840, 842, 853, 889, 911, 967, 1011, 1023, 1032, 1054, 1076, 1079, 1104, 1148, 1154, 1191, 1192, 1213, 1230, 1248, 1263, 1266, 1317, 1328, 1349, 1370, 1408, 1436, 1459, 1467, 1477, 1545, 1553, 1554, 1576, 1582, 1587, 1597, 1615, 1628, 1633, 1638, 1657, 1674, 1706, 1735, 1737, 1750, 1768, 1771, 1776, 1780, 1798, 1806, 1809, 1837, 1839, 1889, ...]"
6,😭,375,"[6, 9, 11, 19, 23, 27, 35, 42, 159, 162, 165, 211, 233, 250, 253, 398, 413, 446, 463, 469, 473, 490, 517, 554, 604, 621, 687, 723, 728, 774, 798, 816, 836, 859, 953, 960, 992, 1072, 1117, 1134, 1163, 1193, 1216, 1228, 1246, 1255, 1258, 1261, 1265, 1276, 1312, 1316, 1350, 1380, 1409, 1412, 1429, 1454, 1518, 1590, 1624, 1659, 1668, 1727, 1739, 1817, 1888, 1940, 1963, 1974, 2006, 2088, 2107, 2158, 2166, 2214, 2239, 2262, 2295, 2316, 2320, 2321, 2330, 2348, 2373, 2389, 2421, 2434, 2453, 2463, 2510, 2532, 2553, 2554, 2625, 2641, 2669, 2683, 2742, 2775, ...]"
2,😍,336,"[2, 8, 14, 118, 125, 131, 147, 151, 206, 213, 215, 256, 266, 295, 305, 391, 394, 488, 527, 532, 537, 592, 613, 639, 647, 744, 763, 834, 878, 1008, 1018, 1021, 1097, 1101, 1147, 1151, 1183, 1235, 1271, 1325, 1361, 1372, 1399, 1464, 1469, 1485, 1525, 1534, 1544, 1555, 1600, 1623, 1639, 1698, 1733, 1774, 1782, 1816, 1854, 1857, 1869, 1893, 1906, 1927, 1933, 1956, 1968, 2014, 2019, 2098, 2103, 2114, 2117, 2130, 2142, 2143, 2175, 2177, 2236, 2270, 2280, 2292, 2314, 2356, 2424, 2437, 2474, 2556, 2561, 2613, 2622, 2626, 2705, 2748, 2845, 2859, 2935, 3000, 3098, 3106, ...]"
...,...,...,...
697,🏍,1,[7760]
698,점,1,[7817]
699,🚷,1,[7862]
700,📩,1,[7867]


In [46]:
print(original.iloc[64])

Tokens               ['and', 'if', 'you', 'send', 'your', 'soundcloud', 'link', 'you', 'getting', 'blocked', '☺', '️']
Sentiment_score                                                                                                      1
Sentiment_emotion                                                                                                  Joy
Part_of_speech                                                                                                     ADJ
idx                                                                                                                 64
Emoji                                                                                                             ☺, ️
Name: 64, dtype: object


# Manual Checker

In [247]:
df.head(50)

Unnamed: 0,Tokens,Sentiment_score,Sentiment_emotion,Part_of_speech
0,"['if', 'not', 'later', ',', 'when', '?', '🍑']",1,Joy,NOUN
1,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']",0,Surprise,NOUN
2,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']",1,Love,VERB
3,"['ayyy', 'this', 'is', 'lit', '🔥']",1,Joy,ADJ
4,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']",-1,Anger,ADJ
5,"[""don't"", 'worry', 'benny', 'was', 'equally', 'unimpressed', 'with', 'how', 'i', 'handled', 'the', 'news', '🙄']",-1,Disgust,ADJ
6,"['it', 'be', 'so', 'funny', 'when', 'the', 'same', 'niggas', 'be', 'in', 'you', 'and', 'yo', 'friends', 'dm', '😭', 'like', 'boy', 'just', 'how', ""y'all"", 'sit', 'and', 'talk', 'about', 'hoes', 'we', 'tal', '…']",-1,Sadness,VERB
7,"['dead', 'lost', 'blind', 'and', 'deaf', 'pagans', 'all', 'have', 'some', 'form', 'of', '-', 'luck', 'gods', '🤢', '""']",-1,Disgust,ADJ
8,"['jimin', 'chimmy', '😍']",1,Love,PROPN
9,"['oh', 'god', 'i', 'have', 'work', 'tmr', 'morning', '😭']",-1,Sadness,VERB


In [224]:
# Summary of unique values and their counts
sentiment_emotion_summary = df['Sentiment_emotion'].value_counts()

# Display the summary
print("Summary of Sentiment_emotion column:")
print(sentiment_emotion_summary)


Summary of Sentiment_emotion column:
Sentiment_emotion
Joy          4271
Love         2087
Surprise     1264
Sadness       919
Disgust       676
Fear          335
sadness       208
Anger         179
disgust        44
fear            9
Confusion       4
anger           2
Calm            1
surprise        1
Name: count, dtype: int64


In [267]:
print(len(filtered_df))

2087


In [473]:
filtered_df = ada_merged_df[ada_merged_df['Sentiment_emotion'].str.lower()== 'joy']

# Create a list to track emoji occurrences and their original indices
emoji_occurrences = []
for original_idx, row in filtered_df.iterrows():
    tokens = eval(row['Tokens'])  # Convert string to list
    emojis_in_row = [token for token in tokens if emoji_pattern.match(token)]
    for emoji in emojis_in_row:
        emoji_occurrences.append({'Emoji': emoji, 'Original_Index': original_idx})

# Create a DataFrame of emoji occurrences
emoji_occurrences_df = pd.DataFrame(emoji_occurrences)
emoji_occurrences_df



Unnamed: 0,Emoji,Original_Index
0,🍑,0
1,🔥,3
2,🐊,12
3,🔥,16
4,👅,18
...,...,...
4115,😂,8990
4116,😂,8991
4117,😂,8994
4118,🎶,8995


In [474]:
# Group by emoji to count occurrences and list row indices
emoji_summary_df = emoji_occurrences_df.groupby('Emoji').agg({
    'Original_Index': list  # Collect all original row indices
}).reset_index()

# Add a frequency column
emoji_summary_df['Frequency'] = emoji_summary_df['Original_Index'].apply(len)

# Sort by frequency
emoji_summary_df = emoji_summary_df.sort_values(by='Frequency', ascending=False)

# Display the DataFrame
emoji_summary_df

Unnamed: 0,Emoji,Original_Index,Frequency
234,😂,"[24, 36, 40, 45, 50, 51, 84, 88, 95, 101, 109, 119, 145, 158, 161, 164, 168, 175, 178, 201, 203, 212, 231, 242, 257, 261, 270, 275, 281, 291, 297, 300, 307, 315, 324, 332, 336, 338, 347, 354, 369, 384, 407, 415, 421, 423, 426, 447, 452, 483, 491, 492, 526, 528, 529, 543, 570, 583, 584, 593, 598, 616, 623, 625, 655, 677, 679, 680, 698, 706, 708, 721, 733, 734, 735, 755, 762, 768, 796, 797, 802, 807, 817, 824, 826, 827, 831, 850, 852, 855, 857, 858, 868, 875, 879, 898, 906, 908, 930, 939, ...]",955
242,😊,"[37, 204, 262, 276, 387, 410, 411, 418, 476, 547, 590, 660, 662, 667, 671, 688, 754, 825, 837, 901, 1025, 1029, 1044, 1049, 1073, 1144, 1203, 1277, 1313, 1334, 1364, 1482, 1557, 1579, 1588, 1614, 1678, 1703, 1718, 1732, 1742, 1787, 1823, 1894, 1941, 2111, 2153, 2208, 2256, 2260, 2261, 2275, 2301, 2452, 2552, 2559, 2595, 2618, 2671, 2734, 2792, 2894, 2917, 3068, 3157, 3236, 3292, 3423, 3434, 3560, 3569, 3577, 3591, 3612, 3619, 3642, 3674, 3678, 3762, 3763, 3783, 3793, 3813, 3841, 3886, 3968, 4099, 4176, 4186, 4221, 4248, 4318, 4375, 4492, 4498, 4549, 4577, 4657, 4674, 4702, ...]",195
40,️,"[25, 64, 317, 322, 520, 599, 772, 783, 804, 867, 921, 977, 1061, 1080, 1108, 1114, 1119, 1175, 1178, 1395, 1488, 1501, 1537, 1592, 1601, 1604, 1682, 1686, 1723, 1791, 1811, 1859, 1969, 2038, 2046, 2047, 2122, 2148, 2229, 2272, 2332, 2377, 2445, 2512, 2518, 2530, 2584, 2638, 2679, 2741, 2838, 2895, 2931, 2987, 3020, 3148, 3450, 3485, 3515, 3527, 3602, 3644, 3653, 3667, 3687, 3718, 3825, 3884, 3990, 3993, 4047, 4117, 4166, 4177, 4206, 4228, 4271, 4276, 4335, 4406, 4503, 4566, 4622, 4668, 4745, 4771, 4772, 4805, 4852, 4884, 4973, 4997, 5058, 5188, 5197, 5261, 5289, 5312, 5427, 5444, ...]",159
229,🔥,"[3, 16, 98, 152, 182, 219, 379, 412, 449, 455, 477, 521, 523, 614, 643, 711, 800, 847, 951, 1116, 1240, 1241, 1253, 1314, 1323, 1458, 1474, 1510, 1521, 1583, 1736, 1803, 1863, 1997, 2139, 2164, 2189, 2224, 2324, 2368, 2490, 2615, 2703, 2811, 2887, 2923, 2951, 2982, 3034, 3058, 3066, 3197, 3249, 3394, 3495, 3528, 3545, 3617, 3628, 3671, 3735, 3777, 3798, 3873, 3897, 4149, 4184, 4260, 4267, 4329, 4457, 4510, 4523, 4528, 4619, 4630, 4729, 4742, 4762, 4933, 5037, 5079, 5092, 5141, 5506, 5583, 5622, 5647, 5746, 5755, 5767, 5811, 6001, 6074, 6102, 6142, 6151, 6250, 6290, 6303, ...]",134
222,💯,"[44, 99, 238, 290, 294, 456, 460, 587, 693, 811, 838, 873, 1050, 1197, 1222, 1371, 1460, 1472, 1700, 1796, 2084, 2190, 2268, 2300, 2328, 2455, 2544, 2853, 2854, 2903, 2945, 2977, 2978, 3011, 3046, 3087, 3179, 3324, 3326, 3328, 3473, 3594, 3634, 3715, 3785, 3928, 3997, 4147, 4411, 4416, 4442, 4508, 4564, 4723, 4735, 4910, 5105, 5242, 5249, 5296, 5456, 5595, 5698, 5735, 5785, 5843, 6065, 6186, 6311, 6398, 6752, 6759, 6813, 6930, 7070, 7123, 7165, 7209, 7305, 7330, 7527, 7553, 7598, 7636, 7657, 7726, 7847, 7916, 7942, 7956, 7969, 7988, 8032, 8079, 8158, 8238, 8274, 8386, 8459, 8462, ...]",106
...,...,...,...
1,⌒,[6028],1
295,🦆,[5847],1
296,🦉,[5580],1
0,©,[7529],1


In [472]:
len(filtered_df)

1859

In [284]:
emoji_summary_df['Frequency'].sum()

np.int64(134)

In [20]:
#Get location of specific emoji
print(original.iloc[25]['Tokens']) # Replace 1 with the actual row index

['baby', 'elephant', 'chasing', 'birds', '☺', '️']


In [196]:
anger_count = len(df[df['Sentiment_emotion'].str.lower() == 'disgust'])

# Print the result
print(f"Number of rows where Sentiment_emotion == 'anger': {anger_count}")

Number of rows where Sentiment_emotion == 'anger': 720


# Pipeline for Getting Emoji Ranks

In [213]:
filters = [
    ("Sentiment_score == 1", "Sentiment_Score_1"),
    ("Sentiment_score == 0", "Sentiment_Score_0"),
    ("Sentiment_score == -1", "Sentiment_Score_-1"),
    ("Sentiment_emotion.str.lower() == 'joy'", "Emotion_Joy"),
    ("Sentiment_emotion.str.lower() == 'surprise'", "Emotion_Surprise"),
    ("Sentiment_emotion.str.lower() == 'love'", "Emotion_Love"),
    ("Sentiment_emotion.str.lower() == 'anger'", "Emotion_Anger"),
    ("Sentiment_emotion.str.lower() == 'disgust'", "Emotion_Disgust"),
    ("Sentiment_emotion.str.lower() == 'sadness'", "Emotion_Sadness"),
    ("Sentiment_emotion.str.lower() == 'fear'", "Emotion_Fear"),
    ("Part_of_speech.str.lower() == 'noun'", "Part_NOUN"),
    ("Part_of_speech.str.lower()=='verb'", "Part_VERB"),
    ("Part_of_speech.str.lower()=='adj'", "Part_ADJ"),
]


In [225]:
def analyze_emojis_by_category(df, filters):
    """
    Analyze top emojis for different categories and return a summary DataFrame.

    Args:
    df (pd.DataFrame): The main DataFrame containing emojis and sentiment data.
    filters (list): List of filter conditions (e.g., Sentiment_score=-1).

    Returns:
    pd.DataFrame: Summary DataFrame with top emojis for each category, their frequencies, and category percentages.
    """
    results = []

    for filter_condition, category_name in filters:
        # Apply filter to DataFrame
        filtered_df = df.query(filter_condition)

        # Create a list to track emoji occurrences
        emoji_occurrences = []
        for original_idx, row in filtered_df.iterrows():
            tokens = eval(row['Tokens'])  # Convert string to list
            emojis_in_row = [token for token in tokens if emoji_pattern.match(token)]
            for emoji in emojis_in_row:
                emoji_occurrences.append({'Emoji': emoji, 'Original_Index': original_idx})

        # Create a DataFrame of emoji occurrences
        emoji_occurrences_df = pd.DataFrame(emoji_occurrences)

        if emoji_occurrences_df.empty:
            # Skip this category if no emojis are found
            continue

        # Group by emoji to count occurrences
        emoji_summary_df = emoji_occurrences_df.groupby('Emoji').agg({
            'Original_Index': list
        }).reset_index()

        # Add a frequency column
        emoji_summary_df['Frequency'] = emoji_summary_df['Original_Index'].apply(len)

        # Calculate the total number of rows in the category
        total_rows_in_category = len(filtered_df)
        category_percentage = (total_rows_in_category / len(df)) * 100 if len(df) > 0 else 0

        # Append results to the list
        for _, row in emoji_summary_df.iterrows():
            frequency = row['Frequency']
            percentage = (frequency / total_rows_in_category) * 100 if total_rows_in_category > 0 else 0
            results.append({
                'Category': category_name,
                'Emoji': row['Emoji'],
                'Frequency': frequency,
                'Percentage': percentage,  # Percentage of this emoji in the category
                'Category_Percentage': f"{category_percentage:.2f}%",
            })

    # Create a summary DataFrame
    result_df = pd.DataFrame(results)
    return result_df


def reshape_results_with_percentage(result_df):
    """
    Reshape the result DataFrame to include the top 3 emojis for each category,
    their percentages within the category, and the category percentages.

    Args:
    result_df (pd.DataFrame): DataFrame with Category, Emoji, Frequency, Percentage, and Category_Percentage.

    Returns:
    pd.DataFrame: Reshaped DataFrame with one row per category, including the top 3 emojis and their percentages.
    """
    reshaped_data = []

    # Group by Category
    grouped = result_df.groupby('Category')
    for category, group in grouped:
        # Sort each group by Frequency
        group = group.sort_values(by='Frequency', ascending=False)

        # Extract top 3 emojis with their percentages
        top_emojis = []
        for _, row in group.head(3).iterrows():
            emoji = row['Emoji']
            percentage = row['Percentage']
            top_emojis.append(f"{emoji} ({percentage:.2f}%)")

        # Ensure exactly 3 emojis are listed
        while len(top_emojis) < 3:
            top_emojis.append(None)

        # Extract category percentage (it's the same for all rows in a group)
        category_percentage = group['Category_Percentage'].iloc[0]

        # Append to reshaped data
        reshaped_data.append({
            'Category': category,
            'Top Emoji': top_emojis[0],
            'Second Emoji': top_emojis[1],
            'Third Emoji': top_emojis[2],
            'Category Percentage': category_percentage
        })

    # Create a DataFrame from the reshaped data
    reshaped_df = pd.DataFrame(reshaped_data)
    return reshaped_df


# Original 10k Dataset (Ground-Truth)

In [226]:
df=pd.read_csv("../Data/Datasets/emojify_cleaned_10k_labelled.csv")

result_df = analyze_emojis_by_category(df, filters)


reshaped_df = reshape_results_with_percentage(result_df)

# Display the reshaped DataFrame
reshaped_df

Unnamed: 0,Category,Top Emoji,Second Emoji,Third Emoji,Category Percentage
0,Emotion_Anger,😡 (18.23%),😤 (16.02%),🙏 (12.71%),1.81%
1,Emotion_Disgust,🤔 (28.61%),🙄 (27.08%),😒 (10.56%),7.20%
2,Emotion_Fear,💀 (24.13%),🙃 (18.02%),🤧 (11.05%),3.44%
3,Emotion_Joy,😂 (24.44%),😊 (5.03%),️ (4.75%),42.71%
4,Emotion_Love,❤ (25.59%),️ (20.80%),😍 (15.57%),20.87%
5,Emotion_Sadness,😭 (33.10%),😩 (17.30%),😢 (6.48%),11.27%
6,Emotion_Surprise,👀 (12.09%),️ (10.04%),© (4.35%),12.65%
7,Part_ADJ,🤔 (5.58%),🙄 (5.31%),😍 (5.23%),36.89%
8,Part_NOUN,️ (9.72%),💕 (5.95%),❤ (3.77%),29.42%
9,Part_VERB,😂 (35.54%),❤ (13.08%),😭 (12.46%),28.98%


# ELMO Model

In [308]:
elmo_df=pd.read_csv("./Model Prediction CSVs/ELMO_predictions.csv")

In [309]:
elmo_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech
0,0,1,joy,NOUN
1,1,1,love,ADJ
2,2,1,love,ADJ
3,3,1,joy,NOUN
4,4,-1,anger,ADJ


In [310]:
# Merge cbow_df with original, keeping only the 'Emoji' column from original
elmo_merged_df = elmo_df.merge(original[['idx', 'Tokens']], on='idx', how='left')

# Display the merged DataFrame
elmo_merged_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech,Tokens
0,0,1,joy,NOUN,"['if', 'not', 'later', ',', 'when', '?', '🍑']"
1,1,1,love,ADJ,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']"
2,2,1,love,ADJ,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']"
3,3,1,joy,NOUN,"['ayyy', 'this', 'is', 'lit', '🔥']"
4,4,-1,anger,ADJ,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']"


In [311]:
result_df = analyze_emojis_by_category(elmo_merged_df, filters)


reshaped_df = reshape_results_with_percentage(result_df)

# Display the reshaped DataFrame
reshaped_df

Unnamed: 0,Category,Top Emoji,Second Emoji,Third Emoji,Category Percentage
0,Emotion_Anger,🙏 (28.10%),😡 (19.61%),😤 (16.34%),1.62%
1,Emotion_Disgust,🤔 (31.81%),🙄 (30.29%),😒 (12.01%),6.26%
2,Emotion_Fear,🙃 (29.29%),💀 (26.94%),🤧 (11.78%),3.15%
3,Emotion_Joy,😂 (23.03%),😊 (4.74%),️ (4.25%),45.40%
4,Emotion_Love,❤ (23.42%),️ (19.77%),😍 (14.77%),22.02%
5,Emotion_Sadness,😭 (31.68%),😩 (17.03%),😢 (6.23%),11.57%
6,Emotion_Surprise,👀 (15.39%),️ (14.76%),© (5.63%),9.98%
7,Part_ADJ,😊 (6.50%),😍 (6.47%),🤔 (6.09%),33.07%
8,Part_NOUN,️ (7.06%),💕 (6.92%),🔥 (4.78%),29.26%
9,Part_VERB,😂 (32.48%),❤ (14.65%),😭 (11.53%),31.96%


# GPT-2 Model

In [395]:
gpt_df=pd.read_csv("./Model Prediction CSVs/GPT2_predictions.csv")

In [396]:
gpt_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech
0,0,1,joy,NOUN
1,1,1,joy,NOUN
2,2,1,joy,VERB
3,3,1,joy,NOUN
4,4,-1,joy,VERB


In [397]:
# Merge cbow_df with original, keeping only the 'Emoji' column from original
gpt_merged_df = gpt_df.merge(original[['idx', 'Tokens']], on='idx', how='left')

# Display the merged DataFrame
gpt_merged_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech,Tokens
0,0,1,joy,NOUN,"['if', 'not', 'later', ',', 'when', '?', '🍑']"
1,1,1,joy,NOUN,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']"
2,2,1,joy,VERB,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']"
3,3,1,joy,NOUN,"['ayyy', 'this', 'is', 'lit', '🔥']"
4,4,-1,joy,VERB,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']"


In [398]:
result_df = analyze_emojis_by_category(gpt_merged_df, filters)


reshaped_df = reshape_results_with_percentage(result_df)

# Display the reshaped DataFrame
reshaped_df

Unnamed: 0,Category,Top Emoji,Second Emoji,Third Emoji,Category Percentage
0,Emotion_Joy,😂 (13.70%),️ (5.96%),😍 (4.39%),76.57%
1,Emotion_Love,❤ (28.74%),️ (22.07%),💕 (10.98%),17.85%
2,Emotion_Sadness,😭 (65.23%),😩 (34.77%),,5.58%
3,Part_NOUN,️ (7.51%),🤔 (4.13%),💕 (4.03%),49.82%
4,Part_VERB,😂 (20.33%),❤ (10.22%),️ (9.49%),50.18%
5,Sentiment_Score_-1,😭 (21.63%),😩 (11.53%),😘 (6.42%),16.83%
6,Sentiment_Score_0,👀 (40.16%),👇 (10.64%),🗣 (10.11%),3.76%
7,Sentiment_Score_1,😂 (13.21%),️ (10.68%),❤ (6.72%),79.41%


# ADA

In [391]:
ada_df=pd.read_csv("./Model Prediction CSVs/ADA_predictions.csv")

In [392]:
ada_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech
0,0,0,joy,NOUN
1,1,0,surprise,NOUN
2,2,1,love,ADJ
3,3,1,joy,NOUN
4,4,-1,anger,ADJ


In [393]:
# Merge cbow_df with original, keeping only the 'Emoji' column from original
ada_merged_df = ada_df.merge(original[['idx', 'Tokens']], on='idx', how='left')

# Display the merged DataFrame
ada_merged_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech,Tokens
0,0,0,joy,NOUN,"['if', 'not', 'later', ',', 'when', '?', '🍑']"
1,1,0,surprise,NOUN,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']"
2,2,1,love,ADJ,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']"
3,3,1,joy,NOUN,"['ayyy', 'this', 'is', 'lit', '🔥']"
4,4,-1,anger,ADJ,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']"


In [394]:
result_df = analyze_emojis_by_category(ada_merged_df, filters)


reshaped_df = reshape_results_with_percentage(result_df)

# Display the reshaped DataFrame
reshaped_df

Unnamed: 0,Category,Top Emoji,Second Emoji,Third Emoji,Category Percentage
0,Emotion_Anger,🙏 (38.51%),😡 (18.92%),😤 (16.89%),1.64%
1,Emotion_Disgust,🤔 (32.46%),🙄 (30.53%),😒 (12.11%),6.33%
2,Emotion_Fear,🙃 (27.24%),💀 (23.84%),🤧 (10.53%),3.59%
3,Emotion_Joy,😂 (24.66%),😊 (5.03%),️ (4.11%),43.03%
4,Emotion_Love,❤ (25.44%),️ (20.44%),😍 (16.41%),20.65%
5,Emotion_Sadness,😭 (33.08%),😩 (17.36%),😢 (6.46%),11.52%
6,Emotion_Surprise,️ (13.52%),👀 (11.42%),📷 (5.29%),13.23%
7,Part_ADJ,😍 (9.33%),😊 (6.04%),🤔 (5.73%),35.85%
8,Part_NOUN,️ (7.87%),💕 (7.32%),🔥 (5.24%),28.39%
9,Part_VERB,😂 (34.43%),❤ (16.58%),😭 (12.36%),30.82%


# BERT Model

In [352]:
bert_df=pd.read_csv("./Model Prediction CSVs/BERT_predictions.csv")

In [353]:
bert_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech
0,0,1,joy,NOUN
1,1,1,joy,VERB
2,2,-1,joy,VERB
3,3,-1,joy,ADJ
4,4,1,joy,VERB


In [354]:
# Merge cbow_df with original, keeping only the 'Emoji' column from original
bert_merged_df = bert_df.merge(original[['idx', 'Tokens']], on='idx', how='left')

# Display the merged DataFrame
bert_merged_df.head()


Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech,Tokens
0,0,1,joy,NOUN,"['if', 'not', 'later', ',', 'when', '?', '🍑']"
1,1,1,joy,VERB,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']"
2,2,-1,joy,VERB,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']"
3,3,-1,joy,ADJ,"['ayyy', 'this', 'is', 'lit', '🔥']"
4,4,1,joy,VERB,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']"


In [355]:
result_df = analyze_emojis_by_category(bert_merged_df, filters)


reshaped_df = reshape_results_with_percentage(result_df)

# Display the reshaped DataFrame
reshaped_df

Unnamed: 0,Category,Top Emoji,Second Emoji,Third Emoji,Category Percentage
0,Emotion_Anger,🏈 (100.00%),,,0.01%
1,Emotion_Fear,💀 (25.00%),😍 (25.00%),😮 (25.00%),0.04%
2,Emotion_Joy,😂 (10.52%),️ (8.42%),❤ (5.17%),81.82%
3,Emotion_Love,😂 (10.59%),️ (9.50%),❤ (6.54%),16.83%
4,Emotion_Sadness,😫 (11.54%),😭 (11.54%),💗 (7.69%),0.26%
5,Emotion_Surprise,😂 (13.73%),️ (4.90%),😭 (4.90%),1.04%
6,Part_ADJ,😂 (8.29%),️ (7.05%),❤ (4.59%),5.78%
7,Part_NOUN,😂 (9.65%),️ (8.41%),❤ (5.08%),49.93%
8,Part_VERB,😂 (11.86%),️ (8.86%),❤ (5.73%),44.25%
9,Sentiment_Score_-1,😂 (10.38%),️ (7.97%),❤ (5.61%),29.26%


# CBOW Model

In [243]:
cbow_df=pd.read_csv("./Model Prediction CSVs/CBOW_predictions.csv")

In [244]:
cbow_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech
0,0,1,joy,NOUN
1,1,0,surprise,NOUN
2,2,1,love,ADJ
3,3,1,joy,NOUN
4,4,-1,anger,ADJ


In [245]:
# Merge cbow_df with original, keeping only the 'Emoji' column from original
cbow_merged_df = cbow_df.merge(original[['idx', 'Tokens']], on='idx', how='left')

# Display the merged DataFrame
cbow_merged_df.head()



Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech,Tokens
0,0,1,joy,NOUN,"['if', 'not', 'later', ',', 'when', '?', '🍑']"
1,1,0,surprise,NOUN,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']"
2,2,1,love,ADJ,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']"
3,3,1,joy,NOUN,"['ayyy', 'this', 'is', 'lit', '🔥']"
4,4,-1,anger,ADJ,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']"


In [246]:
result_df = analyze_emojis_by_category(cbow_merged_df, filters)


reshaped_df = reshape_results_with_percentage(result_df)

# Display the reshaped DataFrame
reshaped_df

Unnamed: 0,Category,Top Emoji,Second Emoji,Third Emoji,Category Percentage
0,Emotion_Anger,🙏 (44.36%),😡 (24.81%),😤 (21.80%),1.33%
1,Emotion_Disgust,🤔 (32.70%),🙄 (31.11%),😒 (12.38%),6.30%
2,Emotion_Fear,🙃 (27.81%),💀 (25.15%),🤧 (11.24%),3.38%
3,Emotion_Joy,😂 (19.19%),️ (13.77%),❤ (9.77%),54.67%
4,Emotion_Love,😍 (24.17%),💕 (14.46%),😘 (8.85%),13.90%
5,Emotion_Sadness,😭 (33.42%),😩 (17.47%),😢 (6.51%),11.22%
6,Emotion_Surprise,👀 (16.74%),📷 (7.17%),🗣 (5.33%),9.20%
7,Part_ADJ,😍 (9.90%),😊 (6.48%),🤔 (6.13%),33.62%
8,Part_NOUN,️ (20.25%),❤ (13.63%),💕 (5.16%),38.97%
9,Part_VERB,😂 (40.41%),😭 (14.45%),😩 (7.55%),25.96%
