# Libraries

In [1]:
import pandas as pd
from collections import Counter
import re
pd.set_option('display.max_colwidth', None)
import emoji
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

'en_US.UTF-8'

# Getting Overall Counts

In [2]:
#Recognize emojis
emoji_pattern = re.compile(
        "["  # Emoji ranges
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map
        "\U0001F700-\U0001F77F"  # Alchemical Symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed characters
        "\U000000A9-\U000000AE"  # © (Copyright) and ® (Registered)
        "\U0000203C-\U0000203D"  # Exclamation/question marks
        "\U00002194-\U00002199"  # Arrows
        "\U00002300-\U000023FF"  # Miscellaneous Technical
        "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
        "]+", flags=re.UNICODE
    )

In [3]:
# Overall Emoji Counts (across all rows in df)

original = pd.read_csv("../Data/Datasets/emojify_cleaned_10k_labelled.csv")
original['idx'] = original.index

# Extract emojis for each row and create an Emoji column
def extract_emojis(tokens):
    emojis = [token for token in eval(tokens) if emoji_pattern.match(token)]
    return ", ".join(emojis)  # Join emojis as a comma-separated string

original['Emoji'] = original['Tokens'].apply(extract_emojis)

original.head()


Unnamed: 0,Tokens,Sentiment_score,Sentiment_emotion,Part_of_speech,idx,Emoji
0,"['if', 'not', 'later', ',', 'when', '?', '🍑']",1,Joy,NOUN,0,🍑
1,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']",0,Surprise,NOUN,1,🔗
2,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']",1,Love,VERB,2,😍
3,"['ayyy', 'this', 'is', 'lit', '🔥']",1,Joy,ADJ,3,🔥
4,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']",-1,Anger,ADJ,4,😡


In [4]:
# Count overall emoji occurrences
overall_emoji_occurrences = []
for _, row in original.iterrows():
    tokens = eval(row['Tokens'])  # Convert string to list
    emojis_in_row = [token for token in tokens if emoji_pattern.match(token)]
    overall_emoji_occurrences.extend(emojis_in_row)

# Create the overall emoji counts DataFrame
overall_emoji_counts = pd.DataFrame.from_dict(Counter(overall_emoji_occurrences), orient='index', columns=['Overall_Frequency'])
overall_emoji_counts.index.name = 'Emoji'
overall_emoji_counts = overall_emoji_counts.reset_index().sort_values(by='Overall_Frequency', ascending=False)

In [5]:
overall_emoji_counts

Unnamed: 0,Emoji,Overall_Frequency
16,😂,1049
18,️,850
39,❤,534
6,😭,375
2,😍,336
...,...,...
697,🏍,1
698,점,1
699,🚷,1
700,📩,1


# Manual Checker

In [31]:
bert_merged_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech,Tokens
0,0,1,joy,NOUN,"['if', 'not', 'later', ',', 'when', '?', '🍑']"
1,1,1,joy,VERB,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']"
2,2,-1,joy,VERB,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']"
3,3,-1,joy,ADJ,"['ayyy', 'this', 'is', 'lit', '🔥']"
4,4,1,joy,VERB,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']"


In [32]:
# Summary of unique values and their counts
sentiment_emotion_summary = bert_merged_df['Sentiment_emotion'].value_counts()

# Display the summary
print("Summary of Sentiment_emotion column:")
print(sentiment_emotion_summary)


Summary of Sentiment_emotion column:
Sentiment_emotion
joy         8031
love        1652
surprise     102
sadness       26
fear           4
anger          1
Name: count, dtype: int64


In [36]:
print(len(filtered_df))

26


In [37]:
filtered_df = bert_merged_df[bert_merged_df['Sentiment_emotion'] == 'anger']

# Create a list to track emoji occurrences and their original indices
emoji_occurrences = []
for original_idx, row in filtered_df.iterrows():
    tokens = eval(row['Tokens'])  # Convert string to list
    emojis_in_row = [token for token in tokens if emoji_pattern.match(token)]
    for emoji in emojis_in_row:
        emoji_occurrences.append({'Emoji': emoji, 'Original_Index': original_idx})

# Create a DataFrame of emoji occurrences
emoji_occurrences_df = pd.DataFrame(emoji_occurrences)
emoji_occurrences_df



Unnamed: 0,Emoji,Original_Index
0,🏈,6987


In [38]:
# Group by emoji to count occurrences and list row indices
emoji_summary_df = emoji_occurrences_df.groupby('Emoji').agg({
    'Original_Index': list  # Collect all original row indices
}).reset_index()

# Add a frequency column
emoji_summary_df['Frequency'] = emoji_summary_df['Original_Index'].apply(len)

# Sort by frequency
emoji_summary_df = emoji_summary_df.sort_values(by='Frequency', ascending=False)

# Display the DataFrame
emoji_summary_df

Unnamed: 0,Emoji,Original_Index,Frequency
0,🏈,[6987],1


In [39]:
# Merge the two DataFrames on the 'Emoji' column
merged_df = emoji_summary_df.merge(
    overall_emoji_counts,
    on='Emoji',
    how='left',  # Use 'left' to keep all emojis from emoji_summary_df
    suffixes=('_Filtered', '_Overall')  # To differentiate columns after the merge
)

# Optional: Add a normalized frequency column
merged_df['Normalized_Frequency'] = merged_df['Frequency'] / merged_df['Overall_Frequency']

# Sort the resulting DataFrame by 'Normalized_Frequency' or any other column
merged_df = merged_df.sort_values(by='Normalized_Frequency', ascending=False)

# Display the resulting DataFrame
merged_df = merged_df.sort_values(
    by=['Normalized_Frequency', 'Frequency'], 
    ascending=[False, False]
)

merged_df.head(5)

Unnamed: 0,Emoji,Original_Index,Frequency,Overall_Frequency,Normalized_Frequency
0,🏈,[6987],1,4,0.25


In [165]:
#Get location of specific emoji
print(ada_merged_df.iloc[1017]['Tokens'])  # Replace 1 with the actual row index

['€', '☹', '️follow', 'me', 'and', 'everyone', 'who', 'like', 'or', 'retweet', 'it', '.']


In [81]:
anger_count = len(elmo_merged_df[elmo_merged_df['Sentiment_emotion'].str.lower() == 'anger'])

# Print the result
print(f"Number of rows where Sentiment_emotion == 'anger': {anger_count}")

Number of rows where Sentiment_emotion == 'anger': 1116


# Pipeline for Getting Emoji Ranks

In [7]:
def analyze_emojis_by_category(df, overall_emoji_counts, filters):
    """
    Analyze top emojis for different categories and return a summary DataFrame.

    Args:
    df (pd.DataFrame): The main DataFrame containing emojis and sentiment data.
    overall_emoji_counts (pd.DataFrame): DataFrame with overall emoji counts.
    filters (list): List of filter conditions (e.g., Sentiment_score=-1).

    Returns:
    pd.DataFrame: Summary DataFrame with top 3 emojis for each category.
    """
    results = []

    for filter_condition, category_name in filters:
        # Apply filter to DataFrame
        filtered_df = df.query(filter_condition)

        # Create a list to track emoji occurrences
        emoji_occurrences = []
        for original_idx, row in filtered_df.iterrows():
            tokens = eval(row['Tokens'])  # Convert string to list
            emojis_in_row = [token for token in tokens if emoji_pattern.match(token)]
            for emoji in emojis_in_row:
                emoji_occurrences.append({'Emoji': emoji, 'Original_Index': original_idx})

        # Create a DataFrame of emoji occurrences
        emoji_occurrences_df = pd.DataFrame(emoji_occurrences)

        if emoji_occurrences_df.empty:
            # Skip this category if no emojis are found
            continue

        # Group by emoji to count occurrences
        emoji_summary_df = emoji_occurrences_df.groupby('Emoji').agg({
            'Original_Index': list
        }).reset_index()

        # Add a frequency column
        emoji_summary_df['Frequency'] = emoji_summary_df['Original_Index'].apply(len)

        # Merge with overall emoji counts
        merged_df = emoji_summary_df.merge(
            overall_emoji_counts,
            on='Emoji',
            how='left',
            suffixes=('_Filtered', '_Overall')
        )

        # Add normalized frequency
        merged_df['Normalized_Frequency'] = merged_df['Frequency'] / merged_df['Overall_Frequency']

        # Sort by normalized frequency and frequency
        merged_df = merged_df.sort_values(
            by=['Normalized_Frequency', 'Frequency'],
            ascending=[False, False]
        )

        # Get top 3 emojis
        top_emojis = merged_df.head(3)

        # Append results to the list
        for _, row in top_emojis.iterrows():
            results.append({
                'Category': category_name,
                'Emoji': row['Emoji'],
                'Normalized_Frequency': row['Normalized_Frequency'],
                'Frequency': row['Frequency']
            })

    # Create a summary DataFrame
    result_df = pd.DataFrame(results)
    return result_df

In [8]:
def reshape_results_with_percentage(result_df, df):
    """
    Reshape the result DataFrame to include the percentage of emoji appearances
    relative to the total number of rows in each category.

    Args:
    result_df (pd.DataFrame): DataFrame with Category, Emoji, Normalized_Frequency, and Frequency.
    df (pd.DataFrame): The original DataFrame for calculating percentages.

    Returns:
    pd.DataFrame: Reshaped DataFrame with one row per category, including percentages.
    """
    reshaped_data = []

    # Group by Category
    grouped = result_df.groupby('Category')
    for category, group in grouped:
        # Calculate the total number of rows for the current category
        category_condition = category.split("_")
        if category_condition[0] == "Sentiment":
            sentiment = int(category_condition[2])
            total_count_category = len(df[df['Sentiment_score'] == sentiment])
        elif category_condition[0] == "Emotion":
            emotion = category_condition[1]
            total_count_category = len(df[df['Sentiment_emotion'].str.lower() == emotion.lower()])
        elif category_condition[0] == "Part":
            pos = category_condition[1].lower()
            total_count_category = len(df[df['Part_of_speech'].str.lower() == pos])
        else:
            total_count_category = 0  # Default to 0 if no matching category condition is found

        # Calculate total percentage of rows classified for this category
        total_percentage = (total_count_category / len(df)) * 100 if len(df) > 0 else 0

        # Sort each group by Normalized_Frequency and Frequency
        group = group.sort_values(by=['Normalized_Frequency', 'Frequency'], ascending=[False, False])

        # Extract top 3 emojis with percentages
        top_emojis = []
        for _, row in group.head(3).iterrows():
            emoji = row['Emoji']
            frequency = row['Frequency']
            percentage = (frequency / total_count_category) * 100 if total_count_category > 0 else 0
            top_emojis.append(f"{emoji} ({percentage:.2f}%)")

        # Ensure exactly 3 emojis are listed
        while len(top_emojis) < 3:
            top_emojis.append(None)

        # Append to reshaped data
        reshaped_data.append({
            'Category': category,
            'Top Emoji': top_emojis[0],
            'Second Emoji': top_emojis[1],
            'Third Emoji': top_emojis[2],
            'Category Percentage': f"{total_percentage:.2f}%"
        })

    # Create a DataFrame from the reshaped data
    reshaped_df = pd.DataFrame(reshaped_data)
    return reshaped_df



In [9]:
filters = [
    ("Sentiment_score == 1", "Sentiment_Score_1"),
    ("Sentiment_score == 0", "Sentiment_Score_0"),
    ("Sentiment_score == -1", "Sentiment_Score_-1"),
    ("Sentiment_emotion.str.lower() == 'joy'", "Emotion_Joy"),
    ("Sentiment_emotion.str.lower() == 'surprise'", "Emotion_Surprise"),
    ("Sentiment_emotion.str.lower() == 'love'", "Emotion_Love"),
    ("Sentiment_emotion.str.lower() == 'anger'", "Emotion_Anger"),
    ("Sentiment_emotion.str.lower() == 'disgust'", "Emotion_Disgust"),
    ("Sentiment_emotion.str.lower() == 'sadness'", "Emotion_Sadness"),
    ("Sentiment_emotion.str.lower() == 'fear'", "Emotion_Fear"),
    ("Part_of_speech.str.lower() == 'noun'", "Part_NOUN"),
    ("Part_of_speech.str.lower()=='verb'", "Part_VERB"),
    ("Part_of_speech.str.lower()=='adj'", "Part_ADJ"),
]


# Original 10k Dataset (Ground-Truth)

In [None]:
df=pd.read_csv("../Data/Datasets/emojify_cleaned_10k_labelled.csv")

result_df = analyze_emojis_by_category(df, overall_emoji_counts, filters)


reshaped_df = reshape_results_with_percentage(result_df, df)

# Display the reshaped DataFrame
reshaped_df

Unnamed: 0,Category,Top Emoji,Second Emoji,Third Emoji,Category Percentage
0,Emotion_Anger,😡 (18.23%),😤 (16.02%),🤬 (4.42%),1.81%
1,Emotion_Disgust,🤔 (28.61%),😑 (5.00%),🤢 (2.50%),7.20%
2,Emotion_Fear,🤧 (11.05%),⚠ (4.36%),😨 (2.03%),3.44%
3,Emotion_Joy,🤣 (2.32%),😁 (2.25%),😎 (1.66%),42.71%
4,Emotion_Love,❤ (25.59%),💕 (9.63%),💜 (4.65%),20.87%
5,Emotion_Sadness,😢 (6.48%),💔 (6.21%),😔 (4.79%),11.27%
6,Emotion_Surprise,👇 (3.32%),👉 (2.85%),📍 (1.11%),12.65%
7,Part_ADJ,🤔 (5.58%),🙄 (5.31%),🙃 (2.55%),36.89%
8,Part_NOUN,📷 (2.24%),💓 (1.94%),🎶 (1.84%),29.42%
9,Part_VERB,🤪 (1.38%),️️ (0.10%),ㅋㅋ (0.07%),28.98%


# ELMO Model

In [11]:
elmo_df=pd.read_csv("./Model Prediction CSVs/ELMO_predictions.csv")

In [12]:
elmo_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech
0,0,1,joy,NOUN
1,1,1,love,ADJ
2,2,1,love,ADJ
3,3,1,joy,NOUN
4,4,-1,anger,ADJ


In [13]:
# Merge cbow_df with original, keeping only the 'Emoji' column from original
elmo_merged_df = elmo_df.merge(original[['idx', 'Tokens']], on='idx', how='left')

# Display the merged DataFrame
elmo_merged_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech,Tokens
0,0,1,joy,NOUN,"['if', 'not', 'later', ',', 'when', '?', '🍑']"
1,1,1,love,ADJ,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']"
2,2,1,love,ADJ,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']"
3,3,1,joy,NOUN,"['ayyy', 'this', 'is', 'lit', '🔥']"
4,4,-1,anger,ADJ,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']"


In [14]:
result_df = analyze_emojis_by_category(elmo_merged_df, overall_emoji_counts, filters)


reshaped_df = reshape_results_with_percentage(result_df, elmo_merged_df)

# Display the reshaped DataFrame
reshaped_df

Unnamed: 0,Category,Top Emoji,Second Emoji,Third Emoji,Category Percentage
0,Emotion_Anger,😠 (3.92%),️8 (0.65%),️get (0.65%),1.62%
1,Emotion_Disgust,👎 (1.35%),🤦 (0.51%),➘ (0.17%),6.26%
2,Emotion_Fear,🤴 (0.34%),💀 (26.94%),🙃 (29.29%),3.15%
3,Emotion_Joy,😆 (0.79%),☀ (0.49%),🤩 (0.47%),45.40%
4,Emotion_Love,💰 (0.91%),🐰 (0.38%),💐 (0.34%),22.02%
5,Emotion_Sadness,😺 (0.18%),☂ (0.09%),️electricity (0.09%),11.57%
6,Emotion_Surprise,📝 (0.96%),♤ (0.53%),♧ (0.42%),9.98%
7,Part_ADJ,😇 (1.06%),🤫 (0.38%),😧 (0.29%),33.07%
8,Part_NOUN,☀ (0.76%),💰 (0.69%),💦 (0.54%),29.26%
9,Part_VERB,😷 (0.43%),😽 (0.13%),️️ (0.10%),31.96%


# GPT-2 Model

In [15]:
gpt_df=pd.read_csv("./Model Prediction CSVs/GPT2_predictions.csv")

In [16]:
gpt_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech
0,0,1,joy,NOUN
1,1,1,joy,NOUN
2,2,1,joy,VERB
3,3,1,joy,NOUN
4,4,-1,joy,VERB


In [17]:
# Merge cbow_df with original, keeping only the 'Emoji' column from original
gpt_merged_df = gpt_df.merge(original[['idx', 'Tokens']], on='idx', how='left')

# Display the merged DataFrame
gpt_merged_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech,Tokens
0,0,1,joy,NOUN,"['if', 'not', 'later', ',', 'when', '?', '🍑']"
1,1,1,joy,NOUN,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']"
2,2,1,joy,VERB,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']"
3,3,1,joy,NOUN,"['ayyy', 'this', 'is', 'lit', '🔥']"
4,4,-1,joy,VERB,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']"


In [18]:
result_df = analyze_emojis_by_category(gpt_merged_df, overall_emoji_counts, filters)


reshaped_df = reshape_results_with_percentage(result_df, gpt_merged_df)

# Display the reshaped DataFrame
reshaped_df

Unnamed: 0,Category,Top Emoji,Second Emoji,Third Emoji,Category Percentage
0,Emotion_Joy,😂 (13.70%),😍 (4.39%),😊 (2.85%),76.57%
1,Emotion_Love,💚 (1.96%),💘 (1.34%),❗ (1.06%),17.85%
2,Emotion_Sadness,😩 (34.77%),😭 (65.23%),,5.58%
3,Part_NOUN,🤔 (4.13%),💕 (4.03%),🙄 (3.93%),49.82%
4,Part_VERB,😌 (1.30%),🙂 (1.00%),😕 (0.96%),50.18%
5,Sentiment_Score_-1,😕 (2.85%),😪 (2.50%),😬 (2.20%),16.83%
6,Sentiment_Score_0,👑 (6.12%),👁 (1.06%),🧀 (1.06%),3.76%
7,Sentiment_Score_1,😂 (13.21%),❤ (6.72%),😍 (4.23%),79.41%


# ADA

In [19]:
ada_df=pd.read_csv("./Model Prediction CSVs/ADA_predictions.csv")

In [20]:
ada_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech
0,0,0,joy,NOUN
1,1,0,surprise,NOUN
2,2,1,love,ADJ
3,3,1,joy,NOUN
4,4,-1,anger,ADJ


In [21]:
# Merge cbow_df with original, keeping only the 'Emoji' column from original
ada_merged_df = ada_df.merge(original[['idx', 'Tokens']], on='idx', how='left')

# Display the merged DataFrame
ada_merged_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech,Tokens
0,0,0,joy,NOUN,"['if', 'not', 'later', ',', 'when', '?', '🍑']"
1,1,0,surprise,NOUN,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']"
2,2,1,love,ADJ,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']"
3,3,1,joy,NOUN,"['ayyy', 'this', 'is', 'lit', '🔥']"
4,4,-1,anger,ADJ,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']"


In [22]:
result_df = analyze_emojis_by_category(ada_merged_df, overall_emoji_counts, filters)


reshaped_df = reshape_results_with_percentage(result_df, ada_merged_df)

# Display the reshaped DataFrame
reshaped_df

Unnamed: 0,Category,Top Emoji,Second Emoji,Third Emoji,Category Percentage
0,Emotion_Anger,💤 (5.41%),😠 (4.05%),🖕 (2.70%),1.64%
1,Emotion_Disgust,🤷 (1.23%),🤮 (1.05%),🗑 (0.18%),6.33%
2,Emotion_Fear,😵 (2.17%),🚮 (1.24%),💧 (0.62%),3.59%
3,Emotion_Joy,😇 (0.85%),⚡ (0.52%),🤩 (0.52%),43.03%
4,Emotion_Love,🌹 (0.97%),💌 (0.48%),🌺 (0.43%),20.65%
5,Emotion_Sadness,😦 (0.68%),ⓒ (0.10%),️follow (0.10%),11.52%
6,Emotion_Surprise,📹 (1.09%),😯 (0.67%),👻 (0.59%),13.23%
7,Part_ADJ,😇 (1.02%),😚 (0.25%),😯 (0.25%),35.85%
8,Part_NOUN,💰 (0.74%),🌹 (0.70%),☆ (0.55%),28.39%
9,Part_VERB,🤩 (0.72%),️️ (0.11%),🙀 (0.07%),30.82%


# BERT Model

In [27]:
bert_df=pd.read_csv("./Model Prediction CSVs/BERT_predictions.csv")

In [28]:
bert_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech
0,0,1,joy,NOUN
1,1,1,joy,VERB
2,2,-1,joy,VERB
3,3,-1,joy,ADJ
4,4,1,joy,VERB


In [29]:
# Merge cbow_df with original, keeping only the 'Emoji' column from original
bert_merged_df = bert_df.merge(original[['idx', 'Tokens']], on='idx', how='left')

# Display the merged DataFrame
bert_merged_df.head()


Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech,Tokens
0,0,1,joy,NOUN,"['if', 'not', 'later', ',', 'when', '?', '🍑']"
1,1,1,joy,VERB,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']"
2,2,-1,joy,VERB,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']"
3,3,-1,joy,ADJ,"['ayyy', 'this', 'is', 'lit', '🔥']"
4,4,1,joy,VERB,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']"


In [30]:
result_df = analyze_emojis_by_category(bert_merged_df, overall_emoji_counts, filters)


reshaped_df = reshape_results_with_percentage(result_df, bert_merged_df)

# Display the reshaped DataFrame
reshaped_df

Unnamed: 0,Category,Top Emoji,Second Emoji,Third Emoji,Category Percentage
0,Emotion_Anger,🏈 (100.00%),,,0.01%
1,Emotion_Fear,😮 (25.00%),💀 (25.00%),🙄 (25.00%),0.04%
2,Emotion_Joy,☆ (0.17%),👋 (0.16%),🎂 (0.14%),81.82%
3,Emotion_Love,🐺 (0.12%),👼 (0.12%),🦊 (0.12%),16.83%
4,Emotion_Sadness,🇧🇩 (3.85%),💃 (3.85%),👎 (3.85%),0.26%
5,Emotion_Surprise,ㅂ (0.98%),🐿 (0.98%),🇬🇧 (1.96%),1.04%
6,Part_ADJ,⌒ (0.18%),『 (0.18%),』 (0.18%),5.78%
7,Part_NOUN,☆ (0.29%),🌈 (0.14%),💡 (0.12%),49.93%
8,Part_VERB,⠀ (0.09%),🌲 (0.09%),🔵 (0.09%),44.25%
9,Sentiment_Score_-1,♫ (0.10%),／ (0.07%),＼ (0.07%),29.26%


# CBOW Model

In [23]:
cbow_df=pd.read_csv("./Model Prediction CSVs/CBOW_predictions.csv")

In [24]:
cbow_df.head()

Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech
0,0,1,joy,NOUN
1,1,0,surprise,NOUN
2,2,1,love,ADJ
3,3,1,joy,NOUN
4,4,-1,anger,ADJ


In [25]:
# Merge cbow_df with original, keeping only the 'Emoji' column from original
cbow_merged_df = cbow_df.merge(original[['idx', 'Tokens']], on='idx', how='left')

# Display the merged DataFrame
cbow_merged_df.head()



Unnamed: 0,idx,Sentiment_score,Sentiment_emotion,Part_of_speech,Tokens
0,0,1,joy,NOUN,"['if', 'not', 'later', ',', 'when', '?', '🍑']"
1,1,0,surprise,NOUN,"['🔗', '|', 'the', 'izombie', '4x08', 'page', 'is', 'now', 'up', '!', 'containing', 'trivia', ',', 'quotes', ',', 'reviews', ',', 'a', 'look', 'at', ""liv's"", 'style', ',', 'comic', 'slides', 'gifs', ',', 'the', 'b', '…']"
2,2,1,love,ADJ,"['hearing', 'bts', 'at', 'work', 'still', 'amazes', 'me', '😍']"
3,3,1,joy,NOUN,"['ayyy', 'this', 'is', 'lit', '🔥']"
4,4,-1,anger,ADJ,"['well', 'richard', 'i', 'jumped', 'in', 'the', 'shower', 'and', 'saved', 'her', 'so', 'back', 'off', '😡']"


In [26]:
result_df = analyze_emojis_by_category(cbow_merged_df, overall_emoji_counts, filters)


reshaped_df = reshape_results_with_percentage(result_df, cbow_merged_df)

# Display the reshaped DataFrame
reshaped_df

Unnamed: 0,Category,Top Emoji,Second Emoji,Third Emoji,Category Percentage
0,Emotion_Anger,🙏 (44.36%),😡 (24.81%),😤 (21.80%),1.33%
1,Emotion_Disgust,🤔 (32.70%),🙄 (31.11%),😒 (12.38%),6.30%
2,Emotion_Fear,🙃 (27.81%),💀 (25.15%),🤧 (11.24%),3.38%
3,Emotion_Joy,😂 (19.19%),❤ (9.77%),😊 (3.99%),54.67%
4,Emotion_Love,😍 (24.17%),💕 (14.46%),😘 (8.85%),13.90%
5,Emotion_Sadness,😭 (33.42%),😩 (17.47%),😢 (6.51%),11.22%
6,Emotion_Surprise,👀 (16.74%),📷 (7.17%),🗣 (5.33%),9.20%
7,Part_ADJ,😊 (6.48%),🤔 (6.13%),🙄 (5.83%),33.62%
8,Part_NOUN,💕 (5.16%),🔥 (3.77%),♥ (2.21%),38.97%
9,Part_VERB,😂 (40.41%),😭 (14.45%),😩 (7.55%),25.96%
