# Preprocessing

In [1]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 586.9/586.9 kB 6.3 MB/s eta 0:00:00
Installing collected packages: emoji
Successfully installed emoji-2.14.0


In [1]:
import pandas as pd
import re
import emoji
import os

In [2]:
# load the first dataset: the emoji lookup table
# (the title, ucode_short and ucode columns are the ones used further down)
emoji_map_path = "emoji_map.csv"
emoji_map = pd.read_csv(emoji_map_path)

In [3]:
# preview only the lookup columns the merge below relies on
emoji_map.loc[:, ["title", "ucode_short", "ucode"]]

Unnamed: 0,title,ucode_short,ucode
0,keycap number sign,0023_fe0f_20e3,#️⃣
1,keycap asterisk,002a_fe0f_20e3,*️⃣
2,keycap digit zero,0030_fe0f_20e3,0️⃣
3,keycap digit one,0031_fe0f_20e3,1️⃣
4,keycap digit two,0032_fe0f_20e3,2️⃣
...,...,...,...
1786,lizard,1f98e,🦎
1787,rhinoceros,1f98f,🦏
1788,shrimp,1f990,🦐
1789,squid,1f991,🦑


In [4]:
# we stored all the csv's for the emoji tweets in a data folder
data_folder = "data"

file_names = os.listdir(data_folder)

# list of CSV files corresponding to the 42 emojis.
# os.listdir returns entries in arbitrary order, so sort them to make the row
# order of the merged frame reproducible across machines and re-runs.
emoji_csv_files = sorted(f for f in file_names if f.endswith('.csv'))

if not emoji_csv_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate"
    raise ValueError(f"No .csv files found in {data_folder!r}")

# one single-column frame of tweets per emoji file, tagged with the emoji title
merged_data = []

for file_name in emoji_csv_files:
    # the file name (minus its extension) is the emoji title used as merge key;
    # splitext strips only the final extension, so a title containing a dot survives
    emoji_title = os.path.splitext(file_name)[0]
    # NOTE(review): header=None treats the first line of every file as a tweet.
    # The preview of the merged frame shows a literal "Text" as row 0, which
    # suggests the files carry a header row -- confirm and switch to header=0
    # (or drop those rows) if so.
    emoji_text_df = pd.read_csv(
        os.path.join(data_folder, file_name),
        header=None,
        names=["text"],
        lineterminator='\n',  # only '\n' ends a record
    )
    emoji_text_df['title'] = emoji_title
    merged_data.append(emoji_text_df)

# stack all tweets first, then attach the emoji metadata in ONE left merge
# instead of re-merging emoji_map once per file; a left merge keeps the row
# order of the left frame, so the result matches the per-file merge
all_tweets_df = pd.concat(merged_data, ignore_index=True)
final_merged_df = pd.merge(all_tweets_df, emoji_map, on="title", how="left")

# nothing is written to disk in this cell, so report the merge, not a save
print(f"Merged {len(emoji_csv_files)} emoji files into {len(final_merged_df)} rows.")

Merged emoji dataset saved successfully!


In [5]:
# Keep only the necessary columns, drop incomplete rows, then cast to string.
#
# Order matters: astype(str) turns a missing value (NaN) into the literal
# string "nan", which dropna() can no longer detect. Dropping first removes
# rows with a missing tweet as well as rows whose title found no match in
# emoji_map during the left merge.
#
# Building the result as one chain (no column assignment on a sliced frame)
# also avoids pandas' SettingWithCopyWarning.
final_merged_df = (
    final_merged_df[['text', 'title', 'ucode_short', 'ucode']]
    .dropna()
    .astype({'text': str, 'title': str, 'ucode_short': str})
)

In [6]:
def clean_text(text):
    """Normalise a single tweet into lowercase ASCII letters, digits and spaces.

    Steps, in order: lowercase the text, remove @mentions, remove emojis,
    remove every character that is not an ASCII letter, digit or whitespace,
    then collapse whitespace runs into one space and trim both ends.

    Parameters
    ----------
    text : str
        Raw tweet text. Must already be a string (``.lower()`` is called on it).

    Returns
    -------
    str
        The cleaned tweet; may be empty if nothing survives the filtering.
    """
    lowered = text.lower()
    # a mention is '@' followed by one or more word characters (ex. @username)
    no_mentions = re.sub(r'@[\w]+', '', lowered)
    no_emojis = emoji.replace_emoji(no_mentions, replace='')
    # characters are deleted, not replaced by a space, so "egg-citing" -> "eggciting"
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', no_emojis)
    # squeeze newlines/tabs/multiple blanks into single spaces
    return re.sub(r'\s+', ' ', alnum_only).strip()

# run clean_text over every tweet and store the result next to the raw text
final_merged_df['cleaned_text'] = final_merged_df['text'].map(clean_text)

In [7]:
# preview the merged frame, including the cleaned_text column added by clean_text
final_merged_df

Unnamed: 0,text,title,ucode_short,ucode,cleaned_text
0,Text,egg,1f95a,🥚,text
1,Happy Easter😀🐇🥚,egg,1f95a,🥚,happy easter
2,@elonmusk @teslaownersSV #eggs Easter eggs tod...,egg,1f95a,🥚,eggs easter eggs today will surely be deliciou...
3,We hope everyone has a Hoppy Easter! 😉🐰🥚🐣🌷\n\n...,egg,1f95a,🥚,we hope everyone has a hoppy easter happyeaste...
4,🐰🥚 Get ready to hop into the most egg-citing d...,egg,1f95a,🥚,get ready to hop into the most eggciting digit...
...,...,...,...,...,...
840037,Happy Easter 🐣 💓 from my brother ❤️ Armen Grig...,hatching chick,1f423,🐣,happy easter from my brother armen grigoryan s...
840038,"Have a great finish to your weekend, Happy Eas...",hatching chick,1f423,🐣,have a great finish to your weekend happy east...
840039,"Beef ribs, smothered turkey wings, cheesy pota...",hatching chick,1f423,🐣,beef ribs smothered turkey wings cheesy potato...
840040,Happy Easter! Hope you all have an amazing day...,hatching chick,1f423,🐣,happy easter hope you all have an amazing day ...


In [8]:
# persist the final merged dataframe as a csv;
# index=False keeps the row index out of the file
output_csv_path = "merged_emoji_data.csv"
final_merged_df.to_csv(output_csv_path, index=False)