In [1]:
import pandas as pd
import bar_chart_race as bcr
from pyspark.sql import SparkSession

In [2]:
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

In [3]:
def get_emoji_name(emoji):
    """Look up the name of an emoji on emojipedia.org.

    Requests ``https://emojipedia.org/<emoji>`` and returns the text of the
    page's first ``<h1>`` element with its first whitespace-separated token
    dropped (presumably the emoji glyph that leads the heading -- confirm
    against the live page).

    Args:
        emoji: The emoji string to look up.

    Returns:
        The remaining heading words joined by single spaces.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: If the returned page contains no ``<h1>`` element.
    """
    print(emoji)  # progress trace: one line per lookup
    # Percent-encode the emoji so reserved characters (e.g. the '#' or '*'
    # in keycap emojis) stay in the path instead of being read as URL syntax.
    url = 'https://emojipedia.org/%s' % quote(emoji, safe='')
    # A timeout keeps one stalled connection from hanging the whole notebook.
    response = requests.get(url, timeout=10)
    # Fail loudly rather than parsing an error page's heading as a name.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    heading = soup.find('h1')
    if heading is None:
        raise ValueError('no <h1> heading found at %s' % url)
    return ' '.join(heading.text.split()[1:])

In [4]:
# Spark session for this notebook: 512 MB of memory and one core each for
# the driver and the executor, and two shuffle partitions.
_spark_conf = {
    "spark.driver.memory": "512m",
    "spark.driver.cores": "1",
    "spark.executor.memory": "512m",
    "spark.executor.cores": "1",
    "spark.sql.shuffle.partitions": "2",
}

_builder = SparkSession.builder.appName("twitter-emoji-analysis")
for _key, _value in _spark_conf.items():
    # Settings are applied in the order they are listed above.
    _builder = _builder.config(_key, _value)

spark = _builder.getOrCreate()

In [5]:
# Read the parquet dataset into a Spark DataFrame.
parquet_path = "/dataset/streaming.parquet"
spark_df = spark.read.parquet(parquet_path)

# Expose the DataFrame to Spark SQL under the name `twitter_emojis`.
spark_df.createOrReplaceTempView("twitter_emojis")

In [8]:
# Top 10 emojis per hour bucket, ranked by number of distinct tweets.
#
# * grouped_emojis: counts distinct tweet_id per (bucket, emoji), where the
#   bucket is `ds` plus the first two characters of `hour`. Only rows from
#   2021-10-10 with hour >= '12' onwards are kept, and only rows whose
#   `hour` matches the five-character pattern '__:__'.
# * ranked_emojis: numbers the emojis inside each bucket by tweet count.
#   `emoji` is the secondary sort key so that ties are broken the same way
#   on every run; ordering by `tweets` alone leaves row_number() free to pick
#   any of the tied emojis, which makes the top-10 cut non-reproducible.
#
# The result is collected to the driver as a pandas DataFrame with the
# columns time, emoji, tweets and rank.
df = spark.sql('''
with grouped_emojis as (
select 
    ds || ' ' || left(hour,2) as hour,
    emoji,
    count(distinct tweet_id) tweets
from twitter_emojis
where (ds > '2021-10-10' or (ds == '2021-10-10' and hour >= '12')) and cast(hour as string) like '__:__'
group by 1,2
), ranked_emojis as (
    select 
    *,
    row_number() over (partition by hour order by tweets desc, emoji) rank
from grouped_emojis
)
select hour as time,
    emoji,
    tweets,
    rank
from ranked_emojis
where rank <= 10
''').toPandas()

In [9]:
# Resolve each distinct emoji to its name once, in order of first appearance.
unique_emojis = df["emoji"].drop_duplicates().tolist()
emoji_dict = dict(zip(unique_emojis, map(get_emoji_name, unique_emojis)))

👏
🔥
🗣
⚽
🎶
🙌
🟨
🟦
💦
📢
❤
🥵
😈
🤤
🤫
😍
😂
💛
👉
‼️
🇸🇪
☎️
📊
❤️
💙
🔴
🚨
🍆
✨
♀️
😁
🙋
🛳️
🐎
🎁
🎫
➡️
🏆
💧
➕
👨‍👩‍👧
👀
📽️
🥰
🗳️
🇵🇾
🔹
🏴
🔵
🤣
😌
🌟
👤
😭
😸
💬
💭
🐬
🍰


In [10]:
# Attach the looked-up name to every row; emojis missing from the
# dictionary get None.
df["emoji_name"] = df["emoji"].map(emoji_dict.get)

In [11]:
%%capture --no-display
data = pd.pivot_table(
    index="time",
    columns="emoji_name",
    values="tweets",
    data = df
).fillna(0)

bcr.bar_chart_race(
    df=data,
    n_bars=10,
    orientation='h',
    sort='desc',
    title='Top 10 emojis for tweets in CABA containing "Alberto Fernandez"',
    filename='bcr_alberto_fernandez.mp4'
)

In [12]:
%%capture --no-display
data = pd.pivot_table(
    index="time",
    columns="emoji_name",
    values="tweets",
    data = df
).fillna(0).cumsum()

bcr.bar_chart_race(
    df=data,
    n_bars=10,
    orientation='h',
    sort='desc',
    title='Top 10 emojis for tweets in CABA containing "Alberto Fernandez"',
    filename='bcr_cumulative_alberto_fernandez.mp4'
)