# Emojis Speak More than Words

GOAL: 
    1. give an "issue word" as an input (ex. ocasio, climate change) and find the most related emoji
    to kinda grasp people's opinions
    2. give any word or a saying and get a emoji that is most related ex. sparkle --> ✨

Let's make a columns for emojis and its corresponding tweets

In [1]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict

In [2]:
import re
import nltk
import nltk.tokenize as tk
import en_core_web_sm
import string
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/sara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords

nlp = en_core_web_sm.load()

stopwords = stopwords.words('english')

In [4]:
tweets = list(pickle.load(open('./data/yay_moji.pkl','rb')))
mojis = pd.read_pickle('./data/df_emoji.pkl')

In [5]:
tweets[1]

'RT @Fancy2Nancy3: 🚨 ATTN  PATRIOTS 🚨 \nPlease Retweet &amp; Follow        🇺🇸@Commonm69164249🇺🇸        \n    🎉 Help Reach 🎉\n🔥5K FOLLOWERS 🔥 \n🇺🇸 A…'

In [6]:
stopwords.extend(['\'s','’s','rt','…','️','...'])

In [7]:
# Clean text 
punctuations = string.punctuation

# Define function to cleanup text by removing personal pronouns, stopwords, and puncuation
def cleanup_text(docs, logging=False):
    texts = []
    counter = 1
    for doc in docs:
        if counter % 1000 == 0 and logging:
            print("Processed %d out of %d documents." % (counter, len(docs)))
        counter += 1
        doc = nlp(doc, disable=['parser', 'ner'])
        tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
        tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
        tokens = ' '.join(tokens)
        tokens = re.sub('@[^\s]+','', tokens)
        texts.append(tokens)
    return pd.Series(texts)

In [21]:
#tweets = pd.DataFrame(tweets, columns=['tweet'])
#tw = [word for word in tweets['tweet']]

# Clean up all text
tw_clean = cleanup_text(tweets)

In [9]:
letters = '🇦 🇧 🇨 🇩 🇪 🇫 🇬 🇭 🇮 🇯 🇰 🇱 🇲 🇳 🇴 🇵 🇶 🇷 🇸 🇹 🇺 🇻 🇼 🇽 🇾 🇿'.split()

In [10]:
flags = mojis['unichar'][1458:]

In [24]:
# change letters to flags
def fix_flags(tweets):
    fixed = []
    for tweet in tweets:
        for l in letters:
            if l in tweet:
                tweet = re.sub(l+" ", l, tweet)
        fixed.append(tweet)
    return(fixed)

In [25]:
tw_clean_flags = fix_flags(tw_clean)

In [26]:
# put space after flag emojis
def flags_space(tweets):
    fixed = []
    for tweet in tweets:
        for l in flags:
            if l in tweet:
                tweet = re.sub(l, l+" ", tweet)
        fixed.append(tweet)
    return(fixed)

In [27]:
tw_cleaned = flags_space(tw_clean_flags)

In [29]:
# complete list of emojis
from emoji import UNICODE_EMOJI

In [86]:
emojis = list(UNICODE_EMOJI.keys())

In [54]:
def extract_mojis(tweets):
    emoji = defaultdict(list)

    for i, tweet in enumerate(tweets):
        for word in tweet.split():
            if word in emojis:
                emoji['emoji'].append(word)
                emoji['index'].append(i)
    
    # delete overlapping emojis in a tweet
    emoji = pd.DataFrame(emoji).drop_duplicates()
    
    return(emoji)

In [55]:
extracted = extract_mojis(tw_cleaned)

In [71]:
extracted

Unnamed: 0,emoji,index
0,😂,0
1,🚨,1
3,🇺🇸,1
5,🎉,1
7,🔥,1
10,🤯,2
11,✌,3
13,⚽,3
15,🔴,3
17,▶,3


In [95]:
t = tw_cleaned[1]
t = t.split()
w = []
for i in t:
    if i not in list(extracted['emoji']):
        w.append(i)
' '.join(w)

'attn patriots please retweet amp follow help reach 5 k followers'

In [98]:
def remove_emojis(tweets):
    for tweet in tweets:
        tweet = tweet.split()
        words = []
        no_emojis = []
        
        for word in tweet:
            if word not in list(extracted['emoji']):
                words.append(i)
        no_emojis.append(' '.join(words))
    return(no_emojis)

In [None]:
remove_emojis(tw_cleaned)

In [74]:
extracted['emoji']

0        😂
1        🚨
3       🇺🇸
5        🎉
7        🔥
10       🤯
11       ✌
13       ⚽
15       🔴
17       ▶
19       😍
20       🥰
21       😂
25       😍
26      🇸🇦
27       🤤
28       🔥
29       🍉
30       🍓
31       🍍
32       🥭
33       🍒
34       🥒
35       😍
36       💙
39      🇺🇸
42       🎵
43      🇺🇸
45       🤯
46       🤟
        ..
8245     🔊
8246    🇺🇸
8247     😂
8248     👸
8250     💕
8251    🇧🇷
8252    🇺🇸
8253     ❤
8254     💚
8255     🤔
8256     💗
8257     💖
8258     💜
8259     ♥
8260     🌟
8261     🙏
8263     💥
8266     ®
8268     🔥
8269     😂
8270     ♾
8271     🤭
8272     💪
8273     🤯
8274     😈
8275     ❤
8276     🌊
8277     🏏
8278     😎
8281     ⭐
Name: emoji, Length: 5930, dtype: object

In [57]:
no_emojis = []
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
for tweets in tw_clean:
    no_emojis.append(emoji_pattern.sub(r'', tweets)) # no emoji

In [58]:
no_emojis

['john daly actually ride cart major  man live good life',
 '  attn patriots  please retweet amp follow       help reach   5 k followers   ',
 'actual baffle people find fully acceptable make facebook instagram pet \U0001f92f',
 ' beinsport ✌ ✌ النٌصر_البٌاطًن ⚽ ⚽  الهلٌال_الشبُاًب ⚽ live stream link ▶ 𝐋𝐢𝐯𝐞 𝐇𝐞𝐫𝐞&gt;&gt;&gt ▶ https://',
 'show church wedding dress  \U0001f970 https://t.co/hzop8qxqfn',
 'oh shit go     https://t.co/rxbvwbqo2v',
 ' love madeleine fan ',
 ' clip many video spread watsab celebrate eid saudi   prison soldier officer dan',
 ' 🤤  chamoy lover    \U0001f96d  🥒  2113 bandera rd san antonio tx iceicebabysa https://t.co/0crszsz16n',
 ' hello k9 strong busy productive day support k9 k9livesmatter         ',
 ' route preview trailer mozart check  ikevamp ikemenvampire https://t.co/eceo6aab1h',
 '   usa tickets tomorrow   https://t.co/bousnphegd',
 ' absolutely unreal two people meet twitch chat get married today \U0001f92f congrats  su',
 ' long day draining thankfu

In [274]:
pd.DataFrame({'tweets': tw_clean})

Unnamed: 0,tweets
0,john daly actually ride cart major 😂 man live ...
1,🚨 attn patriots 🚨 please retweet amp follow 🇺...
2,actual baffle people find fully acceptable mak...
3,beinsport ✌ ✌ النٌصر_البٌاطًن ⚽ ⚽ 🔴 الهلٌال_ا...
4,show church wedding dress 😍 🥰 https://t.co/hzo...
5,oh shit go 😂 😂 😂 😂 https://t.co/rxbvwbqo2v
6,love madeleine fan 😍
7,clip many video spread watsab celebrate eid s...
8,🤤 🔥 chamoy lover 🍉 🍓 🍍 🥭 🍒 🥒 😍 2113 bandera r...
9,hello k9 strong busy productive day support k...
