# Emojis Speak More than Words

GOAL: 
    1. give an "issue word" as an input (ex. ocasio, climate change) and find the most related emoji
    to kinda grasp people's opinions
    2. give any word or a saying and get a emoji that is most related ex. sparkle --> ✨

Let's make a columns for emojis and its corresponding tweets

In [1]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict

In [2]:
import re
import nltk
import nltk.tokenize as tk
import en_core_web_sm
import string
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/sara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords

nlp = en_core_web_sm.load()

stopwords = stopwords.words('english')

In [66]:
tweets = list(pickle.load(open('./data/yay_moji.pkl','rb')))
mojis = pd.read_pickle('./data/df_emoji.pkl')

In [5]:
tweets[1]

'RT @Fancy2Nancy3: 🚨 ATTN  PATRIOTS 🚨 \nPlease Retweet &amp; Follow        🇺🇸@Commonm69164249🇺🇸        \n    🎉 Help Reach 🎉\n🔥5K FOLLOWERS 🔥 \n🇺🇸 A…'

In [6]:
stopwords.extend(['\'s','’s','rt','…','️','...','follow', 'dm', 'https', 'ur', 'll' ,'amp', 'subscribe', 'don', 've', 'retweet', 'im', 'http'])

In [7]:
# Clean text 
punctuations = string.punctuation

# Define function to cleanup text by removing personal pronouns, stopwords, and puncuation
def cleanup_text(docs, logging=False):
    texts = []
    counter = 1
    for doc in docs:
        if counter % 1000 == 0 and logging:
            print("Processed %d out of %d documents." % (counter, len(docs)))
        counter += 1
        doc = nlp(doc, disable=['parser', 'ner'])
        tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
        tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
        tokens = ' '.join(tokens)
        tokens = re.sub('@[^\s]+','', tokens)
        texts.append(tokens)
    return pd.Series(texts)

In [8]:
#tweets = pd.DataFrame(tweets, columns=['tweet'])
#tw = [word for word in tweets['tweet']]

# Clean up all text
tw_clean = cleanup_text(tweets)

In [9]:
letters = '🇦 🇧 🇨 🇩 🇪 🇫 🇬 🇭 🇮 🇯 🇰 🇱 🇲 🇳 🇴 🇵 🇶 🇷 🇸 🇹 🇺 🇻 🇼 🇽 🇾 🇿'.split()

In [10]:
flags = mojis['unichar'][1458:]

In [11]:
# change letters to flags
def fix_flags(tweets):
    fixed = []
    for tweet in tweets:
        for l in letters:
            if l in tweet:
                tweet = re.sub(l+" ", l, tweet)
        fixed.append(tweet)
    return(fixed)

In [12]:
tw_clean_flags = fix_flags(tw_clean)

In [13]:
# put space after flag emojis
def flags_space(tweets):
    fixed = []
    for tweet in tweets:
        for l in flags:
            if l in tweet:
                tweet = re.sub(l, l+" ", tweet)
        fixed.append(tweet)
    return(fixed)

In [14]:
tw_cleaned = flags_space(tw_clean_flags)

In [15]:
# complete list of emojis
from emoji import UNICODE_EMOJI

In [16]:
emojis = list(UNICODE_EMOJI.keys())

In [17]:
def extract_mojis(tweets):
    emoji = defaultdict(list)

    for i, tweet in enumerate(tweets):
        for word in tweet.split():
            if word in emojis:
                emoji['emoji'].append(word)
                emoji['index'].append(i)
    
    # delete overlapping emojis in a tweet
    emoji = pd.DataFrame(emoji).drop_duplicates()
    
    return(emoji)

In [18]:
extracted = extract_mojis(tw_cleaned)

In [46]:
def remove_emojis(tweets):
    no_emojis = []
    
    for tweet in tweets:
        tweet = tweet.split()
        words = []
        
        for word in tweet:
            if word not in list(extracted['emoji']):
                words.append(word)
        words = ' '.join(words)
        no_emojis.append(words)
    return(no_emojis)

In [49]:
tw_no_emo = remove_emojis(tw_cleaned)

There are still some unremoved emojis but I will come back to that later. Maybe try tdidf...

In [87]:
tweets_df = pd.DataFrame(tw_no_emo, columns = ['tweets'])

In [134]:
tweets_merged = pd.merge(extracted, tweets_df.reset_index(), on='index', how='left')

In [179]:
# emojis with at least 10 tweets
enough_emoji = tweets_merged.groupby('emoji').count()[tweets_merged.groupby('emoji').count()['tweets']>=10]
enough_emoji = pd.merge(enough_emoji.reset_index()[['emoji']], tweets_merged, on='emoji', how='left')

In [190]:
tweets_merged.head()

Unnamed: 0,emoji,index,tweets
0,😂,0,john daly actually ride cart major man live go...
1,🚨,1,attn patriots please retweet amp follow help r...
2,🇺🇸,1,attn patriots please retweet amp follow help r...
3,🎉,1,attn patriots please retweet amp follow help r...
4,🔥,1,attn patriots please retweet amp follow help r...


In [137]:
tweets_merged['emoji'].values

array(['😂', '🚨', '🇺🇸', ..., '🏏', '😎', '⭐'], dtype=object)

In [111]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF
import math
from collections import Counter
import re

def find_emoji(word,df_emoji):
    options = []
    for i, name in enumerate(df_emoji['short_name']):
        if word in name:
            options.append((name,df_emoji['unichar'][i]))
            print(name,df_emoji['unichar'][i])
    return options

def get_emojis(tweet_lst,df_emoji):
    emoji_idx = []
    emoji_char =[]
    for tweet in tweet_lst:
        for i,uni in enumerate(df_emoji['unichar']):
            if uni in tweet:
                 emoji_idx.append(i)
                 emoji_char.append(uni)
    return emoji_idx, emoji_char

def get_emojis_by_tweet(tweet_lst,df_emoji):
    by_tweet = []

    for tweet in tweet_lst:
        emoji_char =[]
        for i,uni in enumerate(df_emoji['unichar']):
            if uni in tweet:
                 emoji_char.append(uni)
        by_tweet.append(emoji_char)
    return by_tweet

#this seems to get some emojis that i dont but also missed some that i do. It also get duplicates per tweet that i dont
# def get_emojis_2(tweet_lst):
#     emojis = []
#     for tweet in tweet_lst:
#         emoji = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
#         emojis.append(emoji.findall(tweet))
#     return emojis

def print_emoji(tweet,emoji_char):
    for uni in emoji_char:
        if uni in tweet:
            print(uni)


if __name__ == '__main__':
    tweets = np.array(list(pickle.load(open('./data/yay_moji.pkl','rb'))))
    # type is list

    emojis = pd.read_pickle('./data/df_emoji.pkl')
    # type is DataFrame



    # ------------- tfidf
    stopwords = set(list(ENGLISH_STOP_WORDS) + ['rt', 'follow', 'dm', 'https', 'ur', 'll' ,'amp', 'subscribe', 'don', 've', 'retweet', 'im', 'http'])

    tfidf = TfidfVectorizer(max_features=10000, max_df=0.05, min_df=0.001, stop_words = stopwords, ngram_range = (1,2))

    #lemmetizing need to consider cleaning the tweets myself

    tfidf_tweets = tfidf.fit_transform(tweets)
    bag = np.array(tfidf.get_feature_names())

    # -------------- NMF
    k = 10
     #number of groups
    nmf2 = NMF(n_components = k)
    nmf2.fit(tfidf_tweets)
    W = nmf2.transform(tfidf_tweets) #len(yay_moji,k)
    H = nmf2.components_ #k,len(yay_moji)


    # --------------- Printing Top 10
    tweet_lst = []
    top = 10
    tweet_in_group_thresh = .005 #score thresh if we consider that tweet as part of that group
    for group in range(k):
        #idx of the top ten words for each group
        i_words = np.argsort(H[group])[::-1][:top]
        words = bag[i_words]

        # idx of the top ten tweets for each group
        i_emojis = np.argsort(W[:,group])[::-1][:top]
        # most common 10 emojis for each group

        print('-'*10)
        print('Group:',group)
        counted_tweets = np.argwhere(W[:,group] > tweet_in_group_thresh)
        print(counted_tweets.shape[0], 'tweets')
        for word in words:
            print('-->',word)
        for i_tweet in i_emojis:
            print_emoji(tweets[i_tweet], emojis['unichar'])
            tweet_lst.append(tweets[i_tweet])
        ind, emo_lst = get_emojis(tweets[i_emojis],emojis)
        # find percentage of emoji per group
        most_emoji, how_many = Counter(emo_lst).most_common(1)[0]
        score = float(how_many)/top
        # print score #score is not perfect - similar emojis and repeat in the same tweet
        print('\n')

    # --------------- printing most common emojis
    most_common = 50
    b,all_emojis = get_emojis(tweets,emojis)
    count = Counter(all_emojis).most_common(most_common)
    unicode_top = []
    for emo, i in count:
        print(emo,i)
        for j, char in enumerate(emojis['unichar']):
            if char == emo:
                unicode_top.append(emojis['unichar'][j])


# test stuff
    jan = get_emojis_by_tweet(tweets[0:100],emojis)
    for tweet in jan:
        for emo in tweet:
            print(emo,)
        print('')
    # name_of = find_emoji('heart',emojis)


    '''
    to do's
    --> how big are the groups? do a most common
    --> get a better score system
    --> allow for tweets with multiple emojis
    --> sub set for tweets with a specific emoji
    --> commonly combined emojis
    --> naive bayes
        prediction accuracy between emojis for how similar they are
    whats the purpose:
    --> to help use emojis as labels for tweets
    '''

----------
Group: 0
256 tweets
--> love
--> yes
--> gonna
--> cute
--> life
--> people
--> amazing
--> guys
--> friends
--> page
💛
🇺🇸
🙊
♥
😍
🤯
♥
🇺🇸
😁
💯
🐶
😂
🇧🇩
🇩🇪
🇬🇧
😍
☺
🤗
🙈
🙉
🙊
💕
❤
💛
💙
💜
💫
🐶
🦄
🐻
🐣
🌸
🌺
🌞
⭐
🌈
☃
✨
😍
💃
😍


----------
Group: 1
749 tweets
--> just
--> need
--> think
--> man
--> really
--> shit
--> woke
--> wow
--> just like
--> little
🤯
😂
🇺🇸
🤢
🌱
🙊
😂
❤
🙏
🇺🇸
💎
🇩🇪
🤯


----------
Group: 2
902 tweets
--> like
--> people
--> got
--> just like
--> feel
--> hurt
--> cute
--> look
--> change
--> feel like
😂
😱
🤯
🙈
🙊
💙
🙊
😂
☺
💜
🐯
😅
😢
😂
🤣


----------
Group: 3
778 tweets
--> good
--> morning
--> good morning
--> know
--> better
--> news
--> luck
--> night
--> hope
--> post_abortive87
😩
👍
🇺🇸
⭐
🥺
🥺
🙊
😂
🙄
😎
🤣
😂
🏁
🇺🇸
🌹
🇰🇷
🇸🇦


----------
Group: 4
1091 tweets
--> trump
--> realdonaldtrump
--> president
--> america
--> god
--> president trump
--> 2020
--> people
--> god bless
--> bless
🇺🇸
🇺🇸
👍
👍
🇺🇸
🤦
🤦‍♀️
♀
😂
🇺🇸
❤
👍
🙏
🇺🇸
🇺🇸
🇺🇸


----------
Group: 5
495 tweets
--> vegan
--> food
--> let
--> frie

In [189]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split


class Emoji(object):

    def __init__(self):
        # fit the Naive Bayes
        np.random.seed(42)
        self.emojis = pd.read_pickle('./data/df_emoji.pkl')

    def fit(self):
        '''
        # ------- this part needs work
        try:
            self.labeled_tweets = pd.read_pickle('../data/labeled.pkl')
            print('it worked')
        except:
            from label_tweets import label_tweets
            tweets = np.array(list(pickle.load(open('./data/yay_moji.pkl','rb'))))
            self.by_emoji,self.labeled_tweets = label_tweets(tweets,self.emojis,top = 50, save = True)
        '''
        self.y = tweets_merged['emoji']
        self.X = tweets_merged['tweets'].values


    def model(self, max_df_ = .8, min_df_ = .001, ngram = (1,2)):

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X,self.y)

        stopwords = set(list(ENGLISH_STOP_WORDS) + ['rt', 'follow', 'dm', 'https', 'ur', 'll' ,'amp', 'subscribe', 'don', 've', 'retweet', 'im', 'http','lt'])

        # fit the tfidf or CountVectorizer
        self.tfidf = TfidfVectorizer(max_features=10000, max_df = max_df_, min_df=min_df_, stop_words = stopwords, ngram_range = ngram)

        self.tfidf.fit(self.X_train)
        self.vector = self.tfidf.transform(self.X_train)

        # --> add the emoji name to bag of words for each emoji
        self.bag = np.array(self.tfidf.get_feature_names())

        self.nb = GaussianNB()
        self.nb.fit(self.vector.todense(), self.y_train)

    def internal_predict(self, print_side_by_side = True):
        test_tfidf = self.tfidf.transform(self.X_test)
        predicted = self.nb.predict(test_tfidf.todense())
        print('labeled')
        acc = np.mean(self.y_test == predicted)

        print('Test accuracy =',acc)
        print('')

        if print_side_by_side:
            for true,predict in zip(self.y_test,predicted):
                print('-->',true,predict)


    def predict(self,text):
        top_n = 3
        test_tfidf = self.tfidf.transform([text])
        probs = self.nb.predict_proba(test_tfidf.todense())
        probs = probs.flatten()
        above_0 = np.argwhere(probs>0).flatten()
        above_0 = np.sort(above_0)[::-1]
        print('-->',text,'=',)
        
        for i in above_0[:5]:
            print(self.nb.classes_[i],' ', probs.flatten()[i],' ',)
        print('')
        return(probs)

    def print_top_words(self,top_n_words=5):
        # printing top words for each emoji
        print('')
        print('----- Top {} words for each Emoji in Train set'.format(top_n_words))
        print('-'*60)
        for i in range(len(self.nb.classes_)):
            top =  self.bag[self.nb.theta_[i].argsort()[::-1]][:top_n_words]
            print(self.nb.classes_[i],' -->',top)
        print('')



if __name__ == '__main__':

    # run clean_tweets
    # run labeled_tweets

    emo = Emoji()
    emo.fit()
    emo.model()
    b = emo.predict('ocasio')
    c = emo.predict('climate change')
    d = emo.predict('vegan')
    e = emo.predict('earth')
    e = emo.predict('greta')
    e = emo.predict('korea')
    e = emo.predict('abortion')
    e = emo.predict('love you')
    e = emo.predict('birthday')
    a = emo.predict('i want a divorce')
    e = emo.predict('life')
    e = emo.predict('baby')
    e = emo.predict('basketball')
    e = emo.predict('i need coffee')
    e = emo.predict('la la land')
    e = emo.predict('netflix and chill')
    e = emo.predict('i am thankful to be alive')
    e = emo.predict('ocean')
    e = emo.predict('trump')
    e = emo.predict('plastic')
    e = emo.predict('boyfriend')

    # emo.internal_predict(print_side_by_side = True)
    emo.print_top_words(5)

--> ocasio =
🧹   3.2315936025548407e-40  
🧡   5.134160423670935e-69  
🧐   3.7119745947176606e-35  
🦓   0.11111088111755894  
🦋   9.340148717550855e-83  

--> climate change =
🤒   6.634989776190943e-33  
🚗   5.1888566027721555e-96  
🗳   2.294988066281684e-52  
👇   9.257689864392038e-303  
🎓   2.524296308827204e-144  

--> vegan =
🤦   3.599614923792856e-232  
😢   4.0794779157928546e-75  
😡   1.127158024820009e-168  
😔   1.0662844337858481e-198  
😋   6.122445647588763e-110  

--> earth =
🤟   2.028946692897737e-36  
🐋   1.0  
🏟   1.3189489143397244e-51  
🌏   7.057320726225616e-287  

--> greta =
🧹   3.2315936025548407e-40  
🧡   5.134160423670935e-69  
🧐   3.7119745947176606e-35  
🦓   0.11111088111755894  
🦋   9.340148717550855e-83  

--> korea =
🕺   8.480675382235633e-109  
💛   1.8814951513477593e-209  
💔   1.0805641867324546e-120  
👑   1.0  
🎉   4.2086070926140675e-306  

--> abortion =
🤬   2.6287289042468105e-21  
🤝   4.341096738792882e-10  
🤒   8.491163585538909e-29  
😳   1.473566594080

🏌  --> ['learn' 'mind' 'open' 'thing' 'emison eqqc7reh7m']
🏎  --> ['attention' 'woah' 'use' 'real' 'race day']
🏏  --> ['national' 'indvaus' 'woman' 'team' 'unbelievable']
🏐  --> ['intensity' 'talk intensity' 'bepartofthegame' 'intensity portugal'
 'ko2zm2ahyq bepartofthegame']
🏑  --> ['𝐌𝐀𝐓𝐂𝐇 computer' 'elland' 'elli' 'elli tom' 'ellis']
🏒  --> ['sweep' 'finals' 'stanleycup' 'head' 'let']
🏝  --> ['disaster cycl' 'hurricanes wildfire' 'extreme weather' 'extreme' 'flood']
🏞  --> ['ready' 'world' '𝐌𝐀𝐓𝐂𝐇 computer' 'emison eqqc7reh7m' 'engineer']
🏟  --> ['26' 'la' 'memory' 'rose' 'throw']
🏥  --> ['al' 'fight' 'seven' 'continue' 'announce']
🏫  --> ['later' 'month' 'sorry' 'class' 'late']
🏳  --> ['flag' 'amazing' 'country reply' 'reply flag' 'right']
🏴  --> ['country' 'best' 'country reply' 'reply flag' 'el']
🏹  --> ['team usa' 'champion' 'team' 'usa' 'world']
🐇  --> ['bed' 'late' 'come' 'day' 'want']
🐋  --> ['ocean' 'earth' 'today' 'leviathan' 'track monster']
🐎  --> ['black' '63' 'july' 'par

🙏  --> ['thank' 'soul' 'family' 'great' 'country']
🚀  --> ['inside' 'james' 'future' 'meet' 'make']
🚂  --> ['board' 'usa' 'john' 'james' 'support']
🚑  --> ['fb' '𝐌𝐀𝐓𝐂𝐇 computer' 'emison eqqc7reh7m' 'engineer khatputli' 'engineer']
🚒  --> ['age' 'oh wait' 'oh' 'wait' '15']
🚓  --> ['darrell' 'want darrell' 'want' '𝐌𝐀𝐓𝐂𝐇 computer' 'emison']
🚔  --> ['darrell' 'want darrell' 'want' '𝐌𝐀𝐓𝐂𝐇 computer' 'emison']
🚗  --> ['receive' 'al' 'drive' 'line' 'state']
🚜  --> ['america' 'great' '𝐌𝐀𝐓𝐂𝐇 computer' 'emison eqqc7reh7m' 'energy drinks']
🚨  --> ['help' 'great' 'tow' 'emergency' 'reach']
🚩  --> ['follower' '10 20' '20 30' '40 50' '50 60']
🚫  --> ['sex' 'support' 'thing' 'right' 'time']
🚮  --> ['guess' 'hand' 'news' 'good' 'emergency']
🚱  --> ['disaster cycl' 'hurricanes wildfire' 'extreme weather' 'extreme' 'flood']
🚲  --> ['senior' '𝐌𝐀𝐓𝐂𝐇 computer' 'engineer khatputli' 'elland road' 'elli']
🚶  --> ['walk' 'man' '𝐌𝐀𝐓𝐂𝐇 computer' 'emotional' 'engineer']
🚾  --> ['mf' 'album' 'ready' '𝐌𝐀𝐓𝐂𝐇 computer