#Load Library 

In [2]:
## import all necessary packages
import json
import re

import nltk, string
from nltk.collocations import *

import numpy as np
import matplotlib.pyplot as plt

from pandas import *

from collections import defaultdict

from happyfuntokenizing import Tokenizer

import twitterclean as tc

#Load Data

In [3]:
with open('./data/tweets_1M.json','r') as f:
    tweet_df = DataFrame(json.load(f))

#Clean Data (Handle, URL, Emoticon Conversion)

In [4]:
tweet_df = tc.cleanHandle(tweet_df)
tweet_df = tc.cleanURL(tweet_df)
tweet_df = tc.convertEmoticon(tweet_df)
tweet_df.head()

367031 handles replaced
227446 urls replaced
😀   replaced    209 times
😉   replaced   3358 times
😜   replaced    329 times
😆   replaced      0 times
😎   replaced    111 times
😀   replaced   1533 times
😐   replaced     92 times
💔   replaced      0 times
😞   replaced   5760 times
❤   replaced      0 times
😧   replaced    207 times
😛   replaced   1166 times
🐵   replaced      7 times
😠   replaced      0 times
😃   replaced  15658 times
😮   replaced    357 times
😢   replaced    339 times
💋   replaced    287 times
😕   replaced   7435 times
ALL replaced 36848 times


Unnamed: 0,id,lat,lng,text,timeStamp,user_id
0,377652254096228352,37.4461,-121.883557,hdl hey checkout the website: url,Wed Sep 11 04:38:08 +0000 2013,224874450
1,377652255346159616,34.087406,-117.462604,hdl 😪,Wed Sep 11 04:38:08 +0000 2013,312179473
2,377652262325456897,37.356131,-121.842867,i laugh a lot with that line,Wed Sep 11 04:38:10 +0000 2013,54351774
3,377652264682655744,37.364664,-122.009629,sons of anarchy is back on woop woop,Wed Sep 11 04:38:11 +0000 2013,343219606
4,377652271116722176,37.3826,-121.995,Drinking a Fresh Squeezed IPA by hdl @ St. John's Bar &amp; Grill — url,Wed Sep 11 04:38:12 +0000 2013,1569395935


#Emoji Finder

In [5]:
try:
    # Wide UCS-4 build
    highpoints = re.compile(u'['
        u'\U0001F300-\U0001F64F'
        u'\U0001F680-\U0001F6FF'
        u'\u2600-\u26FF\u2700-\u27BF]+', 
        re.UNICODE)
except re.error:
    # Narrow UCS-2 build
    highpoints = re.compile(u'('
        u'\ud83c[\udf00-\udfff]|'
        u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
        u'[\u2600-\u26FF\u2700-\u27BF])+', 
        re.UNICODE)

# Subset Dataframe with Emojis Only (New Dataframe emoij_df)

In [6]:
emoji_list = []
for index, value in enumerate(tweet_df.text):
    if highpoints.search(value):
        emoji_list.append((index, value))
emoji_index = [x[0] for x in emoji_list]
emoji_df = tweet_df.ix[emoji_index]

There's a user id so technically we could identify sets of tweets by users. 

** first stab at counting frequency of different emojis **

##Add a new column that display just the emojis for the text

In [106]:
def emojiExtract(sent):
    return [word for word in tok.tokenize(sent) if is_emoji(word) == 1]

def textExtract(sent):
    return [word for word in tok.tokenize(sent) if is_emoji(word) == 0]

def addEmojiCol(df):
    df['Emoji'] = [emojiExtract(word) for word in df.text]

def addText(df):
    df['only_Text'] = [textExtract(word) for word in df.text]
    
addEmojiCol(tweet_df)
addText(tweet_df)
tweet_df[:10]

Unnamed: 0,id,lat,lng,text,timeStamp,user_id,Emoji,only_Text
0,377652254096228352,37.4461,-121.883557,hdl hey checkout the website: url,Wed Sep 11 04:38:08 +0000 2013,224874450,[],"[hdl, hey, checkout, the, website, :, url]"
1,377652255346159616,34.087406,-117.462604,hdl 😪,Wed Sep 11 04:38:08 +0000 2013,312179473,[😪],[hdl]
2,377652262325456897,37.356131,-121.842867,i laugh a lot with that line,Wed Sep 11 04:38:10 +0000 2013,54351774,[],"[i, laugh, a, lot, with, that, line]"
3,377652264682655744,37.364664,-122.009629,sons of anarchy is back on woop woop,Wed Sep 11 04:38:11 +0000 2013,343219606,[],"[sons, of, anarchy, is, back, on, woop, woop]"
4,377652271116722176,37.3826,-121.995,Drinking a Fresh Squeezed IPA by hdl @ St. John's Bar &amp; Grill — url,Wed Sep 11 04:38:12 +0000 2013,1569395935,[],"[drinking, a, fresh, squeezed, ipa, by, hdl, @, st, ., john's, bar, &, amp, ;, grill, —, url]"
5,377652275147444224,37.756149,-122.152813,I have 8 am classes this quarter ... I need to get this sleep thing together,Wed Sep 11 04:38:13 +0000 2013,399164195,[],"[i, have, 8, am, classes, this, quarter, ..., i, need, to, get, this, sleep, thing, together]"
6,377652275885654016,38.402054,-121.476074,Why is Milgrim's eyes always red af..?,Wed Sep 11 04:38:13 +0000 2013,170950783,[],"[why, is, milgrim's, eyes, always, red, af, .., ?]"
7,377652281480843264,34.080237,-118.39016,"#FightOn! 😛 MT hdl : Eh, you're down south. Lol RT “ hdl : I'm here for a few days! 😃 hdl hdl ”",Wed Sep 11 04:38:15 +0000 2013,105560657,"[😛, 😃]","[#fighton, !, mt, hdl, :, eh, ,, you're, down, south, ., lol, rt, “, hdl, :, i'm, here, for, a, few, days, !, hdl, hdl, ”]"
8,377652282021933056,37.477945,-122.227526,"Asked him what he did today and got a response like this ""I was being black"" daaaa fuck, #DumbBitch #GoAwayDen",Wed Sep 11 04:38:15 +0000 2013,330249663,[],"[asked, him, what, he, did, today, and, got, a, response, like, this, "", i, was, being, black, "", daaaa, fuck, ,, #dumbbitch, #goawayden]"
9,377652282844012544,37.274621,-121.742097,hdl my moms just nosey lol,Wed Sep 11 04:38:15 +0000 2013,46575217,[],"[hdl, my, moms, just, nosey, lol]"


# Count Emoji Per Text (1.12 Average Emoji/Text, 53.2 Average Text Length)

In [7]:
# List of functions for emoji search
faces = re.compile(u'['
        u'\U0001F600-\U0001F64F]',
        re.UNICODE)
# Function that takes a list of text and return text that contains just faces
def just_face(text):
    return (faces.findall(text))
# Function that take a list of text and return text with just emojis
def just_emojis(text):
    return (highpoints.findall(text))
# Function that a list of text and return the number of emojis in the text.
def count_emojis(text):
    return len(highpoints.findall(text))

In [8]:
emoji_df["Emoji Count"] = emoji_df["text"].apply(count_emojis)
emoji_df["Text Length"] = emoji_df["text"].apply(lambda x: len(x))
emoji_df = emoji_df[['id', 'text', 'timeStamp', 'user_id', 'Emoji Count', 'Text Length']]
emoji_df.describe()

Unnamed: 0,id,user_id,Emoji Count,Text Length
count,208252.0,208252.0,208252.0,208252.0
mean,3.822541e+17,519865100.0,1.122659,53.161506
std,2378889000000000.0,461205000.0,0.603226,33.193435
min,3.776523e+17,21.0,1.0,1.0
25%,3.801828e+17,149946000.0,1.0,27.0
50%,3.823091e+17,385250900.0,1.0,46.0
75%,3.843753e+17,754669700.0,1.0,75.0
max,3.863382e+17,1934553000.0,70.0,214.0


# Emoji Distribution

In [None]:
reset_df = emoji_df.reset_index(drop=True)
emoji_array = [reset_df.loc[[index]].text.apply(just_emojis) for index in range(len(reset_df))]
full_list = []
for item in emoji_array:
    for emoji in item:
        for sinlge in emoji:
            full_list.append(sinlge)

In [9]:
full_dict = defaultdict(int)
for item in full_list:
    full_dict[item] += 1

### 30469 Unique Emoji 

In [10]:
len(unique(full_dict.keys()))

30469

In [11]:
for item in sorted(full_dict.items(), key=lambda x:x[1], reverse=True)[:50]:
    print(item[0], (float(item[1])/len(full_list))*100)

😃 7.055753972497274
😕 3.5458414422891846
😂 3.268247823948331
❤ 3.138219380226267
😞 2.920935007164396
😍 2.202784490686285
😉 2.0162963279796404
😒 1.7801920485895764
😏 1.5299728394533674
☺ 1.5060202313993027
👌 1.4666695181676255
😔 1.4251801792168353
😊 1.3854017408413353
😭 1.3700036356637226
😘 1.2900190337689001
😂😂😂 1.219016659894352
😂😂 1.1702559934985777
😳 1.0992536196240295
😁 1.093693192754336
😩 1.007292713702175
💕 0.8862464980003849
😀 0.8242263521461108
🙌 0.7378258730939499
✌ 0.7245663936354498
💁 0.6894929318419983
♫ 0.628755961419192
😛 0.6270450608439018
😋 0.6210569088303856
😌 0.6210569088303856
😑 0.6005261019269017
🎶 0.5923993241942728
🙏 0.5530486109625954
😎 0.5513377103873052
😂😂😂😂 0.5393614063602729
😜 0.4474005004384183
♥ 0.4388459975619667
👍 0.4379905472743215
😢 0.4354241964113861
👏 0.4277251438225796
😐 0.40505571119998296
😍😍😍 0.3605722962424346
😄 0.34218011505806367
♡ 0.3400414893389508
🙈 0.3246433841613379
💋 0.32164930815457987
😴 0.3036848521140315
😍😍 0.30069077610727346
✋ 0.29384

# Emoji Face Distribution

In [12]:
face_array = [reset_df.loc[[index]].text.apply(just_face) for index in range(len(reset_df))]

face_list = []
for item in face_array:
    for emoji in item:
        for sinlge in emoji:
            face_list.append(sinlge)

print("number of tweets with emojis {0}".format(len(full_list)))
print("number of tweets with faces  {0}".format(len(face_array)))
print("percentage of tweets with emojis with faces {0}%".format(round((float(len(face_array))/len(full_list)*100),1)))

number of tweets with emojis 233795
number of tweets with faces  208252
percentage of tweets with emojis with faces 89.1%


In [13]:
face_dict = defaultdict(int)
for item in face_list:
    face_dict[item] += 1

In [14]:
for item in sorted(face_dict.items(), key=lambda x:x[1], reverse=True)[:50]:
    print(item[0], (float(item[1])/len(full_list))*100)

😂 21.143737034581576
😍 8.991210248294445
😭 7.4484056545264
😃 7.385957783528305
😘 4.1878568831668765
😕 3.733185055283475
😞 3.2336020872987015
😏 3.092025064693428
😊 2.989371030176009
😒 2.965846147265767
😩 2.8405226801257513
😁 2.748134049060074
😉 2.5116020445261875
😔 2.3631814196197523
🙌 2.2836245428687523
😳 2.1458970465578817
🙏 1.4585427404349964
😋 1.3712868110951901
😑 1.1758164203682713
😌 1.1621292157659489
😎 1.1240616779657393
😡 1.0466434269338525
😴 0.9739301524840137
🙈 0.967514275326675
😀 0.9516884450052396
😱 0.9178981586432557
😫 0.8888128488633203
😛 0.872987018541885
😜 0.8250818024337562
😄 0.8118223229752561
😢 0.7861588143459013
😷 0.7630616565794821
🙊 0.7442417502512886
😐 0.6099360550909985
😤 0.5184028743129665
😈 0.5012938685600633
😪 0.494450266258902
😝 0.48332941251951494
😖 0.4444064244316602
😣 0.44312324900019245
🙋 0.43328557069227314
😅 0.4221647169528861
😬 0.3840971791526765
😓 0.3550118693727411
😠 0.3537286939412734
🙅 0.34731281678393466
😆 0.335764237900725
😻 0.32378793387369276
😰

# Exploring Emoji vs no Emoji (20% Text has Emojis, 15% has face)

###Functions to check whether it's emoji or face

In [30]:
# Functions to check whether there's an emoji in the text, return 1 if true, 0 if false
def is_emoji(text):
    if highpoints.search(text):
        return 1
    else:
        return 0
# Functions to check whether there's a face emoji in the text, return 1 if true, 0 if false
def is_face(text):
    if faces.search(text):
        return 1
    else:
        return 0

In [22]:
tweet_df["is_emoji"] = tweet_df.text.apply(is_emoji)
tweet_df["is_face"] = tweet_df.text.apply(is_face)
tweet_df.head()

Unnamed: 0,id,lat,lng,text,timeStamp,user_id,is_emoji,is_face
0,377652254096228352,37.4461,-121.883557,hdl hey checkout the website: url,Wed Sep 11 04:38:08 +0000 2013,224874450,0,0
1,377652255346159616,34.087406,-117.462604,hdl 😪,Wed Sep 11 04:38:08 +0000 2013,312179473,1,1
2,377652262325456897,37.356131,-121.842867,i laugh a lot with that line,Wed Sep 11 04:38:10 +0000 2013,54351774,0,0
3,377652264682655744,37.364664,-122.009629,sons of anarchy is back on woop woop,Wed Sep 11 04:38:11 +0000 2013,343219606,0,0
4,377652271116722176,37.3826,-121.995,Drinking a Fresh Squeezed IPA by hdl @ St. John's Bar &amp; Grill — url,Wed Sep 11 04:38:12 +0000 2013,1569395935,0,0


In [25]:
tweet_df.describe()

Unnamed: 0,id,lat,lng,user_id,is_emoji,is_face
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,3.822416e+17,37.24198,-121.53343,4961052000.0,0.208252,0.154146
std,2370783000000000.0,1.333404,1.542418,4503597000000.0,0.406058,0.361089
min,3.776523e+17,12.9833,-170.296751,12.0,0.0,0.0
25%,3.801867e+17,37.320896,-122.350934,73821100.0,0.0,0.0
50%,3.822777e+17,37.656377,-122.043733,318878700.0,0.0,0.0
75%,3.843437e+17,37.790622,-121.785433,622968600.0,0.0,0.0
max,3.887526e+17,57.17012,77.5833,4503597000000000.0,1.0,1.0


##Take a look at the imported tokenizer and test it

In [23]:
tok = Tokenizer(preserve_case=False)
samples = (
    u"RT @ #happyfuncoding: this is a typical Twitter tweet😖",
    u"😂😂😂 RT @Yours_Truly3x: Bitch brush yoo mouth; other Web oddities can be an &aacute;cute <em class='grumpy'>pain</em> >:(",
    u"Yay my cat is cuddling🔫 with me tonight❤ +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace."
    )

In [24]:
for s in samples:
        print("======================================================================")
        print(s)
        tokenized = tok.tokenize(s)
        print(list(tokenized))
        print("\n")

RT @ #happyfuncoding: this is a typical Twitter tweet😖
['rt', '@', '#happyfuncoding', ':', 'this', 'is', 'a', 'typical', 'twitter', 'tweet', '😖']


😂😂😂 RT @Yours_Truly3x: Bitch brush yoo mouth; other Web oddities can be an &aacute;cute <em class='grumpy'>pain</em> >:(
['😂', '😂', '😂', 'rt', '@yours_truly3x', ':', 'bitch', 'brush', 'yoo', 'mouth', ';', 'other', 'web', 'oddities', 'can', 'be', 'an', '&', 'aacute', ';', 'cute', "<em class='grumpy'>", 'pain', '</em>', '>:(']


Yay my cat is cuddling🔫 with me tonight❤ +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.
['yay', 'my', 'cat', 'is', 'cuddling', '🔫', 'with', 'me', 'tonight', '❤', '+1 (800) 123-4567', ',', '(800) 123-4567', ',', 'and', '123-4567', 'are', 'treated', 'as', 'words', 'despite', 'their', 'whitespace', '.']




# Unigrams Frequency

In [21]:
tok = Tokenizer(preserve_case=False)
text_list = list(tweet_df.text)
tokenized = [list(tok.tokenize(item)) for item in text_list]
print(tokenized[:10])

[['hdl', 'hey', 'checkout', 'the', 'website', ':', 'url'], ['hdl', '😪'], ['i', 'laugh', 'a', 'lot', 'with', 'that', 'line'], ['sons', 'of', 'anarchy', 'is', 'back', 'on', 'woop', 'woop'], ['drinking', 'a', 'fresh', 'squeezed', 'ipa', 'by', 'hdl', '@', 'st', '.', "john's", 'bar', '&', 'amp', ';', 'grill', '—', 'url'], ['i', 'have', '8', 'am', 'classes', 'this', 'quarter', '...', 'i', 'need', 'to', 'get', 'this', 'sleep', 'thing', 'together'], ['why', 'is', "milgrim's", 'eyes', 'always', 'red', 'af', '..', '?'], ['#fighton', '!', '😛', 'mt', 'hdl', ':', 'eh', ',', "you're", 'down', 'south', '.', 'lol', 'rt', '“', 'hdl', ':', "i'm", 'here', 'for', 'a', 'few', 'days', '!', '😃', 'hdl', 'hdl', '”'], ['asked', 'him', 'what', 'he', 'did', 'today', 'and', 'got', 'a', 'response', 'like', 'this', '"', 'i', 'was', 'being', 'black', '"', 'daaaa', 'fuck', ',', '#dumbbitch', '#goawayden'], ['hdl', 'my', 'moms', 'just', 'nosey', 'lol']]


In [31]:
stop_words = nltk.corpus.stopwords.words('english') + ["http", 'hdl', 'url'] 
punctuation_words = list(set(string.punctuation)) + [":", ":/"]

def real_unigrams(text):
    real_unigrams = [word for sent in text for word in sent if word.lower() not in stop_words and word not in punctuation_words and is_emoji(word) == 1] 
    real_unigrams_freq = nltk.FreqDist(real_unigrams)
    top_unigrams = real_unigrams_freq.most_common(100)
    return top_unigrams

In [32]:
real_unigrams(tokenized)

[('😂', 49433),
 ('😍', 21021),
 ('😭', 17414),
 ('😃', 17272),
 ('❤', 13025),
 ('😘', 9791),
 ('😕', 8727),
 ('👌', 8540),
 ('😞', 7558),
 ('😏', 7229),
 ('😊', 6989),
 ('😒', 6934),
 ('💕', 6650),
 ('😩', 6641),
 ('😁', 6425),
 ('👏', 5917),
 ('😉', 5874),
 ('☺', 5581),
 ('😔', 5525),
 ('🙌', 5339),
 ('😳', 5017),
 ('🎶', 3603),
 ('🙏', 3410),
 ('😋', 3206),
 ('👍', 3179),
 ('💁', 2892),
 ('🎉', 2848),
 ('✌', 2814),
 ('😑', 2749),
 ('😌', 2717),
 ('😎', 2628),
 ('😡', 2447),
 ('😴', 2277),
 ('🙈', 2262),
 ('😀', 2225),
 ('😱', 2146),
 ('💙', 2125),
 ('💛', 2120),
 ('♥', 2101),
 ('😫', 2078),
 ('😛', 2040),
 ('🔫', 2008),
 ('😜', 1929),
 ('😄', 1898),
 ('💋', 1837),
 ('😢', 1834),
 ('😷', 1784),
 ('🙊', 1740),
 ('♡', 1673),
 ('💜', 1597),
 ('💔', 1539),
 ('✋', 1507),
 ('👊', 1484),
 ('♫', 1473),
 ('💚', 1438),
 ('✨', 1430),
 ('😐', 1426),
 ('🔥', 1338),
 ('💗', 1330),
 ('💦', 1266),
 ('💯', 1238),
 ('🏈', 1231),
 ('😤', 1212),
 ('💃', 1207),
 ('💪', 1181),
 ('😈', 1172),
 ('💖', 1165),
 ('😪', 1156),
 ('😝', 1130),
 ('💘', 1049),
 ('😖', 1039),
 

# Bigram Frequency

In [26]:
bigram_tokenized = [list(tok.tokenize(item)) for item in text_list]

In [35]:
def bigrams(text):
    all_bigrams = [nltk.bigrams(sent) for sent in text]
    all_bigrams = [pair for _list in all_bigrams for pair in list(_list) \
                   if pair[0] not in stop_words and pair[1] not in stop_words \
                  and pair[0] not in punctuation_words and pair[1] not in punctuation_words
                  and is_emoji(pair[0]) == 1 and is_emoji(pair[1]) == 1 
                  and pair[0] != pair[1]]
    
    bi_freq = nltk.FreqDist(all_bigrams)
    top_bigrams = bi_freq.most_common(100)
    return top_bigrams

In [36]:
bigrams(bigram_tokenized)

[(('😂', '😭'), 1546),
 (('😭', '😂'), 1089),
 (('😍', '😘'), 638),
 (('😍', '❤'), 418),
 (('😘', '❤'), 397),
 (('😂', '👏'), 386),
 (('💚', '💛'), 339),
 (('😂', '👌'), 322),
 (('🎉', '🎊'), 300),
 (('🎃', '👻'), 298),
 (('😘', '😍'), 287),
 (('😩', '😭'), 287),
 (('😍', '👌'), 261),
 (('😍', '💕'), 249),
 (('😳', '😂'), 233),
 (('😘', '💕'), 233),
 (('🎉', '🎈'), 218),
 (('😍', '😭'), 216),
 (('😭', '😩'), 206),
 (('😏', '👌'), 203),
 (('😭', '😍'), 203),
 (('💛', '💚'), 196),
 (('💙', '💛'), 188),
 (('🎊', '🎉'), 185),
 (('😩', '😂'), 178),
 (('😍', '🙌'), 176),
 (('😏', '😂'), 174),
 (('😭', '💔'), 174),
 (('💛', '❤'), 170),
 (('😭', '❤'), 170),
 (('😊', '😘'), 164),
 (('❤', '😘'), 162),
 (('💛', '💙'), 159),
 (('♡', '♥'), 155),
 (('😫', '😭'), 154),
 (('😂', '😘'), 154),
 (('❤', '💛'), 152),
 (('😊', '❤'), 152),
 (('🎉', '🎁'), 150),
 (('😂', '😩'), 147),
 (('😍', '😋'), 147),
 (('😍', '☺'), 146),
 (('😂', '😍'), 143),
 (('👊', '💢'), 143),
 (('🍃', '🍂'), 142),
 (('🙏', '🙌'), 140),
 (('😩', '😫'), 140),
 (('😊', '👌'), 138),
 (('👍', '👌'), 135),
 (('👌', '👍'), 129)

## Collocations

In [38]:
text_joined = " ".join(tweet_df.text)

' hdl  hey checkout the website:  url   hdl  😪 i laugh a lot with that line sons of anarchy is back o'

In [41]:
bigram_measures = nltk.collocations.BigramAssocMeasures()

def collocation_bigrams(text):    
    finder = BigramCollocationFinder.from_words(text)
    finder.apply_freq_filter(9)
    bigram_coll = (finder.nbest(bigram_measures.pmi, 500))
    return bigram_coll

In [42]:
collocation_bigrams = collocation_bigrams(text_joined)

In [53]:
[pair for pair in collocation_bigrams if is_emoji(pair[0]) == 1 and is_emoji(pair[1]) == 1]

[('🌖', '🌗'),
 ('🌒', '🌓'),
 ('🌗', '🌘'),
 ('🌓', '🌔'),
 ('💷', '💶'),
 ('🎌', '🎌'),
 ('🚋', '🚋'),
 ('🔻', '🔺'),
 ('🔺', '🔻'),
 ('🎏', '🎏'),
 ('🌑', '🌒'),
 ('🚩', '🚩'),
 ('🌕', '🌖'),
 ('❕', '❕'),
 ('🐈', '🐈'),
 ('📢', '📢'),
 ('🍙', '🍘'),
 ('🎡', '🎠'),
 ('📦', '📦'),
 ('💈', '💈'),
 ('🌔', '🌕'),
 ('💺', '💺'),
 ('🎯', '🎯'),
 ('🐖', '🐖'),
 ('🐜', '🐜'),
 ('🍲', '🍢'),
 ('⚪', '⚪'),
 ('🔕', '🔕'),
 ('🎹', '🎹'),
 ('🚽', '🚽'),
 ('🐑', '🐑'),
 ('🔔', '🔔'),
 ('🔴', '🔵'),
 ('🔶', '🔷'),
 ('🎅', '🎄'),
 ('🍅', '🌽'),
 ('🔵', '🔴'),
 ('❓', '❓'),
 ('🍼', '🍼'),
 ('🐔', '🐔'),
 ('🚓', '🚔'),
 ('🐼', '🐼'),
 ('🍍', '🍍'),
 ('🍗', '🍖'),
 ('🏢', '🏢'),
 ('🐸', '🐸'),
 ('🐋', '🐋'),
 ('📲', '📲'),
 ('🐵', '🐵'),
 ('🐽', '🐽'),
 ('🐍', '🐍'),
 ('🔒', '🔒'),
 ('🚣', '🚣'),
 ('🚔', '🚨'),
 ('👣', '👣'),
 ('📓', '📓'),
 ('🍏', '🍊'),
 ('❌', '❌'),
 ('👟', '👞'),
 ('👞', '👟'),
 ('🍱', '🍣'),
 ('👾', '👾'),
 ('🍭', '🍬'),
 ('🍉', '🍉'),
 ('👇', '👇'),
 ('🐠', '🐟'),
 ('🔶', '⚫'),
 ('🍤', '🍤'),
 ('🎄', '🎅'),
 ('🎓', '🎓'),
 ('🐝', '🐝'),
 ('🐵', '🐒'),
 ('🚿', '🚿'),
 ('🍎', '🍏'),
 ('🎢', '🎡'),
 ('🐰', '🐰'),
 ('⚠', '⚠'),

In [None]:
'''
Future Work:
- look at unigram, bigram, collocations for 1) all text, 2) just text, 3) just emojis, 4) just faces
- POS tag and pull out nouns, adjectives, and verbs

'''