This notebook contains code that performs a dynamic sentiment analysis based on emojis.

Emojis are extracted from tweets, and the tweet text is used to predict the emoji.

In [1]:
def get_full_text(tweet):
    if 'extended_tweet' in tweet:
        return tweet['extended_tweet']['full_text']
    else:
        return tweet['text']
    
def clean_up_text(text):
    text = text.replace('\n','').replace('\r','').replace('\t','')
    while '  ' in text:
        text = text.replace('  ',' ')
    if text == '': return ''
    if text[0] == ' ':
        text = text[1:]
    if text == '': return ''
    if text[-1] == ' ':
        text = text[:-1]
    return text
        
def get_orig_text(tweet):
    if 'retweeted_status' in tweet:
        return get_orig_text(tweet['retweeted_status'])
    else:
        return clean_up_text(get_full_text(tweet))

In [2]:
import glob
import json

raw_tweets = []
i = 0
directory = [f for f in glob.iglob('Downloads/BackLAOut/*')]
nontest = (2*len(directory))//3
for filepath in directory[:nontest]:
    i += 1
    if i % (nontest//100) == 0:
        print(str(i)+' of '+str(nontest)+' files read, '+
              str(len(raw_tweets))+' total tweets')
    file = open(filepath, 'r')
    for line in file:
        tweet = json.loads(line)
        text = get_orig_text(tweet)
        if ('http://' not in text) and ('https://' not in text):
            raw_tweets.append(text)
    file.close()

183 of 18386 files read, 65325 total tweets
366 of 18386 files read, 129538 total tweets
549 of 18386 files read, 193345 total tweets
732 of 18386 files read, 256653 total tweets
915 of 18386 files read, 319837 total tweets
1098 of 18386 files read, 383943 total tweets
1281 of 18386 files read, 446930 total tweets
1464 of 18386 files read, 510555 total tweets
1647 of 18386 files read, 574677 total tweets
1830 of 18386 files read, 637719 total tweets
2013 of 18386 files read, 700754 total tweets
2196 of 18386 files read, 766050 total tweets
2379 of 18386 files read, 828636 total tweets
2562 of 18386 files read, 892612 total tweets
2745 of 18386 files read, 955234 total tweets
2928 of 18386 files read, 1017960 total tweets
3111 of 18386 files read, 1082883 total tweets
3294 of 18386 files read, 1147382 total tweets
3477 of 18386 files read, 1211147 total tweets
3660 of 18386 files read, 1275046 total tweets
3843 of 18386 files read, 1339239 total tweets
4026 of 18386 files read, 1404534 

KeyboardInterrupt: 

In [23]:
raw_tweets = list(set(raw_tweets))
len(raw_tweets),raw_tweets

(1842492,
 ['@anxhg im down 🤣🤣🤣',
  '@TeaProvider Wow that’s adorable',
  '@Almighty_Doza @its___greg @MiguelMvrtinez There’s a line.... you crossed it',
  '@Weeewong Lhig kamandag ang mga diablo run hahahaha',
  '@Cowboy_Kemani Me and 4 hgs.',
  '@ymercado @DogsandComics Ahhh so cute!!',
  '@funder I think he’s wearing make up. Even around his eyes. Plus, today he’s wearing blush.',
  'I’ve been inside so long that my doctor prescribed me to get more sun lmao',
  'Ok so while I’m here, my rankings:1. The Makings of Me2. Still Standing3. After The Storm4. The Boy is Mine5. New Life',
  'It’s “I can feel my love handles getting bigger” o’clock, over here. What time is it over there?',
  '@KodaiAIO @KodaiSuccess I definitely need Kodai in my life! this is such a hectic time and I need to find a way to make money 😭',
  '@dustin_SFA This is silly 🙃',
  '@marsqo I’m ya #1 👊🏽',
  '@GUCCIERREZ_ BRUH I’ve been waking up late tooooo. I sleep around 8-9am then wake up at 5pm. Always it’s horribl

In [24]:
joined_tweets = ' '.join(raw_tweets)

In [25]:
from collections import Counter, OrderedDict

raw_word_counts = Counter()
raw_word_list = joined_tweets.split(' ')
for word in raw_word_list:
    raw_word_counts[word] += 1

OrderedDict(sorted(raw_word_counts.items(), key=lambda x: x[1], reverse=True))

OrderedDict([('I', 329943),
             ('the', 307292),
             ('to', 281165),
             ('a', 252480),
             ('you', 170033),
             ('and', 168174),
             ('is', 160553),
             ('my', 138158),
             ('of', 130364),
             ('in', 121188),
             ('for', 116488),
             ('me', 101568),
             ('it', 101061),
             ('this', 98609),
             ('that', 93463),
             ('on', 92598),
             ('be', 82103),
             ('i', 75077),
             ('so', 70457),
             ('I’m', 66268),
             ('just', 64083),
             ('like', 63433),
             ('are', 62457),
             ('have', 61549),
             ('with', 58869),
             ('was', 56072),
             ('not', 54080),
             ('but', 53253),
             ('all', 52997),
             ('your', 49882),
             ('do', 47605),
             ('at', 47098),
             ('get', 45402),
             ('up', 42742),
             

In [26]:
raw_char_counts = Counter()
raw_char_list = [char for char in joined_tweets]
for char in raw_word_list:
    raw_char_counts[char] += 1

OrderedDict(sorted(raw_char_counts.items(), key=lambda x: x[1], reverse=True))

OrderedDict([('I', 329943),
             ('the', 307292),
             ('to', 281165),
             ('a', 252480),
             ('you', 170033),
             ('and', 168174),
             ('is', 160553),
             ('my', 138158),
             ('of', 130364),
             ('in', 121188),
             ('for', 116488),
             ('me', 101568),
             ('it', 101061),
             ('this', 98609),
             ('that', 93463),
             ('on', 92598),
             ('be', 82103),
             ('i', 75077),
             ('so', 70457),
             ('I’m', 66268),
             ('just', 64083),
             ('like', 63433),
             ('are', 62457),
             ('have', 61549),
             ('with', 58869),
             ('was', 56072),
             ('not', 54080),
             ('but', 53253),
             ('all', 52997),
             ('your', 49882),
             ('do', 47605),
             ('at', 47098),
             ('get', 45402),
             ('up', 42742),
             

In [27]:
raw_emoji_counts = {}
for char in raw_char_counts:
    if len(char) == 1:
        if ord(char) > 99999:
            raw_emoji_counts[char] = raw_char_counts[char]

OrderedDict(sorted(raw_emoji_counts.items(), key=lambda x: x[1], reverse=True))

OrderedDict([('😂', 25071),
             ('😭', 11780),
             ('🥺', 11377),
             ('😔', 6355),
             ('🤣', 5840),
             ('🥴', 4285),
             ('👀', 3920),
             ('😩', 3853),
             ('😍', 3441),
             ('🥰', 3400),
             ('🙄', 3355),
             ('🤔', 3294),
             ('😳', 2449),
             ('💀', 2436),
             ('🙃', 2387),
             ('😅', 2288),
             ('💯', 2157),
             ('🙏', 2068),
             ('😊', 2064),
             ('🔥', 2024),
             ('💕', 1860),
             ('😒', 1833),
             ('😘', 1792),
             ('😌', 1764),
             ('😢', 1669),
             ('😤', 1581),
             ('😉', 1504),
             ('😞', 1493),
             ('😁', 1452),
             ('🤪', 1343),
             ('😪', 1315),
             ('😎', 1277),
             ('👍', 1201),
             ('😫', 1196),
             ('😬', 1194),
             ('😏', 1193),
             ('😐', 1192),
             ('💙', 1160),
         

In [28]:
cleaned_tweets = []
emoji_counts = Counter()
i = 0
for i, origtweet in enumerate(raw_tweets):
    i += 1
    if i % (len(raw_tweets)//10) == 0:
        print(str(10*i/(len(raw_tweets)//10))+'% read')
        
    emojis = []
    tweet = ''
    no_emojis = True
    for emoji in raw_emoji_counts:
        if emoji in origtweet:
            no_emojis = False
            for char in origtweet:
                if char == emoji:
                    tweet += ' '+char+' ' # this is so we can parse out the emoji

                    emojis.append(emoji)
                else:
                    tweet += char
    if no_emojis:
        continue
                
    tweet_cleaned = []
    for word in clean_up_text(tweet).split(' '):
        if word[0] != '@':
            tweet_cleaned.append(word)
    tweet_cleaned = ' '.join(tweet_cleaned)
    
    cleaned_tweets.append(tweet_cleaned)
    
    for emoji in set(emojis):
        emoji_counts[emoji] += 1
        
cleaned_tweets = list(set(cleaned_tweets))

cleaned_tweets, OrderedDict(sorted(emoji_counts.items(), key=lambda x: x[1], reverse=True))

10.0% read
20.0% read
30.0% read
40.0% read
50.0% read
60.0% read
70.0% read
80.0% read
90.0% read
100.0% read


(['Blue* eyes 😂 😂',
  'Those man boobs 🥵😍 🥴 Those man boobs 🥵 😍 🥴@ChanceOfShade Those man boobs 🥵 😍🥴',
  'be careful stay safe mijo 🙏 🏼',
  '今日GT3勝するっす。あ、明日カラオケ行きやす 🎙',
  'I wanna have my cake and eat it too 😏',
  'Bom dia sextão de paz 🏳 ️',
  'Hope every single one of them has a speedy recovery 🥺',
  'Ok! I’m down 😛',
  'i still can’t believe pied piper is a real song. CHILL WID DAT!!!! 🥴 🥴',
  'I’m trying.. 🤣 🤣',
  'aa obrigada 💙',
  'Another break in. Now they leaving stolen vehicles inside the garage??? 🤬',
  '#TheCoronaChroniclesBesides for working on my creative projects, all I do is eat snacks and day drink 😒 #IJustWantToGoBackToWork',
  'What kind of flowers do you have? 😅',
  'Almost 1 year 💔🏁 💙 Almost 1 year 💔 🏁💙Almost 1 year 💔 🏁 💙',
  'Trump claims theirs no gap and he’s “hearing good things on the ground”, he should go to hospitals and see if they are saying the same 🤔',
  'I can’t wait till I’m breaking colts and patterning them on the barrels with my future husband who’s

In [44]:
import numpy as np

def get_most_frequent_emojis(d,n):
    a = np.array(list(d.values()))
    inds = np.argpartition(a,-n)
    d = dict(np.array(list(d.items()))[inds[-n:]])
    for k in d:
        d[k] = 1 / int(d[k])
    return d

n_classes = 5
most_frequent_emojis = get_most_frequent_emojis(emoji_counts, n_classes)
most_frequent_emojis

{'🙏': 7.142346975216056e-05,
 '🥺': 3.6199095022624436e-05,
 '😂': 1.5176809834572772e-05,
 '😭': 2.9774310724706724e-05,
 '🤣': 5.338173277104575e-05}

In [45]:
reduced_tweets = []
_X = []
_y = []
not_emojis = 0
for i, tweet in enumerate(cleaned_tweets):
    emojis = []
    not_emoji = False
    for emoji in most_frequent_emojis:
        if 'not '+emoji in tweet:
            not_emoji = True
    if not_emoji:
        not_emojis += 1
        continue
    for word in tweet.split(' '):
        for emoji in most_frequent_emojis:
            if word == emoji:
                emojis.append(emoji)
    # only include tweets that have exactly one of the selected emojis
    if len(set(emojis)) == 1:
        reduced_tweets.append(tweet)
        tweet_stripped = clean_up_text(tweet.replace(emojis[0],'')) # strip out the emoji
        _X.append(tweet_stripped)
        _y.append(emojis)
not_emojis, reduced_tweets[:3], _X[:3]

(['Blue* eyes 😂 😂',
  'be careful stay safe mijo 🙏 🏼',
  'Hope every single one of them has a speedy recovery 🥺'],
 ['Blue* eyes',
  'be careful stay safe mijo 🏼',
  'Hope every single one of them has a speedy recovery'])

In [60]:
# get word counts and vocabulary size
words_list = ' '.join(_X).split(' ')
word_counts = Counter()
for word in words_list:
    word_counts[word] += 1
    
word_counts = OrderedDict(sorted(word_counts.items()), key=lambda x: x[1], reverse=True)
    
vocabulary_size = len(word_counts)
vocabulary_size, word_counts

(147052,
 OrderedDict([('', 116),
              ('!', 1019),
              ('!!', 344),
              ('!!!', 261),
              ('!!!!', 77),
              ('!!!!!', 20),
              ('!!!!!!', 7),
              ('!!!!!!!', 6),
              ('!!!!!!!!', 5),
              ('!!!!!!!!!', 2),
              ('!!!!!!!!!!', 3),
              ('!!!!!!!!!!!!!!!!!!!!!!!', 2),
              ('!!!!!!!!!!!🖕Please😩StayYour😖Dumb😠Ass😖HOME🤬!!!!!!!!!!!', 1),
              ('!!!!!!!!!!🏽🏽', 1),
              ('!!!!!!.', 1),
              ('!!!!@PeterAskin1', 1),
              ('!!!!@liashrminie', 1),
              ('!!!!LA', 1),
              ('!!!!✨', 1),
              ('!!!!🍀🍀🍀🍀🍀🍀🍀🍀🍀🍀', 1),
              ('!!!!🍀🍀🍀🍀🍀🍀🍀🍀🍀🍀😎❣@DanneelHarris', 1),
              ('!!!!🏿', 1),
              ('!!!!😡', 1),
              ('!!!!😥', 1),
              ('!!!!😲😲😲😲', 1),
              ('!!!?', 2),
              ('!!!??', 1),
              ('!!!@RepAdamSchiff', 2),
              ('!!!@ariannavargas_', 1),
         

In [68]:
from keras.preprocessing import sequence

word2id = {}
for i,word in enumerate(word_counts):
    word2id[word] = i

X = [[word2id[word] for word in tweet.split(' ')] for tweet in _X]

max_words = 140
X = sequence.pad_sequences(X, maxlen=max_words)

emoji2id = {}
for i,emoji in enumerate(most_frequent_emojis):
    emoji2id[emoji] = i
    
y = [[emoji2id[emoji[0]]] for emoji in _y]

word2id[''], len(X)

(0, 124582)

This concludes the data grooming. Run the analysis!

In [69]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense

embedding_size=100
model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(200, return_sequences=True))
model.add(LSTM(200, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(n_classes, activation='softmax'))

model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 140, 100)          14705200  
_________________________________________________________________
lstm_21 (LSTM)               (None, 140, 100)          80400     
_________________________________________________________________
lstm_22 (LSTM)               (None, 140, 200)          240800    
_________________________________________________________________
lstm_23 (LSTM)               (None, 140, 200)          320800    
_________________________________________________________________
lstm_24 (LSTM)               (None, 100)               120400    
_________________________________________________________________
dense_7 (Dense)              (None, 5)                 505       
Total params: 15,468,105
Trainable params: 15,468,105
Non-trainable params: 0
__________________________________________

In [70]:
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

In [71]:
from keras.utils import to_categorical

def convert_to_one_hot(_y):
    #return enc.transform(np.array(_y).reshape(-1,1)).toarray()
    return to_categorical(_y, num_classes=n_classes)

percent_validation = 0.2
batch_size = 64
valid_size = len(X)-batch_size*int((1-percent_validation)*len(X)/batch_size)
X_valid, y_valid = X[:valid_size], convert_to_one_hot(y[:valid_size])
X_train, y_train = X[valid_size:], convert_to_one_hot(y[valid_size:])
len(X_train),len(X_valid),len(y_train),len(y_valid),X_valid[0], y_valid[0]

(99648,
 24934,
 99648,
 24934,
 array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,   

In [72]:
class_weight = np.array([[emoji2id[k],v] for k,v in most_frequent_emojis.items()])
class_weight[:,1] /= np.linalg.norm(class_weight[:,1],1)
class_weight = dict(class_weight)
class_weight

{0.0: 0.3467909240227382,
 1.0: 0.17576180008117132,
 2.0: 0.07368978186739046,
 3.0: 0.14456677565778472,
 4.0: 0.25919071837091534}

In [73]:
num_epochs = 10
model.fit(X_train, y_train, validation_data=(X_valid, y_valid),
          batch_size=batch_size, epochs=num_epochs, class_weight=class_weight)



Train on 99648 samples, validate on 24934 samples
Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [None]:
id2emoji = {i:emoji for emoji,i in emoji2id.items()}
id2word = {i:word for word,i in word2id.items()}
id2word

In [None]:
for i in range(100):
    j = np.argmax(model.predict(np.array(X[i])[np.newaxis]))
    print(id2emoji[j], reduced_tweets[i])