This notebook contains code that performs a dynamic sentiment analysis based on emojis.

Emojis are extracted from tweets, and the tweet text is used to predict the emoji.

In [1]:
import glob
import json
import string

def get_full_text(tweet):
    if 'extended_tweet' in tweet:
        return tweet['extended_tweet']['full_text']
    else:
        return tweet['text']
    
def clean_up_text(text):
    text = text.replace('\n','').replace('\r','').replace('\t','')
    while '  ' in text:
        text = text.replace('  ',' ')
    return text.strip()
        
def get_orig_text(tweet):
    if 'retweeted_status' in tweet:
        return get_orig_text(tweet['retweeted_status'])
    else:
        return clean_up_text(get_full_text(tweet))
    
directory = [f for f in glob.iglob('Downloads/BackLAOut/*')]
i = 0
raw_tweets = []
for filepath in directory:
    if len(raw_tweets) > 1e6: break # comment out when running full dataset

    if (i > 0) and (i % (len(directory)//10) == 0):
        print(str(i)+' of '+str(len(directory))+' files read, '+
              str(len(raw_tweets))+' total tweets')
    file = open(filepath, 'r')
    for line in file:
        tweet = json.loads(line)
        text = get_orig_text(tweet)
        if ('http://' not in text) and ('https://' not in text):
            text = text.translate(str.maketrans('', '', string.punctuation))
            raw_tweets.append(text)
    file.close()
    i += 1
raw_tweets[:10]

2757 of 27579 files read, 959771 total tweets


['x00Al13N because you know its so hard to go to the atm to get cash right',
 'Boys who still kiss your hand or your forehead out of nowhere are ANGELS',
 'Might fuck around and sleep for the rest of the day 🥴',
 'My head is constantly spinning of what to do with my life',
 'Tell me your Animal Crossing Island Name and Ill give you something to watch',
 'I miss going to target for no reason',
 'Brooklyne Thanks friend',
 'davidfrum Cool Now do Iraq Lives Matter Dave',
 'jzimmermann11 CervezaTeresa Hey Ill be there',
 'Won’t stop til I win']

In [2]:
raw_tweets = list(set(raw_tweets))
joined_tweets = ' '.join(raw_tweets)

len(raw_tweets)

574812

In [3]:
from collections import Counter, OrderedDict

raw_word_counts = Counter()
raw_word_list = joined_tweets.split(' ')
for word in raw_word_list:
    raw_word_counts[word] += 1
    
def display_top_10_counts(d):
    odct = OrderedDict(sorted(d.items(), key=lambda x: x[1], reverse=True))
    for i, k in enumerate(list(odct)):
        if i > 9:
            del odct[k]
    return odct

display_top_10_counts(raw_word_counts)

OrderedDict([('I', 104597),
             ('the', 98881),
             ('to', 91704),
             ('a', 81026),
             ('you', 60811),
             ('and', 54297),
             ('is', 53118),
             ('my', 44050),
             ('of', 42734),
             ('in', 39924)])

In [4]:
raw_char_counts = Counter()
raw_char_list = [char for char in joined_tweets]
for char in raw_char_list:
    raw_char_counts[char] += 1

display_top_10_counts(raw_char_counts)

OrderedDict([(' ', 5367665),
             ('e', 2485112),
             ('a', 2063593),
             ('o', 1841567),
             ('t', 1710262),
             ('i', 1560388),
             ('n', 1523182),
             ('s', 1380257),
             ('r', 1235665),
             ('l', 1044883)])

In [49]:
import emoji
import country_list

VALID_EMOJIS=[]
bad=[]
leftovers=sorted(emoji.core.unicode_codes.EMOJI_ALIAS_UNICODE.keys())
countries=[c[1] for c in country_list.countries_for_language('en')]
countries

terms=countries+[e for e in leftovers if 'flag' in e]+\
        [e for e in leftovers if 'regional_indicator' in e]
for t in terms:
    bad += [e for e in leftovers if t in e]
bad = sorted(list(set(bad)))

leftovers = [e for e in leftovers if not e in bad]

terms=['cry','disappoint','evil','face','facing','goblin','grin','head','heart','joy'
       'kiss','laugh','monkey','relieve','satisfied','smil','tears','tongue']
for t in terms:
    VALID_EMOJIS += [e for e in leftovers if t in e]
VALID_EMOJIS = sorted(list(set(VALID_EMOJIS)))

leftovers = [e for e in leftovers if not e in VALID_EMOJIS]

# terms = []
# for e in leftovers:
#     if '_dark_skin_tone:' in e:
#         terms.append(e.replace('_dark_skin_tone:',''))    
# for t in terms:
#     VALID_EMOJIS += [e for e in leftovers if t in e]
# VALID_EMOJIS = sorted(list(set(VALID_EMOJIS)))

VALID_EMOJIS = [emoji.emojize(e) for e in VALID_EMOJIS]

n_classes = len(VALID_EMOJIS)

# leftovers = [e for e in leftovers if not e in VALID_EMOJIS]

# terms=[e for e in leftovers if 'clock' in e]+\
#         [e for e in leftovers if 'arrow' in e]+\
#         [e for e in leftovers if 'key' in e]+\
#         [e for e in leftovers if 'circle' in e]+\
#         [e for e in leftovers if 'square' in e]+\
#         [e for e in leftovers if 'white' in e]+\
#         [e for e in leftovers if 'red' in e]+\
#         [e for e in leftovers if 'orange' in e]+\
#         [e for e in leftovers if 'yellow' in e]+\
#         [e for e in leftovers if 'green' in e]+\
#         [e for e in leftovers if 'blue' in e]+\
#         [e for e in leftovers if 'purple' in e]+\
#         [e for e in leftovers if 'black' in e]
# for i in range(10):
#     terms+=[e for e in leftovers if str(i) in e]

# for t in terms:
#     bad += [e for e in leftovers if t in e]
# bad = sorted(list(set(bad)))

# leftovers = [e for e in leftovers if not e in bad]
# len(VALID_EMOJIS)

raw_emoji_counts = {}
for char in raw_char_counts:
    if len(char) == 1:
        if char in VALID_EMOJIS:
            raw_emoji_counts[char] = raw_char_counts[char]
n_classes,raw_emoji_counts

(244,
 {'😂': 37839,
  '😕': 387,
  '😔': 3930,
  '😘': 2142,
  '🥺': 11082,
  '😭': 18224,
  '💙': 1617,
  '😤': 1230,
  '😛': 248,
  '🤦': 3335,
  '😋': 589,
  '😉': 905,
  '😣': 589,
  '🤣': 11327,
  '😁': 1187,
  '🤢': 685,
  '🥰': 3731,
  '💘': 466,
  '💝': 106,
  '😌': 1146,
  '💞': 630,
  '❤': 9533,
  '😍': 5668,
  '😃': 210,
  '🗣': 601,
  '😞': 897,
  '🤍': 765,
  '😐': 656,
  '🤔': 2163,
  '😡': 1020,
  '🙄': 2213,
  '🥱': 570,
  '💕': 2399,
  '😩': 3402,
  '💔': 1125,
  '😏': 842,
  '😖': 454,
  '😆': 813,
  '🙃': 1392,
  '🥳': 833,
  '👺': 31,
  '🐷': 74,
  '😳': 1913,
  '😫': 1023,
  '🥴': 2791,
  '😢': 1148,
  '🤥': 56,
  '💛': 932,
  '😷': 552,
  '🖤': 1045,
  '😧': 124,
  '😇': 373,
  '😅': 1646,
  '🤡': 776,
  '😈': 594,
  '😬': 802,
  '😊': 1297,
  '🤤': 1023,
  '😓': 433,
  '🤮': 776,
  '❣': 234,
  '💗': 986,
  '☹': 644,
  '🤧': 573,
  '🧐': 449,
  '😎': 824,
  '🥵': 807,
  '👅': 183,
  '🤫': 282,
  '😪': 771,
  '🤓': 212,
  '💓': 672,
  '🙁': 180,
  '🤭': 622,
  '🥶': 118,
  '💜': 1596,
  '♥': 1250,
  '🌞': 81,
  '😒': 1016,
  '🤪': 1168,
 

In [50]:
batch_size = 64
def split_train_valid_test(X,y=[]):
    percent_train = 0.9
    test_size = 50

    train_size = batch_size*int(percent_train*len(X)/batch_size)
    valid_size = len(X) - test_size - train_size
    assert(valid_size > 0)

    X_valid, y_valid = X[:valid_size], y[:valid_size]
    X_train, y_train = X[valid_size:-test_size], y[valid_size:-test_size]
    X_test, y_test = X[-test_size:], y[-test_size:]
#     assert((len(X_train)==len(y_train)) and
#            (len(X_valid)==len(y_valid)) and
#            (len(X_test)==len(y_test)))
    print(len(X_train)/len(X),len(X_valid)/len(X),len(X_test)/len(X),len(X_train)/batch_size)
    if y==[]:
        return X_train,X_valid,X_test
    else:
        return X_train,X_valid,X_test,y_train,y_valid,y_test

In [51]:
raw_tweets_train,raw_tweets_valid,raw_tweets_test = split_train_valid_test(raw_tweets)

0.8999672936542731 0.0999457213836872 8.698496203976257e-05 8083.0


In [52]:
def clean_tweets(X):
    cleaned_tweets = []
    i = 0
    for i, origtweet in enumerate(X):
        i += 1
        if i % (len(X)//10) == 0:
            print(str(10*i/(len(X)//10))+'% read')

        emojis = []
        tweet = ''
        no_emojis = True
        for e in raw_emoji_counts:
            if e in origtweet:
                no_emojis = False
                for char in origtweet:
                    if char == e:
                        tweet += ' '+char+' ' # this is so we can parse out the emoji

                        emojis.append(e)
                    else:
                        tweet += char
        if no_emojis:
            continue

        tweet_cleaned = []
        for word in clean_up_text(tweet).split(' '):
            if word[0] != '@':
                tweet_cleaned.append(word)
        tweet_cleaned = ' '.join(tweet_cleaned)

        cleaned_tweets.append(tweet_cleaned)
        
    return list(set(cleaned_tweets))

In [53]:
cleaned_tweets_train = clean_tweets(raw_tweets_train)
cleaned_tweets_valid = clean_tweets(raw_tweets_valid)
cleaned_tweets_test = clean_tweets(raw_tweets_test)

len(cleaned_tweets_train)

10.0% read
20.0% read
30.0% read
40.0% read
50.0% read
60.0% read
70.0% read
80.0% read
90.0% read
100.0% read
10.0% read
20.0% read
30.0% read
40.0% read
50.0% read
60.0% read
70.0% read
80.0% read
90.0% read
100.0% read
10.0% read
20.0% read
30.0% read
40.0% read
50.0% read
60.0% read
70.0% read
80.0% read
90.0% read
100.0% read


93757

In [54]:
def get_reduced_tweets_and__X_and__y(cleaned_tweets):
    reduced_tweets = []
    _X = []
    _y = []
    not_emojis = 0
    for i, tweet in enumerate(cleaned_tweets):
        emojis = []
        not_emoji = False
        for e in VALID_EMOJIS:
            if 'not '+e in tweet:
                not_emoji = True
        if not_emoji:
            not_emojis += 1
            continue
        for word in tweet.split(' '):
            for e in VALID_EMOJIS:
                if word == e:
                    emojis.append(e)
        # only include tweets that have exactly one of the selected emojis
        if len(set(emojis)) == 1:
            reduced_tweets.append(tweet)
            tweet_stripped = clean_up_text(tweet.replace(emojis[0],'')) # strip out the emoji
            _X.append(tweet_stripped)
            _y.append(emojis)
    return reduced_tweets, _X, _y, not_emojis
reduced_tweets_train, _X_train, _y_train, not_emojis_train = \
        get_reduced_tweets_and__X_and__y(cleaned_tweets_train)
reduced_tweets_valid, _X_valid, _y_valid, not_emojis_valid = \
        get_reduced_tweets_and__X_and__y(cleaned_tweets_valid)
reduced_tweets_test, _X_test, _y_test, not_emojis_test = \
        get_reduced_tweets_and__X_and__y(cleaned_tweets_test)

In [55]:
import numpy as np

# def get_most_frequent_keys(d,n):
#     a = np.array(list(d.values()))
#     thresh = min(a[np.argpartition(a,-n)][-n:])
#     inds = a > 0.9 * thresh
#     d = dict(np.array(list(d.items()))[inds])
#     for k in d:
#         d[k] = int(d[k])
#     odct = OrderedDict(sorted(d.items(),key=lambda x:x[1], reverse=True))
#     return odct

# n_classes = 10
# most_frequent_emojis_train = get_most_frequent_keys(emoji_counts_train, n_classes)
# most_frequent_emojis_valid = get_most_frequent_keys(emoji_counts_valid, n_classes)
# most_frequent_emojis_test = get_most_frequent_keys(emoji_counts_test, n_classes)
# n_classes = len(most_frequent_emojis_train)
# most_frequent_emojis_train,len(most_frequent_emojis_train)

emoji_counts = {}
for e in VALID_EMOJIS:
    emoji_counts[e] = 0

for tweet in cleaned_tweets_train:
    for e in VALID_EMOJIS:
        if e in tweet:
            emoji_counts[e] += 1

In [56]:
# get word counts and vocabulary size
words_list = ' '.join(_X_train+_X_valid+_X_test).split(' ')
word_counts = Counter()
for word in words_list:
    word_counts[word] += 1
    
word_counts = OrderedDict(sorted(word_counts.items()), key=lambda x: x[0], reverse=True)
    
vocabulary_size = len(word_counts)
vocabulary_size, word_counts

(127904,
 OrderedDict([('', 185),
              ('0', 28),
              ('00', 1),
              ('000', 2),
              ('0000', 2),
              ('0000\u200b', 1),
              ('001', 2),
              ('0039', 1),
              ('0040', 1),
              ('0048', 2),
              ('0064103276', 1),
              ('007', 1),
              ('007LolaMento', 1),
              ('007Shakur', 1),
              ('009Kingx', 1),
              ('00CHRISART', 1),
              ('00Hannie', 3),
              ('00L', 1),
              ('00coffee00', 1),
              ('01', 2),
              ('010', 1),
              ('0100', 1),
              ('01101100A', 1),
              ('0113saratan', 1),
              ('0116jay', 3),
              ('021', 1),
              ('0214yoonhong', 2),
              ('021perguntas', 2),
              ('029Triss', 1),
              ('02BTSlove', 1),
              ('02ss08', 3),
              ('03', 4),
              ('030', 1),
              ('031819', 1),
 

In [57]:
from keras.preprocessing import sequence
from keras.utils import to_categorical

def convert_to_one_hot(_y):
    #return enc.transform(np.array(_y).reshape(-1,1)).toarray()
    return to_categorical(_y, num_classes=n_classes)

word2id = {}
for i,word in enumerate(word_counts):
    word2id[word] = i

X_train = [[word2id[word] for word in tweet.split(' ')] for tweet in _X_train]
X_valid = [[word2id[word] for word in tweet.split(' ')] for tweet in _X_valid]
X_test = [[word2id[word] for word in tweet.split(' ')] for tweet in _X_test]

max_words = 140
X_train= sequence.pad_sequences(X_train, maxlen=max_words)
X_valid = sequence.pad_sequences(X_valid, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

emoji2id = {}
for i,e in enumerate(VALID_EMOJIS):
    emoji2id[e] = i
    
y_train = convert_to_one_hot([[emoji2id[e[0]]] for e in _y_train])
y_valid = convert_to_one_hot([[emoji2id[e[0]]] for e in _y_valid])
y_test = convert_to_one_hot([[emoji2id[e[0]]] for e in _y_test])

word2id[''], X_train[:10], y_train[:10]

(0,
 array([[     0,      0,      0, ...,  96385, 113757,  92611],
        [     0,      0,      0, ...,  68890, 115367,  52301],
        [     0,      0,      0, ...,      0,      0, 114296],
        ...,
        [     0,      0,      0, ...,  45135,  92289,  45225],
        [     0,      0,      0, ...,  74639,  91425,  60489],
        [     0,      0,      0, ...,  73482, 109120,  47358]], dtype=int32),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32))

This concludes the data grooming. Run the analysis!

In [58]:
from sklearn.utils import class_weight

y = [emoji2id[e[0]] for e in _y_train]

classes = np.unique(y)

dfsdf = {}
for e in VALID_EMOJIS:
    dfsdf[emoji2id[e]] = emoji2id[e] in classes
    
cw = class_weight.compute_class_weight('balanced', classes=classes, y=y)

class_weights = {}
i = 0
for e in VALID_EMOJIS:
    class_weights[emoji2id[e]] = 0
    if dfsdf[emoji2id[e]]:
        class_weights[emoji2id[e]] = cw[i]
        i += 1

len(classes),class_weights

(161,
 {0: 4.116356107660455,
  1: 18.29491603404647,
  2: 16.46542443064182,
  3: 4.4103815439219165,
  4: 0,
  5: 0,
  6: 12.66571110049371,
  7: 0.7730246211568931,
  8: 22.452851496329757,
  9: 2.72907587248207,
  10: 1.097694962042788,
  11: 0.7232250848012514,
  12: 1.2411123942694842,
  13: 23.52203490091689,
  14: 19.758509316770187,
  15: 2.520218025098238,
  16: 25.998038574697613,
  17: 1.4571172062514888,
  18: 7.84067830030563,
  19: 2.1476640561706724,
  20: 1.8362926874321734,
  21: 493.9627329192547,
  22: 0,
  23: 0,
  24: 0,
  25: 123.49068322981367,
  26: 5.3114272356909105,
  27: 0,
  28: 12.047871534615968,
  29: 0.8031914356410644,
  30: 29.056631348191452,
  31: 0,
  32: 0.9096919574940233,
  33: 0,
  34: 9.685543782730484,
  35: 14.113220940550134,
  36: 1.9071920189932612,
  37: 164.6542443064182,
  38: 1.1733081542025052,
  39: 1.7704757452303035,
  40: 1.6858796345367053,
  41: 0.5838802989589299,
  42: 1.6195499439975563,
  43: 2.3634580522452375,
  44: 2.33

In [59]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

embedding_size=32
model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))

def calc_n_neurons(alpha=2):
    Ni = model.layers[-1].input.shape[-1]
    No = model.layers[-1].output.shape[-1]
    n_neurons = max(int(len(X_train) / (alpha * (Ni + No))), 1)
    print(Ni,No,n_neurons)
    return n_neurons

model.add(LSTM(calc_n_neurons(), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(calc_n_neurons()))
model.add(Dropout(0.2))
model.add(Dense(n_classes, activation='softmax'))

model.summary()

140 4480 8
1120 1120 17
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 140, 32)           4092928   
_________________________________________________________________
lstm_9 (LSTM)                (None, 140, 8)            1312      
_________________________________________________________________
dropout_9 (Dropout)          (None, 140, 8)            0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 17)                1768      
_________________________________________________________________
dropout_10 (Dropout)         (None, 17)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 244)               4392      
Total params: 4,100,400
Trainable params: 4,100,400
Non-trainable params: 0
____________________

In [60]:
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

In [61]:
from keras.callbacks import ModelCheckpoint
filepath = 'temp.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
num_epochs = 25
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size=batch_size,
          epochs=num_epochs, class_weight=class_weights, callbacks=[checkpoint])

Train on 79528 samples, validate on 8779 samples
Epoch 1/25

Epoch 00001: val_accuracy improved from -inf to 0.00342, saving model to temp.hdf5
Epoch 2/25

Epoch 00002: val_accuracy improved from 0.00342 to 0.06800, saving model to temp.hdf5
Epoch 3/25

Epoch 00003: val_accuracy did not improve from 0.06800
Epoch 4/25

Epoch 00004: val_accuracy did not improve from 0.06800
Epoch 5/25

Epoch 00005: val_accuracy did not improve from 0.06800
Epoch 6/25

Epoch 00006: val_accuracy did not improve from 0.06800
Epoch 7/25

Epoch 00007: val_accuracy did not improve from 0.06800
Epoch 8/25

Epoch 00008: val_accuracy did not improve from 0.06800
Epoch 9/25

Epoch 00009: val_accuracy did not improve from 0.06800
Epoch 10/25

Epoch 00010: val_accuracy did not improve from 0.06800
Epoch 11/25

Epoch 00011: val_accuracy did not improve from 0.06800
Epoch 12/25

Epoch 00012: val_accuracy did not improve from 0.06800
Epoch 13/25

Epoch 00013: val_accuracy did not improve from 0.06800
Epoch 14/25

Epoc

KeyboardInterrupt: 

In [None]:
id2emoji = {i:e for e,i in emoji2id.items()}
id2word = {i:word for word,i in word2id.items()}

model.load_weights(filepath)
for i in range(test_size):
    j = np.argmax(model.predict(np.array(X[-i-1])[np.newaxis]))
    print(id2emoji[j], reduced_tweets[-i-1])