This notebook contains code that performs a dynamic sentiment analysis based on emojis.

Emojis are extracted from tweets, and the tweet text is used to predict the emoji.

In [1]:
import glob
import json
import string

def get_full_text(tweet):
    if 'extended_tweet' in tweet:
        return tweet['extended_tweet']['full_text']
    else:
        return tweet['text']
    
def clean_up_text(text):
    text = text.replace('\n','').replace('\r','').replace('\t','')
    while '  ' in text:
        text = text.replace('  ',' ')
    return text.strip()
        
def get_orig_text(tweet):
    if 'retweeted_status' in tweet:
        return get_orig_text(tweet['retweeted_status'])
    else:
        return clean_up_text(get_full_text(tweet))
    
directory = [f for f in glob.iglob('Downloads/BackLAOut/*')]
i = 0
raw_tweets = []
for filepath in directory:
    if len(raw_tweets) > 2e5: break # comment out when running full dataset

    if (i > 0) and (i % (len(directory)//10) == 0):
        print(str(i)+' of '+str(len(directory))+' files read, '+
              str(len(raw_tweets))+' total tweets')
    file = open(filepath, 'r')
    for line in file:
        tweet = json.loads(line)
        text = get_orig_text(tweet)
        if ('http://' not in text) and ('https://' not in text):
            text = text.translate(str.maketrans('', '', string.punctuation))
            raw_tweets.append(text)
    file.close()
    i += 1
raw_tweets[:10]

['x00Al13N because you know its so hard to go to the atm to get cash right',
 'Boys who still kiss your hand or your forehead out of nowhere are ANGELS',
 'Might fuck around and sleep for the rest of the day 🥴',
 'My head is constantly spinning of what to do with my life',
 'Tell me your Animal Crossing Island Name and Ill give you something to watch',
 'I miss going to target for no reason',
 'Brooklyne Thanks friend',
 'davidfrum Cool Now do Iraq Lives Matter Dave',
 'jzimmermann11 CervezaTeresa Hey Ill be there',
 'Won’t stop til I win']

In [2]:
raw_tweets = list(set(raw_tweets))
joined_tweets = ' '.join(raw_tweets)

len(raw_tweets)

125074

In [3]:
from collections import Counter, OrderedDict

raw_word_counts = Counter()
raw_word_list = joined_tweets.split(' ')
for word in raw_word_list:
    raw_word_counts[word] += 1
    
def display_top_10_counts(d):
    odct = OrderedDict(sorted(d.items(), key=lambda x: x[1], reverse=True))
    for i, k in enumerate(list(odct)):
        if i > 9:
            del odct[k]
    return odct

display_top_10_counts(raw_word_counts)

OrderedDict([('I', 23154),
             ('the', 22475),
             ('to', 20808),
             ('a', 18527),
             ('you', 13925),
             ('and', 12459),
             ('is', 12283),
             ('my', 9854),
             ('of', 9723),
             ('in', 8916)])

In [4]:
raw_char_counts = Counter()
raw_char_list = [char for char in joined_tweets]
for char in raw_char_list:
    raw_char_counts[char] += 1

display_top_10_counts(raw_char_counts)

OrderedDict([(' ', 1193069),
             ('e', 551229),
             ('a', 451274),
             ('o', 407634),
             ('t', 381024),
             ('i', 345602),
             ('n', 338037),
             ('s', 306044),
             ('r', 273016),
             ('l', 230014)])

In [5]:
batch_size = 64
def split_train_valid_test(X,y=[]):
    percent_train = 0.9
    test_size = 50

    train_size = batch_size*int(percent_train*len(X)/batch_size)
    valid_size = len(X) - test_size - train_size
    assert(valid_size > 0)

    X_valid, y_valid = X[:valid_size], y[:valid_size]
    X_train, y_train = X[valid_size:-test_size], y[valid_size:-test_size]
    X_test, y_test = X[-test_size:], y[-test_size:]
#     assert((len(X_train)==len(y_train)) and
#            (len(X_valid)==len(y_valid)) and
#            (len(X_test)==len(y_test)))
    print(len(X_train)/len(X),len(X_valid)/len(X),len(X_test)/len(X),len(X_train)/batch_size)
    if y==[]:
        return X_train,X_valid,X_test
    else:
        return X_train,X_valid,X_test,y_train,y_valid,y_test

In [6]:
raw_tweets_train,raw_tweets_valid,raw_tweets_test = split_train_valid_test(raw_tweets)

0.8995634584326079 0.10003677822728944 0.0003997633401026592 1758.0


In [7]:
emoji_sentiment = {'☹': -1,
                   '☺':  1,
                   '♥':  1,
                   '❣':  1,
                   '❤':  1,
                   '🐱':  1,
                   '🐵':  0,
                   '👅':  1,
                   '👺': -1,
                   '👿': -1,
                   '💑':  1,
                   '💓':  1,
                   '💔': -1,
                   '💕':  1,
                   '💖':  1,
                   '💗':  1,
                   '💘':  1,
                   '💙': -1,
                   '💚':  1,
                   '💛':  1,
                   '💜':  1,
                   '💝':  1,
                   '💞':  1,
                   '💟':  1,
                   '🖤': -1,
                   '😀':  1,
                   '😁':  0,
                   '😂':  0,
                   '😃':  1,
                   '😄':  1,
                   '😅':  0,
                   '😆':  0,
                   '😇':  1,
                   '😈':  1,
                   '😉':  1,
                   '😊':  1,
                   '😋':  1,
                   '😌':  1,
                   '😍':  1,
                   '😎':  1,
                   '😏':  1,
                   '😐':  0,
                   '😑': -1,
                   '😒':  0,
                   '😓': -1,
                   '😔': -1,
                   '😕': -1,
                   '😖': -1,
                   '😗':  1,
                   '😘':  1,
                   '😙':  1,
                   '😚':  1,
                   '😛':  1,
                   '😜':  1,
                   '😝':  1,
                   '😞': -1,
                   '😟': -1,
                   '😠': -1,
                   '😡': -1,
                   '😢': -1,
                   '😣': -1,
                   '😤': -1,
                   '😥': -1,
                   '😦': -1,
                   '😧': -1,
                   '😨': -1,
                   '😩': -1,
                   '😪':  0,
                   '😫': -1,
                   '😬':  0,
                   '😭': -1,
                   '😮':  0,
                   '😯':  0,
                   '😰': -1,
                   '😱':  0,
                   '😲': -1,
                   '😳': -1,
                   '😴':  0,
                   '😵': -1,
                   '😶':  0,
                   '😷':  0,
                   '😸':  1,
                   '😹':  0,
                   '😺':  1,
                   '😻':  1,
                   '😼':  1,
                   '😽':  1,
                   '😾': -1,
                   '😿': -1,
                   '🙀':  0,
                   '🙁': -1,
                   '🙂':  1,
                   '🙃':  1,
                   '🙄':  0,
                   '🙈':  0,
                   '🙉':  0,
                   '🙊':  0,
                   '🤍':  1,
                   '🤎':  1,
                   '🤐':  0,
                   '🤑':  1,
                   '🤒': -1,
                   '🤓':  0,
                   '🤔':  0,
                   '🤕': -1,
                   '🤖':  0,
                   '🤗':  1,
                   '🤛':  1,
                   '🤛🏻':  1,
                   '🤛🏼':  1,
                   '🤛🏽':  1,
                   '🤛🏾':  1,
                   '🤛🏿':  1,
                   '🤜':  1,
                   '🤜🏻':  1,
                   '🤜🏼':  1,
                   '🤜🏽':  1,
                   '🤜🏾':  1,
                   '🤜🏿':  1,
                   '🤠':  1,
                   '🤡':  1,
                   '🤢': -1,
                   '🤣':  0,
                   '🤤':  1,
                   '🤥': -1,
                   '🤦': -1,
                   '🤦🏻': -1,
                   '🤦🏼': -1,
                   '🤦🏽': -1,
                   '🤦🏾': -1,
                   '🤦🏿': -1,
                   '🤧':  0,
                   '🤨':  0,
                   '🤪':  0,
                   '🤫':  0,
                   '🤬': -1,
                   '🤭':  1,
                   '🤮': -1,
                   '🤯':  0,
                   '🥰':  1,
                   '🥱':  0,
                   '🥳':  1,
                   '🥴':  1,
                   '🥵': -1,
                   '🥶': -1,
                   '🥺': -1,
                   '🧐':  0,
                   '🧡':  1}
def clean_tweets(X):
    cleaned_tweets = []
    i = 0
    for i, origtweet in enumerate(X):
        i += 1
        if i % (len(X)//10) == 0:
            print(str(10*i/(len(X)//10))+'% read')

        emojis = []
        tweet = ''
        no_emojis = True
        for e in emoji_sentiment:
            if e in origtweet:
                no_emojis = False
                for char in origtweet:
                    if char == e:
                        tweet += ' '+char+' ' # this is so we can parse out the emoji

                        emojis.append(e)
                    else:
                        tweet += char
        if no_emojis:
            continue

        tweet_cleaned = []
        for word in clean_up_text(tweet).split(' '):
            if word[0] != '@':
                tweet_cleaned.append(word)
        tweet_cleaned = ' '.join(tweet_cleaned)

        cleaned_tweets.append(tweet_cleaned)
        
    return list(set(cleaned_tweets))

In [8]:
cleaned_tweets_train = clean_tweets(raw_tweets_train)
cleaned_tweets_valid = clean_tweets(raw_tweets_valid)
cleaned_tweets_test = clean_tweets(raw_tweets_test)

len(cleaned_tweets_train)

10.0% read
20.0% read
30.0% read
40.0% read
50.0% read
60.0% read
70.0% read
80.0% read
90.0% read
100.0% read
10.0% read
20.0% read
30.0% read
40.0% read
50.0% read
60.0% read
70.0% read
80.0% read
90.0% read
100.0% read
10.0% read
20.0% read
30.0% read
40.0% read
50.0% read
60.0% read
70.0% read
80.0% read
90.0% read
100.0% read


19496

In [9]:
def get__X_and__y(cleaned_tweets):
    _X = []
    _y = []
    not_emojis = 0
    for i, tweet in enumerate(cleaned_tweets):
        sentiments = []
        not_emoji = False
        for e in emoji_sentiment:
            if 'not '+e in tweet:
                not_emoji = True
        if not_emoji:
            not_emojis += 1
            continue
        emojis = []
        for word in tweet.split(' '):
            for e in emoji_sentiment:
                if word == e:
                    sentiments.append(emoji_sentiment[e])
                    emojis.append(e)
        # only include tweets that have exactly one of the selected emojis
        if len(set(sentiments)) == 1:
            for e in emojis:
                tweet = tweet.replace(e,'')
            _X.append(clean_up_text(tweet))
            _y.append(sentiments[0])
    return _X, _y, not_emojis
_X_train, _y_train, not_emojis_train = get__X_and__y(cleaned_tweets_train)
_X_valid, _y_valid, not_emojis_valid = get__X_and__y(cleaned_tweets_valid)
_X_test, _y_test, not_emojis_test = get__X_and__y(cleaned_tweets_test)

In [10]:
import numpy as np

# def get_most_frequent_keys(d,n):
#     a = np.array(list(d.values()))
#     thresh = min(a[np.argpartition(a,-n)][-n:])
#     inds = a > 0.9 * thresh
#     d = dict(np.array(list(d.items()))[inds])
#     for k in d:
#         d[k] = int(d[k])
#     odct = OrderedDict(sorted(d.items(),key=lambda x:x[1], reverse=True))
#     return odct

# n_classes = 10
# most_frequent_emojis_train = get_most_frequent_keys(emoji_counts_train, n_classes)
# most_frequent_emojis_valid = get_most_frequent_keys(emoji_counts_valid, n_classes)
# most_frequent_emojis_test = get_most_frequent_keys(emoji_counts_test, n_classes)
# n_classes = len(most_frequent_emojis_train)
# most_frequent_emojis_train,len(most_frequent_emojis_train)

emoji_counts = {-1:0,0:0,1:0}

for i in _y_train:
    emoji_counts[i] += 1
emoji_counts

{-1: 6183, 0: 6202, 1: 5340}

In [11]:
# get word counts and vocabulary size
words_list = ' '.join(_X_train+_X_valid+_X_test).split(' ')
word_counts = Counter()
for word in words_list:
    word_counts[word] += 1
    
word_counts = OrderedDict(sorted(word_counts.items()), key=lambda x: x[0], reverse=True)
    
vocabulary_size = len(word_counts)
vocabulary_size, word_counts

(40356,
 OrderedDict([('', 77),
              ('0', 9),
              ('000', 1),
              ('0000', 1),
              ('0116jay', 1),
              ('02BTSlove', 1),
              ('02ss08', 1),
              ('03', 1),
              ('03malli02', 1),
              ('03smk', 1),
              ('04', 1),
              ('0415Kamen', 1),
              ('05', 1),
              ('050246135042', 3),
              ('0818Yukichi', 2),
              ('08monica20', 1),
              ('08rcarvalho', 1),
              ('0SweetSolace0', 1),
              ('0Swixi0', 1),
              ('0VNSC', 1),
              ('0hMeli', 1),
              ('0l0lD', 1),
              ('0toEli', 2),
              ('1', 73),
              ('10', 52),
              ('100', 20),
              ('1000', 5),
              ('10000', 1),
              ('100000', 1),
              ('10000Hours', 1),
              ('1000PM', 1),
              ('1000mg', 1),
              ('1000pm', 1),
              ('10010💎', 1),
      

In [12]:
from keras.preprocessing import sequence
from keras.utils import to_categorical

def convert_to_one_hot(_y):
    #return enc.transform(np.array(_y).reshape(-1,1)).toarray()
    return to_categorical(_y, num_classes=3)

word2id = {}
for i,word in enumerate(word_counts):
    word2id[word] = i

X_train = [[word2id[word] for word in tweet.split(' ')] for tweet in _X_train]
X_valid = [[word2id[word] for word in tweet.split(' ')] for tweet in _X_valid]
X_test = [[word2id[word] for word in tweet.split(' ')] for tweet in _X_test]

max_words = 140
X_train= sequence.pad_sequences(X_train, maxlen=max_words)
X_valid = sequence.pad_sequences(X_valid, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

y_train = convert_to_one_hot(_y_train)
y_valid = convert_to_one_hot(_y_valid)
y_test = convert_to_one_hot(_y_test)

X_train[:10], y_train[:10]

Using TensorFlow backend.


(array([[    0,     0,     0, ..., 15298, 36197, 31727],
        [    0,     0,     0, ..., 19593, 34494, 33465],
        [    0,     0,     0, ..., 18872, 22033, 23099],
        ...,
        [    0,     0,     0, ..., 19973, 21466, 38916],
        [    0,     0,     0, ...,     0, 33525, 22323],
        [    0,     0,     0, ...,     0,     0, 22996]], dtype=int32),
 array([[0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 0., 1.]], dtype=float32))

This concludes the data grooming. Run the analysis!

In [13]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced', classes=np.arange(3)-1, y=_y_train)

class_weights

array([0.95557712, 0.95264968, 1.10642946])

In [14]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

embedding_size=32
model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))

def calc_n_neurons():
    alpha = np.log(len(X_train))
    Ni = model.layers[-1].input.shape[-1]
    No = model.layers[-1].output.shape[-1]
    n_neurons = max(int(2 * len(X_train) / (alpha * (Ni + No))), 1)
    print(alpha,Ni,No,n_neurons)
    return n_neurons

model.add(LSTM(calc_n_neurons(), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(calc_n_neurons()))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))

model.summary()

9.782731351400328 140 32 21
9.782731351400328 21 21 86
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 140, 32)           1291392   
_________________________________________________________________
lstm_1 (LSTM)                (None, 140, 21)           4536      
_________________________________________________________________
dropout_1 (Dropout)          (None, 140, 21)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 86)                37152     
_________________________________________________________________
dropout_2 (Dropout)          (None, 86)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 261       
Total params: 1,333,341
Trainable params: 1,333,341
Non-trainable

In [15]:
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

In [16]:
from keras.callbacks import ModelCheckpoint
filepath = 'temp.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
num_epochs = 25
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), batch_size=batch_size,
          epochs=num_epochs, class_weight=class_weights, callbacks=[checkpoint])



Train on 17725 samples, validate on 2065 samples
Epoch 1/25

Epoch 00001: val_accuracy improved from -inf to 0.48184, saving model to temp.hdf5
Epoch 2/25

Epoch 00002: val_accuracy improved from 0.48184 to 0.51429, saving model to temp.hdf5
Epoch 3/25

Epoch 00003: val_accuracy did not improve from 0.51429
Epoch 4/25

Epoch 00004: val_accuracy did not improve from 0.51429
Epoch 5/25

Epoch 00005: val_accuracy did not improve from 0.51429
Epoch 6/25

Epoch 00006: val_accuracy did not improve from 0.51429
Epoch 7/25
 4096/17725 [=====>........................] - ETA: 32s - loss: 0.0516 - accuracy: 0.9839

KeyboardInterrupt: 

In [21]:
id2word = {i:word for word,i in word2id.items()}

model.load_weights(filepath)
for i,x in enumerate(X_test):
    j = np.argmax(model.predict(np.array(x)[np.newaxis]))-1
    print(j, cleaned_tweets_test[i])

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      