This notebook contains code that performs a dynamic sentiment analysis based on emojis.

Emojis are extracted from tweets, and the tweet text is used to predict the emoji.

In [None]:
import glob
import json
import string

def get_full_text(tweet):
    """Return the untruncated text of a tweet dict.

    Uses ``tweet['extended_tweet']['full_text']`` when the tweet carries an
    ``extended_tweet`` entry, and falls back to ``tweet['text']`` otherwise.
    """
    has_extended = 'extended_tweet' in tweet
    return tweet['extended_tweet']['full_text'] if has_extended else tweet['text']
    
def clean_up_text(text):
    """Normalise whitespace in ``text``.

    Newlines, carriage returns and tabs are turned into single spaces (not
    deleted, so words on adjacent lines are not fused together), runs of
    spaces are collapsed to one space, and leading/trailing whitespace is
    stripped.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string; ``''`` if ``text`` holds only whitespace.
    """
    for whitespace_char in ('\n', '\r', '\t'):
        # Replace with a space rather than '' so "day\nlove" stays two words.
        text = text.replace(whitespace_char, ' ')
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text.strip()
        
def get_orig_text(tweet):
    """Return the cleaned full text of the tweet at the end of a retweet chain.

    Follows ``retweeted_status`` entries until a tweet without one is reached,
    then returns that tweet's full text passed through ``clean_up_text``.
    """
    original = tweet
    while 'retweeted_status' in original:
        original = original['retweeted_status']
    return clean_up_text(get_full_text(original))
    
# Read every file in the dump directory (one JSON document per line) and collect
# the cleaned text of each tweet that does not contain a link.
directory = sorted(glob.iglob('Downloads/BackLAOut/*'))
# Report progress roughly every 10% of the files; max(1, ...) prevents a
# modulo-by-zero when the directory holds fewer than 10 files.
report_every = max(1, len(directory)//10)
# Built once instead of per tweet. '@' is deliberately kept: the later cleaning
# step drops tokens that start with '@', which cannot work if it is stripped here.
punctuation_table = str.maketrans('', '', string.punctuation.replace('@', ''))
raw_tweets = []
for i, filepath in enumerate(directory):
    if (i > 0) and (i % report_every == 0):
        print(str(i)+' of '+str(len(directory))+' files read, '+
              str(len(raw_tweets))+' total tweets')
    # The context manager closes the file even if a line fails to parse.
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # blank lines are not valid JSON
            tweet = json.loads(line)
            text = get_orig_text(tweet)
            if ('http://' not in text) and ('https://' not in text):
                raw_tweets.append(text.translate(punctuation_table))
raw_tweets[:10]

In [None]:
# Drop duplicate tweet texts, then glue everything into one space-separated
# string for the corpus-wide word and character counts.
unique_tweets = set(raw_tweets)
raw_tweets = list(unique_tweets)
joined_tweets = ' '.join(raw_tweets)

len(raw_tweets)

In [None]:
from collections import Counter, OrderedDict

# Count every space-separated token in the joined corpus.
raw_word_list = joined_tweets.split(' ')
raw_word_counts = Counter(raw_word_list)
    
def display_top_10_counts(d):
    """Return the ten highest-valued entries of ``d`` as an OrderedDict.

    Entries are ordered from largest to smallest value; entries with equal
    values keep the relative order they had in ``d``. If ``d`` has fewer than
    ten entries, all of them are returned.
    """
    ranked = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
    return OrderedDict(ranked[:10])

# Cell output: the ten most frequent space-separated tokens across all tweets.
display_top_10_counts(raw_word_counts)

In [None]:
# Count every individual character of the joined corpus.
raw_char_list = list(joined_tweets)
raw_char_counts = Counter(raw_char_list)

display_top_10_counts(raw_char_counts)

In [None]:
# Treat every single character whose code point is above 99999 as an emoji and
# keep its corpus count.
# NOTE(review): this is a code-point heuristic, not an emoji table — it looks
# like emoji below that threshold would be missed; confirm it is good enough.
raw_emoji_counts = {
    char: count
    for char, count in raw_char_counts.items()
    if len(char) == 1 and ord(char) > 99999
}

display_top_10_counts(raw_emoji_counts)

In [None]:
# Keep only tweets that contain at least one emoji. Each emoji is surrounded by
# spaces so it becomes its own token, tokens starting with '@' are dropped, and
# emoji_counts records in how many kept tweets each emoji occurs.
cleaned_tweets = []
emoji_counts = Counter()
total_tweets = len(raw_tweets)
# max(1, ...) prevents a modulo-by-zero when there are fewer than 10 tweets.
report_every = max(1, total_tweets // 10)
for i, origtweet in enumerate(raw_tweets, start=1):
    if i % report_every == 0:
        print(f'{100 * i / total_tweets:.1f}% read')

    # Single pass over the characters. Looping over every known emoji and
    # re-appending the whole tweet for each one present would duplicate the
    # text once per distinct emoji and space out only one emoji per copy.
    emojis = set()
    pieces = []
    for char in origtweet:
        if char in raw_emoji_counts:
            pieces.append(' '+char+' ')  # this is so we can parse out the emoji
            emojis.add(char)
        else:
            pieces.append(char)
    if not emojis:
        continue

    words = clean_up_text(''.join(pieces)).split(' ')
    # startswith() is safe even for an empty token, unlike word[0].
    tweet_cleaned = ' '.join(word for word in words if not word.startswith('@'))

    cleaned_tweets.append(tweet_cleaned)

    # Each emoji is counted once per tweet, however often it repeats in it.
    emoji_counts.update(emojis)

# NOTE(review): set() removes duplicates but leaves the order hash-dependent,
# so the later positional train/validation/test split differs between kernels.
cleaned_tweets = list(set(cleaned_tweets))

len(cleaned_tweets), display_top_10_counts(emoji_counts)

In [None]:
import numpy as np

def get_most_frequent_keys(d,n):
    """Select the most frequent keys of ``d`` and return their inverse counts.

    The cutoff is 90% of the n-th largest count: every key whose count is
    strictly greater than that cutoff is kept, so slightly more than ``n``
    keys can be returned when counts are close together. Kept keys appear in
    the same order as in ``d``; ``d`` itself is not modified.

    Args:
        d: mapping of key -> count.
        n: number of top counts that defines the cutoff. Values larger than
            ``len(d)`` are clamped to ``len(d)``.

    Returns:
        dict mapping each kept key (unchanged, original type) to ``1 / count``.
        Empty when ``d`` is empty or ``n <= 0``.
    """
    if not d or n <= 0:
        return {}
    counts = sorted(d.values(), reverse=True)
    # Clamp so that asking for more keys than exist uses the smallest count
    # as the reference instead of raising.
    nth_largest = counts[min(n, len(counts)) - 1]
    cutoff = 0.9 * nth_largest
    return {key: 1 / count for key, count in d.items() if count > cutoff}

# Request the 10 most frequent emojis. The helper selects by a count threshold,
# so the number of classes is taken from what it actually returned.
most_frequent_emojis = get_most_frequent_keys(emoji_counts, 10)
n_classes = len(most_frequent_emojis)
most_frequent_emojis

In [None]:
# Build the model samples: for every cleaned tweet that contains exactly one
# distinct selected emoji, store the tweet, its text without that emoji, and
# the emoji occurrences. Tweets containing "not <emoji>" are skipped and counted.
reduced_tweets, _X, _y = [], [], []
not_emojis = 0
for tweet in cleaned_tweets:
    if any('not '+emoji in tweet for emoji in most_frequent_emojis):
        not_emojis += 1
        continue

    # One entry per token that is exactly one of the selected emojis.
    emojis = [emoji
              for word in tweet.split(' ')
              for emoji in most_frequent_emojis
              if word == emoji]

    # only include tweets that have exactly one of the selected emojis
    if len(set(emojis)) != 1:
        continue

    reduced_tweets.append(tweet)
    without_emoji = tweet.replace(emojis[0], '')  # strip out the emoji
    _X.append(clean_up_text(without_emoji))
    _y.append(emojis)
not_emojis, reduced_tweets[:5], _X[:5]

In [None]:
# get word counts and vocabulary size
words_list = ' '.join(_X).split(' ')
word_counts = Counter(words_list)

# key= and reverse= must be arguments of sorted(). Passed to OrderedDict they
# become two extra entries named 'key' and 'reverse' and the order stays
# alphabetical. Here the vocabulary is ordered by descending count.
word_counts = OrderedDict(sorted(word_counts.items(), key=lambda x: x[1], reverse=True))

vocabulary_size = len(word_counts)
# Show only the head of the vocabulary instead of dumping every word.
vocabulary_size, list(word_counts.items())[:10]

In [None]:
from keras.preprocessing import sequence
from keras.utils import to_categorical

def convert_to_one_hot(_y):
    """One-hot encode class ids with ``to_categorical`` using ``n_classes`` columns."""
    one_hot = to_categorical(_y, num_classes=n_classes)
    return one_hot

# Give every vocabulary word an integer id, in the iteration order of word_counts.
# NOTE(review): ids start at 0 and pad_sequences also pads with 0 by default,
# so padding is indistinguishable from the word with id 0. Reserving 0 for
# padding would also need one more row in the Embedding layer — confirm.
word2id = {word: i for i, word in enumerate(word_counts)}

# Encode each tweet as the list of its word ids, then pad/truncate to max_words.
X = [[word2id[word] for word in tweet.split(' ')] for tweet in _X]

max_words = 140
X = sequence.pad_sequences(X, maxlen=max_words)

emoji2id = {emoji: i for i, emoji in enumerate(most_frequent_emojis)}

# Every entry of _y holds one distinct emoji (possibly repeated); use the first.
y = convert_to_one_hot([[emoji2id[emoji[0]]] for emoji in _y])

# .get() instead of [] so the cell does not fail with KeyError when no sample
# is an empty string; None then means "no empty token in the vocabulary".
word2id.get(''), X[:10], y[:10]

This concludes the data grooming. Run the analysis!

In [None]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense

# Word embedding -> four stacked LSTMs (the first three return the full
# sequence, the last only its final state) -> softmax over the emoji classes.
embedding_size = 100
model = Sequential([
    Embedding(vocabulary_size, embedding_size, input_length=max_words),
    LSTM(100, return_sequences=True),
    LSTM(200, return_sequences=True),
    LSTM(200, return_sequences=True),
    LSTM(100),
    Dense(n_classes, activation='softmax'),
])

model.summary()

In [None]:
# One-hot targets with a softmax output: categorical cross-entropy loss,
# Adam optimizer, accuracy reported during training.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Positional split: [validation | train | test]. The last third of the samples
# is the test set; the training part is sized to a whole number of batches and
# whatever is left at the front becomes the validation set.
percent_validation = 0.2
batch_size = 64

test_size = len(X)//3
tv_size = len(X) - test_size
n_train_batches = int((1-percent_validation)*tv_size/batch_size)
v_size = tv_size - batch_size*n_train_batches

X_valid, X_train, X_test = X[:v_size], X[v_size:tv_size], X[tv_size:]
y_valid, y_train, y_test = y[:v_size], y[v_size:tv_size], y[tv_size:]

# Features and labels must stay aligned in every part.
assert len(X_train) == len(y_train)
assert len(X_valid) == len(y_valid)
assert len(X_test) == len(y_test)
len(X_train)/len(X),len(X_valid)/len(X),len(X_test)/len(X),len(X_train)/batch_size

In [None]:
# Per-class training weights: the inverse frequencies stored in
# most_frequent_emojis, divided by their geometric mean.
class_ids = [emoji2id[emoji] for emoji in most_frequent_emojis]
inverse_freqs = np.array(list(most_frequent_emojis.values()), dtype=float)
normalized = inverse_freqs / np.exp(np.mean(np.log(inverse_freqs)))
# Build the dict explicitly so the keys are plain int class ids; dict() over a
# 2-D float array would turn the ids into np.float64 keys.
class_weight = {int(class_id): float(weight)
                for class_id, weight in zip(class_ids, normalized)}
class_weight

In [None]:
from keras.callbacks import ModelCheckpoint
# Train with the class weights, checkpointing to `filepath` whenever the
# validation accuracy improves (save_best_only with mode='max').
filepath = 'temp.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
num_epochs = 50
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(X_valid, y_valid),
    class_weight=class_weight,
    callbacks=[checkpoint],
)

In [None]:
# Reverse lookups: class id -> emoji and word id -> word.
id2emoji = {i:emoji for emoji,i in emoji2id.items()}
id2word = {i:word for word,i in word2id.items()}

# Restore the checkpointed weights, then print the predicted emoji next to each
# tweet of the held-out tail (the last `test_size` samples), last tweet first.
model.load_weights(filepath)
if test_size > 0:
    # One batched predict() call instead of one call per tweet.
    # len(X) - test_size avoids the X[-0:] pitfall of negative slicing.
    test_start = len(X) - test_size
    predicted_ids = np.argmax(model.predict(X[test_start:]), axis=1)
    for i in range(test_size):
        print(id2emoji[predicted_ids[-i-1]], reduced_tweets[-i-1])