## Tweet Emotion Classification with LSTM
Hao Wang (haowang@ece.utoronto.ca)
April 26, 2018 

In [10]:
import pandas as pd
import numpy as np
import os, sys
import csv
pd.set_option('display.max_colwidth', -1)

### Prepare the data

In [11]:
data = pd.read_csv('../data/parsed/dev-labeled.csv')

In [12]:
emo_to_label = {
    'anger': 0, 
    'fear': 1, 
    'joy': 2, 
    'love': 3, 
    'sadness': 4, 
    'surprise': 5, 
    'thankfulness': 6 
}

C = len(emo_to_label)

In [14]:
data['label'] = data['emotion'].apply(lambda emo: emo_to_label[emo])

In [15]:
m = data.shape[0]

### Preprocessing

#### Cleaning:
1. Remove @userid
2. Remove hashtags
3. Emoji
4. URL
5. Remove non-English tweets by langid library

#### Tokenization & Normalization
1. Penn Treebank tokenization

In [16]:
import re
from nltk.tokenize.treebank import TreebankWordTokenizer

In [17]:
# clean
def preprocess_tweet(tweet):        
#     # all caps
#     allcaps_regex = re.compile(r"([^a-z0-9()<>'`\-]){2,}")     
#     tweet = re.sub(allcaps_regex, '\1' + ' <allcaps> ', tweet)   
    
    # lowercase
    tweet = tweet.lower()
    
    slash_regex = re.compile(r"/")
    user_regex = re.compile(r"@[\S]+")
    hash_regex = re.compile(r"#(\w+)")
    url_regex = re.compile(r"(http|https|ftp)://[a-zA-Z0-9\./]+")
    
    emoji_heart_regex = re.compile(r"<3")
    emoji_smile1_regex = re.compile(r"[8:=;]['`\-]?[)d]+|[)d]+['`\-]?[8:=;]")
    emoji_smile2_regex = re.compile(r"\^(_|\.)\^")
    emoji_lol_regex = re.compile(r"[8:=;]['`\-]?p+")
    emoji_sad1_regex = re.compile(r"[8:=;]['`\-]?\(+|\)+['`\-]?[8:=;]")
    emoji_sad2_regex = re.compile(r">(_|.)<")
    emoji_neutral_regex = re.compile(r"[8:=;]['`\-]?[\/|l*]")

    number_regex = re.compile(r"[-+]?[.\d]*[\d]+[:,.\d]*")
    
    # repeating punctuations 
    rpt_punc_regex = re.compile(r"([!?.])\1{1,}")
    # repeating words like hurrrryyyyyy
    rpt_word_regex = re.compile(r"\b(\S*?)(.)\2{2,}\b", re.IGNORECASE)

    tweet = re.sub(url_regex, ' <url> ', tweet)
    tweet = re.sub(slash_regex, ' / ', tweet)
    tweet = re.sub(user_regex, ' <user> ', tweet)
    tweet = re.sub(hash_regex, ' <hashtag> ', tweet)
    
    tweet = re.sub(emoji_heart_regex, ' <heart> ', tweet)
    tweet = re.sub(emoji_smile1_regex, ' <smile> ', tweet)
    tweet = re.sub(emoji_smile2_regex, ' <smile> ', tweet)
    tweet = re.sub(emoji_lol_regex, ' <lolface> ', tweet)
    tweet = re.sub(emoji_sad1_regex, ' <sadface> ', tweet)
    tweet = re.sub(emoji_sad2_regex, ' <sadface> ', tweet)
    tweet = re.sub(emoji_neutral_regex, ' <neutralface> ', tweet)
    
    tweet = re.sub(number_regex, ' <number> ', tweet)

    tweet = re.sub(rpt_punc_regex, r' \1' + ' <repeat> ', tweet)
    tweet = re.sub(rpt_word_regex, r'\1' + r'\2' + ' <elong> ', tweet)
    
    # split punctuation and words
    word_bound_regex = re.compile(r"(\w+)([.,!,?]+)")
    tweet = re.sub(word_bound_regex, r'\1' + r' \2', tweet)
    
    tokenizer = TreebankWordTokenizer()
    # to keep <> special word 
    tokenizer.PARENS_BRACKETS = (re.compile(r'[\]\[\(\)\{\}]'), r' \g<0> ')
    tweet_toks = tokenizer.tokenize(tweet, convert_parentheses=False)
    
    return tweet_toks

In [18]:
s = "damn hope i don't get out of all https://haow.ca these <3 problems coming into my future #worried"
s = "Cold...Soar throat...Cofee...#feelinSick...#Sad"
s = "Getting REALLY excited about @ixdconf. Passport is processing, flights are scheduled, & living is booked. What to do? #Excited #Awesomesauce"
s = "Have a good night champ (: @JohnCena u were amazing as always #proud &lt;3"
# s = "Done! I'm GOING home..#happiness"
preprocess_tweet(s)

['have',
 'a',
 'good',
 'night',
 'champ',
 '(',
 ':',
 '<user>',
 'u',
 'were',
 'amazing',
 'as',
 'always',
 '<hashtag>',
 '&',
 'lt',
 ';',
 '<number>']

Skip the next step -- filtering non-English tweets. Our vocabulary has multiple languages

In [37]:
# filter non-English tweets
import langid
EN_THRESHOLD = -420
langid.set_languages(['en'])
# langid.classify("yeah you should! And me too from high school!")
# langid.classify("dors biien en tout cas merci d'etre la pour nous t'est genial tout simplement jtai kiffé dans l'episode d'aujourd'hui")
# langid.classify("I understand that they want us to feel something and be motivated to donate but do all the Bernados adverts have to be traumatising? #sad")
data[data['tweet'].apply(lambda t: langid.classify(t)[1] < EN_THRESHOLD)]

('en', -549.9064557552338)

In [19]:
data['tweet_tk'] = data['tweet'].apply(lambda t: preprocess_tweet(t))

In [20]:
# data = data.drop(['tweet_vt', 'tweet_tk'], axis=1)

### Vectorizing

We use GloVe, a global vectors for word representation (https://nlp.stanford.edu/projects/glove/).

Dataset: Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors, 1.42 GB download): glove.twitter.27B.zip

In [21]:
GLOVE_FILE = '/mnt/tweets/glove/glove.twitter.27B.200d.txt'
dim = 200

def read_glove_vecs(glove_file):
    with open(glove_file, 'r',encoding='UTF-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(GLOVE_FILE)

In [22]:
word_to_index['<unknown>']

1998

In [23]:
def tk_tweet_to_indices(tweet):    
    return [word_to_index.get(w, word_to_index['<unknown>']) for w in tweet]

In [24]:
data['tweet_vt'] = data['tweet_tk'].apply(lambda t: tk_tweet_to_indices(t))

In [25]:
data.head()

Unnamed: 0,tid,tweet,emotion,label,tweet_tk,tweet_vt
0,137144184007180288,i came to a realization that i am happiest i have been in a very long time. i got those two nigas i love ;) @rachelpazz & myboy;) #happy.,joy,2,"[i, came, to, a, realization, that, i, am, happiest, i, have, been, in, a, very, long, time, ., i, got, those, two, nigas, i, love, <smile>, <user>, &, myboy, <smile>, <hashtag>, .]","[266800, 93727, 607686, 2114, 509441, 601404, 266800, 22926, 249553, 266800, 251959, 60650, 273725, 2114, 639335, 346612, 605074, 1818, 266800, 235699, 603199, 621962, 411912, 266800, 348020, 1983, 2002, 1635, 393756, 1983, 1940, 1818]"
1,147516714798678016,"Getting REALLY excited about @ixdconf. Passport is processing, flights are scheduled, & living is booked. What to do? #Excited #Awesomesauce",joy,2,"[getting, really, excited, about, <user>, passport, is, processing, ,, flights, are, scheduled, ,, &, living, is, booked, ., what, to, do, ?, <hashtag>, <hashtag>]","[228211, 509497, 195367, 4800, 2002, 466425, 283379, 493890, 1736, 209209, 34877, 539424, 1736, 1635, 343883, 283379, 78345, 1818, 652233, 607686, 162203, 2039, 1940, 1940]"
2,149410954986270720,1st Driving lesson in a minute #excited,joy,2,"[<number>, st, driving, lesson, in, a, minute, <hashtag>]","[1964, 572545, 167196, 339063, 273725, 2114, 380171, 1940]"
3,142720791077863424,Have a good night champ (: @JohnCena u were amazing as always #proud &lt;3,joy,2,"[have, a, good, night, champ, (, :, <user>, u, were, amazing, as, always, <hashtag>, &, lt, ;, <number>]","[251959, 2114, 234424, 412044, 103498, 1663, 1837, 2002, 623838, 651479, 23586, 37723, 22567, 1940, 1635, 348857, 1887, 1964]"
4,135950336207761409,"@topmodel_29 HOLD UP flag on the play...i feel some type of way.I've called you twice, once on your #, left 2 vm and you tweeting smh #hurt",sadness,4,"[<user>, hold, up, flag, on, the, play, ., <repeat>, i, feel, some, type, of, way, .i, 've, called, you, twice, ,, once, on, your, #, ,, left, <number>, vm, and, you, tweeting, smh, <hashtag>]","[2002, 260403, 629676, 208533, 451193, 601626, 482461, 1818, 1973, 266800, 203433, 566116, 622375, 446382, 649583, 1998, 1662, 93021, 668737, 620959, 1736, 451328, 451193, 669224, 3, 1736, 337252, 1964, 643248, 26337, 668737, 620565, 562443, 1940]"


### Padding



In [26]:
from keras.preprocessing.sequence import pad_sequences
max_len = max(data['tweet_vt'].apply(lambda t: len(t)))

X_indices = pad_sequences(data['tweet_vt'], max_len, padding='post')

In [35]:
X_indices.shape

(138533, 121)

### Embedding

In [28]:
np.random.seed(0)
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

In [29]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    vocab_len = len(word_to_index) + 1                  
    emb_dim = word_to_vec_map["cucumber"].shape[0]      
        
    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    embedding_layer = Embedding(vocab_len, emb_dim, trainable=True)
    embedding_layer.build((None,))
    
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [30]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

### 2-layer LSTM Model

In [31]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation

In [45]:
def TweetEmotion(input_shape, word_to_vec_map, word_to_index):
    
    sentence_indices = Input(shape=input_shape, dtype='int32')
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a batch of sequences.
    X = LSTM(128, return_sequences=True)(embeddings)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a single hidden state, not a batch of sequences.
    X = LSTM(128, return_sequences=False)(X)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X through a Dense layer with softmax activation to get back a batch of 5-dimensional vectors.
    X = Dense(C, activation='softmax')(X)
    # Add a softmax activation
    X = Activation('softmax')(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(sentence_indices, X)
        
    return model

In [46]:
model = TweetEmotion((max_len,), word_to_vec_map, word_to_index)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 121)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 121, 200)          238702800 
_________________________________________________________________
lstm_3 (LSTM)                (None, 121, 128)          168448    
_________________________________________________________________
dropout_3 (Dropout)          (None, 121, 128)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 903       
__________

In [47]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [89]:
def convert_to_one_hot(Y, C=7):
    m = Y.shape[0]
    oh_matrix = np.zeros((m,7))
    
    for i, v in data['label'].iteritems():
        oh_matrix[i][v] = 1    
    
    return oh_matrix

In [90]:
Y_oh = convert_to_one_hot(data['label'])

In [98]:
data_train_ratio = 0.8
m = X_indices.shape[0]
X_train_indices = X_indices[:round(m*data_train_ratio)]
Y_train_oh = Y_oh[:round(m*data_train_ratio)]

In [None]:
model.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 32, shuffle=True)

  "This may consume a large amount of memory." % num_elements)


Epoch 1/50
 16608/110826 [===>..........................] - ETA: 1:00:13 - loss: 1.8521 - acc: 0.2786