# Bidrectional LSTM with pre-trained Twitter Word Embeddings

Cliche used an ensemble of bidirectional LSTMs along with CNNs to produce state of the art results in Twitter sentiment analysis. He trains initial word embeddings on a large, unlabled corpus of Twitter data using a neural language model. We will instead be using Stanford's pre-trained Glove word embeddings that were specifically trained on Twitter data. Since our training data is not very large, we anticipate that using these pre-trained word embeddings will result in an increase in performance. 

In [2]:
import pandas as pd
import re
import nltk
import string
import os
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import glob, os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [3]:
os.chdir("data/")

Helper methods for reading tweets and cleaning them.

In [43]:
def read_tsv(file_path):
    df = pd.read_table(file_path)
    return df

import string
import re

# code inspired from https://www.kaggle.com/rahulvv/bidirectional-lstm-glove200d


def remove_urls(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)
  
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def split_text(text):
    text = text.split()
    return text

def lower(text):
    text = [word.lower() for word in text]
    return str(text)

def remove_punct(text):
    text = ''.join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', str(text))
    return text

def remove_stopwords(text):
    pattern = re.compile(r'\b('+r'|'.join(stopwords.words('english')) + r')\b\s*')
    text = pattern.sub(' ', text)
    return text

lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    text = lemmatizer.lemmatize(text)
    return text

def clean_tweet(text):
    t0 = remove_urls(text)
    t1 = remove_html(t0)
    t2 = split_text(t1)
    t3 = lower(t2)
    t4 = remove_punct(t3)
    t5 = remove_stopwords(t4)
    t6 = lemmatize_words(t5)
    return t6

In [44]:
tweet_df = pd.DataFrame(columns=['tweet', 'sentiment','NA'])
df_test = pd.DataFrame(columns=['tweet', 'sentiment','NA'])

for file in glob.glob("*.tsv"):
        if 'final_test' in file:
            df_test_cur = read_tsv(file)
            df_test = pd.concat([df_test, df_test_cur])
        else:
            df_train_cur = read_tsv(file)
            tweet_df = pd.concat([tweet_df, df_train_cur])

In [45]:
print(tweet_df[['tweet', 'sentiment']] )

                                                  tweet sentiment
0     05 Beat it - Michael Jackson - Thriller (25th ...   neutral
1     Jay Z joins Instagram with nostalgic tribute t...  positive
2     Michael Jackson: Bad 25th Anniversary Edition ...   neutral
3     I liked a @YouTube video http://t.co/AaR3pjp2P...  positive
4     18th anniv of Princess Diana's death. I still ...  positive
...                                                 ...       ...
1137                     Maybe it was - his - fantasy ?  positive
1138  It was ok , but they always just seem so nervo...  negative
1139  It is streamable from YepRoc -- matter of fact...  positive
1140  comment telling me who you are , or how you fo...  positive
1141  im on myspace ... ill try and find you and add...   neutral

[53368 rows x 2 columns]


In [46]:
print(df_test[['tweet', 'sentiment']] )

                                                   tweet sentiment
0      #ArianaGrande Ari By Ariana Grande 80% Full ht...   neutral
1      Ariana Grande KIIS FM Yours Truly CD listening...  positive
2      Ariana Grande White House Easter Egg Roll in W...  positive
3      #CD #Musics Ariana Grande Sweet Like Candy 3.4...  positive
4      SIDE TO SIDE 😘 @arianagrande #sidetoside #aria...   neutral
...                                                  ...       ...
11901  @dansen17 update: Zac Efron kissing a puppy ht...  positive
11902  #zac efron sex pic skins michelle sex https://...   neutral
11903  First Look at Neighbors 2 with Zac Efron Shirt...   neutral
11904  zac efron poses nude #lovely libra porn https:...   neutral
11905  #Fashion #Style The Paperboy (NEW Blu-ray Disc...   neutral

[11906 rows x 2 columns]


Reading Glove word embeddings into a dictionary.

In [47]:
#preparing train lables
tweet_df.loc[tweet_df.sentiment == "positive", "sentiment"] = 2
tweet_df.loc[tweet_df.sentiment == "neutral", "sentiment"] = 1
tweet_df.loc[tweet_df.sentiment == "negative", "sentiment"] = 0

labels = tweet_df["sentiment"].tolist()
labels = [ int(x) for x in labels ]

#preparing test labels
df_test.loc[df_test.sentiment == "positive", "sentiment"] = 2
df_test.loc[df_test.sentiment == "neutral", "sentiment"] = 1
df_test.loc[df_test.sentiment == "negative", "sentiment"] = 0

labels_test = df_test["sentiment"].tolist()
labels_test = [ int(x) for x in labels_test ]

Converting tweets and labels into lists.

In [48]:
train_tweets = tweet_df.tweet.values
y_train_orig = tweet_df.sentiment.values
test_tweets = df_test.tweet.values

In [49]:
from keras.utils import to_categorical

train_labels = to_categorical(y_train_orig)

clean_training_tweets = []
for i in range(len(train_tweets)):
    data = clean_tweet(train_tweets[i])
    clean_training_tweets.append(data)

clean_testing_tweets = []
for i in range(len(test_tweets)):
    data = clean_tweet(test_tweets[i])
    clean_testing_tweets.append(data)

Checking the tweets after cleaning them.

In [50]:
print(clean_training_tweets[:10])
print(clean_testing_tweets[:10])

[' beat  michael jackson  thriller th anniversary edition hd', 'jay z joins instagram  nostalgic tribute  michael jackson jay z apparently joined instagram  saturday  ', 'michael jackson bad th anniversary edition picture vinyl  unique picture disc vinyl includes  original ', ' liked  youtube video one direction singing man   mirror  michael jackson  atlanta ga june ', 'th anniv  princess dianas death  still want  believe   living   private island away   public  michael jackson', 'oridaganjazz  st time  heard michael jackson sing   honolulu hawaii   restaurant  radio   abc    loved  ', 'michael jackson appeared  saturday    th place   top  miamis trends trndnl', '  old enough  remember michael jackson attending  grammys  brooke shields  webster sat   lap   show', 'etbowser  u enjoy  nd rate michael jackson bit honest ques like  cant feel face song  god   obvious  want mj ', ' weeknd   closest thing  may get  michael jackson   long timeespecially since  damn near mimics everything']
['a

In [20]:
print('Loading word vectors...')
word2vec = {}
with open(os.path.join('../glove/glove.twitter.27B.200d.txt'), encoding = "utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 1193514 word vectors.


In [52]:
# converting tweets to integer sequences 
tokenizer = Tokenizer(num_words= 20000, oov_token= 'OOV')
tokenizer.fit_on_texts(clean_training_tweets)
train_tweet_sequences = tokenizer.texts_to_sequences(clean_training_tweets)
word_index_train = tokenizer.word_index
print('Found %s unique words in train tweets.' % len(word_index_train))
X_train = pad_sequences(sequences=train_tweet_sequences, maxlen=32, padding= 'post', truncating='post')


test_tweet_sequences = tokenizer.texts_to_sequences(clean_testing_tweets)
X_test = pad_sequences(sequences= test_tweet_sequences, maxlen=32, padding='post', truncating='post')

Found 67101 unique words in train tweets.


In [53]:
print('Shape of X train tensor: ', X_train.shape)
print('Shape of X test: ', X_test.shape)

Shape of X train tensor:  (53368, 32)
Shape of X test:  (11906, 32)


In [54]:
num_words = min(20000, len(word_index_train)+1)
embedding_matrix = np.zeros((num_words, 200))

embeddings = []
for word, i in word_index_train.items():
    if i<20000:
        embeddings = word2vec.get(word)
        if embeddings is not None:
            embedding_matrix[i] = embeddings

In [17]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=num_words,output_dim = 200, weights=[embedding_matrix], input_length=32,trainable=False))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(lr=0.01), metrics=['accuracy'])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 200)           4000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 200)           240800    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 32, 64)            59648     
_________________________________________________________________
flatten (Flatten)            (None, 2048)              0         
_________________________________________________________________
dense (Dense)                (None, 3)                 6147      
Total params: 4,306,595
Trainable params: 306,595
Non-trainable params: 4,000,000
_________________________________________________________________


In [19]:
history=model.fit(X_train, train_labels, batch_size=128, epochs=15)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [20]:
pred_p = model.predict(X_test)

In [21]:
pred = (np.round(pred_p)).astype(int)
final_pred = []
for sample in pred:
    pred_label = sample.argmax()
    final_pred.append(pred_label)

In [22]:
y_binary = to_categorical(labels_test)
model.evaluate(x = X_test, y =y_binary )



[1.6070550479709491, 0.58793885]

In [23]:
from sklearn.metrics import classification_report
print(classification_report(labels_test, final_pred))

              precision    recall  f1-score   support

           0       0.58      0.61      0.59      3811
           1       0.62      0.58      0.60      5743
           2       0.51      0.56      0.53      2352

    accuracy                           0.58     11906
   macro avg       0.57      0.58      0.58     11906
weighted avg       0.59      0.58      0.58     11906



In [24]:
# Calling `save('my_model')` creates a SavedModel folder `my_model`.
model.save("bidirectional-lstm")

In [56]:
# It can be used to reconstruct the model identically.
reconstructed_model = keras.models.load_model("bidirectional-lstm")
y_binary = to_categorical(labels_test)
reconstructed_model.evaluate(x = X_test, y =y_binary)



[1.6070550479709491, 0.58793885]

In [60]:
from sklearn.metrics import classification_report
pred_p = reconstructed_model.predict(X_test)

pred = (np.round(pred_p)).astype(int)
final_pred = []
for sample in pred:
    pred_label = sample.argmax()
    final_pred.append(pred_label)
    
print(classification_report(labels_test, final_pred))

              precision    recall  f1-score   support

           0       0.58      0.61      0.59      3811
           1       0.62      0.58      0.60      5743
           2       0.51      0.56      0.53      2352

    accuracy                           0.58     11906
   macro avg       0.57      0.58      0.58     11906
weighted avg       0.59      0.58      0.58     11906

