https://www.kaggle.com/crowdflower/twitter-airline-sentiment

#### Import modules

In [1]:
%matplotlib inline

In [2]:
from IPython.display import display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, sys, urllib
from tqdm import tqdm_notebook
from zipfile import ZipFile

from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

np.random.seed(0)

Using TensorFlow backend.


In [3]:
# Load the preprocessed tweets, using the first CSV column as the index.
tweets = pd.read_csv(
    'Tweets_preprocess.csv',
    index_col=0,
)

In [4]:
# Hold out 10% of the tweets (cleaned text and sentiment label) for testing.
train_X, test_X, train_Y, test_Y = train_test_split(
    tweets['text_clean'],
    tweets['sentiment'],
    test_size=0.1,
)

In [5]:
train_X.shape, train_Y.shape

((13176,), (13176,))

In [6]:
test_X.shape, test_Y.shape

((1464,), (1464,))

In [7]:
# Padded sequence length: the largest number of words in any tweet.
# Measured in words (not characters) and across both splits, so every row
# fits into the fixed-width index matrix built later.
maxLen = max(len(text.split()) for split in (train_X, test_X) for text in split)

# 4. Model - RNN

## 4.1 Load pre-trained word embedding 
Because word embeddings are computationally expensive to train, I will use 50-dimensional GloVe vectors, a pre-trained set of word embeddings, to represent words.

**References to**:
- The GloVe word embeddings were due to Jeffrey Pennington, Richard Socher, and Christopher D. Manning. (https://nlp.stanford.edu/projects/glove/)

In [8]:
# Unpack the GloVe archive into the working directory, but only when the
# extracted text file is not already there.
if not os.path.exists('glove.6B.50d.txt'):
    with ZipFile('glove.6B.50d.txt.zip', mode='r') as zipObj:
        zipObj.extractall()

In [9]:
def read_glove_vecs(glove_file):
    """
    Load word vectors from a whitespace-separated GloVe text file.

    Each non-empty line holds a token followed by its vector components.
    Blank lines are skipped. If a token occurs more than once, the last
    occurrence wins.

    Arguments:
    glove_file -- path to the GloVe text file (read as UTF-8)

    Returns:
    words_to_index -- dict mapping each word to its 1-based position in sorted order
    index_to_words -- dict mapping each 1-based position back to its word
    word_to_vec_map -- dict mapping each word to a float64 numpy vector
    """
    word_to_vec_map = {}
    # Open explicitly as UTF-8: the platform default encoding (e.g. cp1252 on
    # Windows) cannot decode the non-ASCII tokens in the GloVe vocabulary.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # Indices start at 1, leaving 0 unused by any word.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

In [10]:
# Read the 50-dimensional GloVe vectors and the word <-> index lookups.
(word_to_index,
 index_to_word,
 word_to_vec_map) = read_glove_vecs('glove.6B.50d.txt')

In [11]:
print('The pre-trained set contains',len(word_to_index),'words')

The pre-trained set contains 400000 words


In [14]:
print('Each word is represend as a', word_to_vec_map['dummy'].shape[0], 'dimension vector')

Each word is represend as a 50 dimension vector


## 4.2 Preprocess

Convert each tweet into a fixed-length list of GloVe vocabulary indices (zero-padded), and one-hot encode the sentiment labels.

In [18]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Convert an array of sentences into a zero-padded matrix of word indices.

    Words are lower-cased before lookup. Words missing from `word_to_index`
    are skipped, so the remaining words of the sentence are still encoded.
    Sentences with more than `max_len` known words are truncated.

    Arguments:
    X -- array-like of sentence strings, of shape (m,)
    word_to_index -- dict mapping a word to its integer index
    max_len -- number of columns in the output; shorter sentences are padded with 0

    Returns:
    X_indices -- float numpy array of shape (m, max_len) holding word indices
    """
    # Positional indexing below must not depend on a pandas index.
    X = np.asarray(X)
    m = X.shape[0]
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        j = 0
        for w in (token.lower() for token in X[i].split()):
            if j >= max_len:
                # Row is full: truncate instead of writing out of bounds.
                break
            if w not in word_to_index:
                # An unknown word should not discard the rest of the sentence.
                continue
            X_indices[i, j] = word_to_index[w]
            j += 1

    return X_indices

In [16]:
def convert_to_one_hot(Y, C):
    """
    One-hot encode integer class labels.

    Arguments:
    Y -- array-like of integer labels in [0, C); numpy arrays, pandas Series
         and lists are accepted, and any shape is flattened
    C -- number of classes

    Returns:
    numpy array of shape (number of labels, C) with a single 1.0 per row
    """
    # np.asarray makes this work for pandas Series and lists, which have no
    # reshape() of their own; for numpy input it is a no-op.
    labels = np.asarray(Y).reshape(-1)
    return np.eye(C)[labels]

In [None]:
# Encode the tweet texts as zero-padded rows of GloVe word indices.
train_X_indices=sentences_to_indices(train_X.values, word_to_index, maxLen)
test_X_indices=sentences_to_indices(test_X.values, word_to_index, maxLen)

# Hand over plain NumPy label arrays, like the text inputs above: a pandas
# Series does not provide reshape() in current pandas versions.
train_Y_oh=convert_to_one_hot(train_Y.values, C=3)
test_Y_oh=convert_to_one_hot(test_Y.values, C=3)

## 4.3 Build model

In [None]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a non-trainable Keras Embedding() layer and loads pre-trained word vectors into it.

    Arguments:
    word_to_vec_map -- dict mapping each word to its embedding vector (numpy array);
                       all vectors are expected to share the same length
    word_to_index -- dict mapping each word to its row in the embedding matrix

    Returns:
    embedding_layer -- Keras Embedding layer whose weights are the embedding matrix

    Raises:
    ValueError -- if word_to_vec_map is empty, so no embedding size can be inferred
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")

    # One extra row: the matrix is indexed directly by the values of
    # word_to_index, which are expected to start at 1 (row 0 stays all-zero).
    vocab_len = len(word_to_index) + 1
    # Take the dimension from any stored vector rather than relying on one
    # particular word being present in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)
    # The layer must be built before its weights can be replaced.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [None]:
def model_rnn(input_shape, word_to_vec_map, word_to_index):
    """
    Assemble the recurrent sentiment classifier.

    The network maps a sequence of word indices through the embedding layer
    returned by pretrained_embedding_layer, two stacked 128-unit LSTM layers
    (each followed by 50% dropout) and a 3-way softmax output.

    Arguments:
    input_shape -- shape of one input sample, passed to Input()
    word_to_vec_map -- dict mapping words to their embedding vectors
    word_to_index -- dict mapping words to their integer indices

    Returns:
    model -- an uncompiled Keras Model
    """
    token_ids = Input(shape=input_shape, dtype='int32')

    # Look up the pre-trained vector for every word index.
    glove_embedding = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    hidden = glove_embedding(token_ids)

    # First LSTM emits the full sequence so the second one can consume it.
    hidden = LSTM(128, return_sequences=True)(hidden)
    hidden = Dropout(0.5)(hidden)

    # Second LSTM keeps only its final output.
    hidden = LSTM(128, return_sequences=False)(hidden)
    hidden = Dropout(0.5)(hidden)

    scores = Dense(3)(hidden)
    probabilities = Activation('softmax')(scores)

    return Model(inputs=token_ids, outputs=probabilities)

In [None]:
# Instantiate the network for sequences of maxLen word indices and show its layers.
model = model_rnn(
    (maxLen,),
    word_to_vec_map,
    word_to_index,
)
model.summary()

In [None]:
# Multi-class setup: cross-entropy over one-hot labels, Adam optimizer, accuracy metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the encoded tweets, reshuffling the samples every epoch.
model.fit(
    train_X_indices,
    train_Y_oh,
    epochs=50,
    batch_size=32,
    shuffle=True,
)

In [None]:
# Score the trained model on the held-out split.
loss, acc = model.evaluate(
    test_X_indices,
    test_Y_oh,
)