In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import time
import os

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.contrib.rnn.python.ops import core_rnn_cell
from nltk.corpus import stopwords
import nbimporter
import re
from functions import prep
import statistics
import math
from customLSTMcell import CustomCell
from customLSTMcell import LayerNormalizedLSTMCell

Importing Jupyter notebook from functions.ipynb


In [2]:
# --- Text preprocessing -----------------------------------------------------
MAX_NB_WORDS = 200000       # vocabulary cap passed to the Keras Tokenizer
MAX_SEQUENCE_LENGTH = 18    # length every tweet is padded/truncated to
EMBEDDING_DIM = 300         # default RNN state size in build_graph
FORWARD_STEPS = 1           # not referenced in the visible part of the notebook

# --- Batching and data split ------------------------------------------------
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.1      # fraction of the shuffled data held out for validation

# --- Training defaults ------------------------------------------------------
num_epochs = 20
num_hidden = 100            # not referenced in the visible part of the notebook

In [3]:
# Read the airline tweets and keep only the raw text and its sentiment label.
tweet_data = pd.read_csv('datasets/us_airline.csv')
columns = ['text', 'airline_sentiment']

# dropna() returns a new frame, so its result has to be kept for the call to
# have any effect. The explicit copy() makes the column assignments below
# operate on an independent frame instead of a slice of `tweet_data`.
tweet_data_extract = tweet_data[columns].dropna().copy()

# Cleaned text used for tokenisation.
tweet_data_extract["clean_text"] = tweet_data_extract["text"].map(prep.text_to_wordlist)

# Integer class label for each tweet.
# NOTE(review): presumably the label is the position of the sentiment in the
# list below -- confirm against prep.sentiment_to_label in functions.ipynb.
tweet_data_extract["labels"] = tweet_data_extract["airline_sentiment"].map(
    lambda sentiment: prep.sentiment_to_label(sentiment, ['neutral', 'positive', 'negative'])
)
print('Found %s tweets' % len(tweet_data_extract["clean_text"]))

Found 14640 tweets


In [4]:
# Fit a word-level vocabulary on the cleaned tweets, then encode every tweet
# as a sequence of word ids padded/truncated to MAX_SEQUENCE_LENGTH.
clean_texts = tweet_data_extract["clean_text"]

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(clean_texts)

sequence_tweets = tokenizer.texts_to_sequences(clean_texts)
sequence_tweets_pad = pad_sequences(sequence_tweets, maxlen=MAX_SEQUENCE_LENGTH)

# word -> id mapping learned by the tokenizer; reused to build the embedding matrix.
word_index = tokenizer.word_index
print('Found %s tokens' % len(word_index))

Found 15074 tokens


In [5]:
# Input GloVe vectors and the word2vec-format file path handed to the helper.
word2vec_file = "datasets/glove_word2vec.txt"
GloVe_file = "datasets/glove.6B/glove.6B.300d.txt"

# NOTE(review): presumably converts the GloVe file to word2vec format and
# returns one embedding row per vocabulary entry -- confirm in functions.ipynb.
embedding_matrix = prep.word2vec_GloVe(
    GloVe_file,
    word2vec_file,
    word_index=word_index,
)

# The row count is the default `num_classes` of build_graph.
nb_words = embedding_matrix.shape[0]

In [6]:
# Materialise inputs and labels as int32 numpy arrays.
train_seq = sequence_tweets_pad
label_seq = tweet_data_extract["labels"]
np_train_seq = np.array(list(train_seq), dtype='int32')
np_label_seq = np.array(list(label_seq), dtype='int32')

# ---------------------------------------------------------------------------
# Shuffle once with a fixed seed, then cut the permutation into a training
# part (the first 1 - VALIDATION_SPLIT of the rows) and a validation part.
# ---------------------------------------------------------------------------
np.random.seed(1234)
num_samples = len(np_train_seq)
num_train_samples = int(num_samples*(1-VALIDATION_SPLIT))

perm = np.random.permutation(num_samples)
idx_train, idx_val = perm[:num_train_samples], perm[num_train_samples:]

data_train, labels_train = np_train_seq[idx_train], np_label_seq[idx_train]
data_val, labels_val = np_train_seq[idx_val], np_label_seq[idx_val]

# Quick sanity report on the training split.
print('dtype of train data: %s' % data_train.dtype)
print('dtype of train label: %s' % labels_train.dtype)
print('shape of train data: ', data_train.shape)
print('shape of train label: ', labels_train.shape)

dtype of train data: int32
dtype of train label: int32
shape of train data:  (13176, 18)
shape of train label:  (13176,)


In [7]:
# Batches needed to cover each split once. Ceiling division counts a trailing
# partial batch as a step, and does not add an extra step when the number of
# rows is an exact multiple of BATCH_SIZE.
steps_per_train_epoch = (data_train.shape[0] + BATCH_SIZE - 1) // BATCH_SIZE
steps_per_valid_epoch = (data_val.shape[0] + BATCH_SIZE - 1) // BATCH_SIZE
print('TRIAN STEP %d' % steps_per_train_epoch)
print('VALID STEP %d' % steps_per_valid_epoch)

# Bundle consumed by train_network: inputs, labels and the embedding matrix
# that is fed into the graph's embeddings placeholder.
data = {'train': data_train, 'labels': labels_train, 'embedding': embedding_matrix}

TRIAN STEP 412
VALID STEP 46


In [8]:
def reset_graph():
    """Close a module-level ``sess`` (if one exists and is truthy) and reset the default TF graph."""
    open_session = globals().get('sess')
    if open_session:
        open_session.close()
    tf.reset_default_graph()
    
def train_network(g, data, num_epochs, batch_size = 32, verbose = True, save=False, pretrain_model=None):
    """Train the sentiment objective of a graph produced by ``build_graph``.

    Args:
        g: dict of graph handles. The keys read here are 'x', 'y_sentiment',
            'embeddings', 'init_state', 'final_state', 'total_loss',
            'rnn_outputs_reshape', 'train_step_unfreeze' and 'saver'.
        data: dict with 'train' (array of id sequences, one row per example),
            'labels' (one label per row) and 'embedding' (matrix fed into the
            embeddings placeholder on every step).
        num_epochs: number of passes over the training data.
        batch_size: rows fed per step. build_graph creates placeholders with a
            fixed batch dimension, so this has to match the value the graph
            was built with.
        verbose: when true, print the average loss after every epoch.
        save: checkpoint path prefix (str). Any non-str value disables saving.
        pretrain_model: optional checkpoint path; when given, the latest
            checkpoint found in its directory is restored before training.

    Returns:
        List with the average training loss of each epoch.

    Raises:
        ValueError: if there are fewer training rows than ``batch_size``.
    """
    num_examples = data['train'].shape[0]
    if num_examples < batch_size:
        raise ValueError(
            "need at least batch_size=%d training examples, got %d"
            % (batch_size, num_examples))

    # Derive the step count from the arguments rather than from a notebook
    # global, so the function stays correct for any data / batch_size pair.
    steps_per_epoch = (num_examples + batch_size - 1) // batch_size
    # Largest window start is wrapped with this span; it is 0 when the data
    # holds exactly one batch, in which case every step starts at row 0.
    offset_span = num_examples - batch_size

    # NOTE(review): the graph-level seed only influences ops created after
    # this call, and the graph in `g` already exists at this point -- confirm
    # whether seeding should happen before the graph is built instead.
    tf.set_random_seed(2345)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if pretrain_model is not None:
            ckpt = tf.train.get_checkpoint_state(os.path.dirname(pretrain_model))
            if ckpt and ckpt.model_checkpoint_path:
                print("Loading Model...")
                g["saver"].restore(sess, ckpt.model_checkpoint_path)
                print("Pre-trained Model Loaded")

        training_losses = []
        rnn_reference = None  # RNN outputs of the very first batch (epoch 0, step 0)
        for epoch in range(num_epochs):
            training_loss = 0
            # The RNN state is carried from batch to batch within an epoch and
            # starts from the graph's zero state at the beginning of each epoch.
            training_state = None
            for step in range(steps_per_epoch):
                offset = (step*batch_size) % offset_span if offset_span else 0
                batch_data = data['train'][offset:(offset+batch_size)]
                batch_labels = data['labels'][offset:(offset+batch_size)]

                feed_dict = {g['x']: batch_data,
                             g['y_sentiment']: batch_labels,
                             g['embeddings']: data['embedding']}
                if training_state is not None:
                    feed_dict[g['init_state']] = training_state
                training_loss_, training_state, rnn_outputs_reshape, _ = sess.run(
                    [g['total_loss'],
                     g['final_state'],
                     g['rnn_outputs_reshape'],
                     g['train_step_unfreeze']],
                    feed_dict)
                training_loss += training_loss_

                # Step 0 always sees the same rows, so comparing its RNN
                # outputs against epoch 0 shows how far the RNN has moved.
                if step == 0:
                    if epoch == 0:
                        rnn_reference = np.array(rnn_outputs_reshape)
                    else:
                        rnn_mean_diff = np.mean(np.square(np.array(rnn_outputs_reshape) - rnn_reference))
                        print("difference of rnn_ouputs: %f" % rnn_mean_diff)

            if verbose:
                print("Average training loss for Epoch", epoch, ":", training_loss/steps_per_epoch)
            training_losses.append(training_loss/steps_per_epoch)

        if isinstance(save, str):
            # num_epochs equals the former (epoch + 1) after a full run and is
            # also defined when the epoch loop never executes.
            g['saver'].save(sess, save, global_step=num_epochs)

    return training_losses

In [9]:
def build_graph(
    cell_type = None,
    num_weights_for_custom_cell = 5,
    state_size = EMBEDDING_DIM,
    num_classes = nb_words,
    num_labels = 3,
    batch_size = BATCH_SIZE,
    num_steps = MAX_SEQUENCE_LENGTH,
    num_layers = 3,
    build_with_dropout=False,
    build_with_stopgradient = False,
    learning_rate = 1e-4):
    """Build a stacked-RNN graph with a word-prediction head and a sentiment head.

    The default graph is reset first. Embeddings are supplied through a
    placeholder (not a variable), looked up for the input ids and run through
    ``num_layers`` stacked cells. Two heads sit on top of the RNN outputs:
    a per-time-step softmax over ``num_classes`` words, and a sentiment
    softmax over ``num_labels`` classes computed from the outputs averaged
    over axis 1.

    Args:
        cell_type: 'Custom', 'GRU', 'LSTM' or 'LN_LSTM'; any other value
            selects ``BasicRNNCell``.
        num_weights_for_custom_cell: second constructor argument of
            ``CustomCell``; only used when ``cell_type == 'Custom'``.
        state_size: number of units per cell; also the width of the
            embeddings placeholder.
        num_classes: number of rows in the embeddings placeholder and number
            of word-prediction classes. The default is bound to ``nb_words``
            when this ``def`` is executed, not when the function is called.
        num_labels: number of sentiment classes.
        batch_size: fixed batch dimension of every placeholder.
        num_steps: fixed sequence length of the input placeholders.
        num_layers: number of stacked cells.
        build_with_dropout: wrap the cells in ``DropoutWrapper``. The keep
            probability is the constant 1.0 defined below, so as written the
            wrappers do not drop anything.
        build_with_stopgradient: accepted but not used in this function.
        learning_rate: learning rate of every Adam optimizer created here.

    Returns:
        dict with the placeholders ('x', 'y_word', 'y_sentiment',
        'embeddings'), the RNN state handles ('init_state', 'final_state'),
        'total_loss' (mean sentiment cross-entropy), the train ops
        ('train_step_word', 'train_step_freeze', 'train_step_unfreeze'),
        'rnn_outputs_reshape', 'preds' (sentiment softmax) and a 'saver'.
    """

    reset_graph()

    with tf.variable_scope("global"):
        with tf.variable_scope("awd_lstm"):
            x = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder')
            y_word = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')
            y_sentiment = tf.placeholder(tf.int32, [batch_size], name='sentiments_placeholder')
            embeddings = tf.placeholder(tf.float32, [num_classes, state_size], name='embeddings_placeholder')
            dropout = tf.constant(1.0)

            #embeddings = tf.get_variable('embedding_matrix', [nb_words, num_classes])
            rnn_inputs = tf.nn.embedding_lookup(embeddings, x)

            def make_cell():
                """Create one fresh cell of the requested type (plus optional input dropout)."""
                if cell_type == 'Custom':
                    layer_cell = CustomCell(state_size, num_weights_for_custom_cell)
                elif cell_type == 'GRU':
                    layer_cell = tf.nn.rnn_cell.GRUCell(state_size)
                elif cell_type == 'LSTM':
                    layer_cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
                elif cell_type == 'LN_LSTM':
                    layer_cell = LayerNormalizedLSTMCell(state_size)
                else:
                    layer_cell = tf.nn.rnn_cell.BasicRNNCell(state_size)

                if build_with_dropout:
                    layer_cell = tf.nn.rnn_cell.DropoutWrapper(layer_cell, input_keep_prob=dropout)
                return layer_cell

            # One cell object per layer. `[cell] * num_layers` would place the
            # same object in every layer; for the built-in cells that either
            # raises or makes all layers share one set of weights, depending
            # on the TensorFlow 1.x release.
            # NOTE(review): variable names of stacked built-in cells may
            # differ from checkpoints written with the shared-cell version.
            layer_cells = [make_cell() for _ in range(num_layers)]

            if cell_type == 'LSTM' or cell_type == 'LN_LSTM':
                cell = tf.nn.rnn_cell.MultiRNNCell(layer_cells, state_is_tuple=True)
            else:
                cell = tf.nn.rnn_cell.MultiRNNCell(layer_cells)

            if build_with_dropout:
                cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=dropout)

            init_state = cell.zero_state(batch_size, tf.float32)
            rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_state)
            rnn_outputs = tf.identity(rnn_outputs, 'rnn_outputs_tensor')

        #-------------------------------- NEXT WORDING PREDICTION --------------------------------------------
        with tf.variable_scope('word_prediction'):
            W = tf.get_variable('W', [state_size, num_classes])
            b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
            # Flatten (batch, step) so a single matmul scores every time step.
            rnn_outputs_reshape = tf.reshape(rnn_outputs, [-1, state_size])
            word_logits = tf.matmul(rnn_outputs_reshape, W) + b
            word_predictions = tf.nn.softmax(word_logits, name='predictions')
            y_reshaped = tf.reshape(y_word, [-1])
            word_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_reshaped, logits=word_logits, name="word_cross_entropy")
            word_cross_entropy_mean = tf.reduce_mean(word_cross_entropy, name="word_cross_entropy_mean")
            word_train_step = tf.train.AdamOptimizer(learning_rate).minimize(word_cross_entropy_mean)

        #------------------------------- SENTIMENT CLASSIFICATION --------------------------------------------
        with tf.variable_scope('sentiment_softmax'):
            W_s = tf.get_variable('W_s', [state_size, num_labels])
            b_s = tf.get_variable('b_s', [num_labels], initializer=tf.constant_initializer(0.0))
            # Average the RNN outputs over axis 1 to get one vector per example.
            rnn_outputs_mean = tf.reduce_mean(rnn_outputs, 1)
            sentiment_logits = tf.matmul(rnn_outputs_mean, W_s) + b_s
            sentiment_predictions = tf.nn.softmax(sentiment_logits, name='predictions_sentiment')
            sentiment_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_sentiment, logits=sentiment_logits, name="cross_entropy_sentiment")
            sentiment_cross_entropy_mean = tf.reduce_mean(sentiment_cross_entropy, name="cross_entropy_sentiment_mean")

        # "freeze" updates only the variables under global/sentiment_softmax;
        # "unfreeze" is given no var_list and so is not restricted to them.
        freeze_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="global/sentiment_softmax")
        sentiment_train_step_freeze = tf.train.AdamOptimizer(learning_rate).minimize(sentiment_cross_entropy_mean, var_list=freeze_train_vars)
        sentiment_train_step_unfreeze = tf.train.AdamOptimizer(learning_rate).minimize(sentiment_cross_entropy_mean)

    return dict(
        x = x,
        y_word = y_word,
        y_sentiment = y_sentiment,
        embeddings = embeddings,
        init_state = init_state,
        final_state = final_state,
        total_loss = sentiment_cross_entropy_mean,
        train_step_word = word_train_step,
        train_step_freeze = sentiment_train_step_freeze,
        train_step_unfreeze = sentiment_train_step_unfreeze,
        rnn_outputs_reshape = rnn_outputs_reshape,
        preds = sentiment_predictions,
        saver = tf.train.Saver()
    )

In [13]:
# Build the layer-normalised LSTM graph and train the sentiment objective.
g = build_graph(cell_type='LN_LSTM', num_steps=MAX_SEQUENCE_LENGTH)

# NOTE(review): these two paths are defined but not passed to train_network,
# so no pre-trained weights are restored and no checkpoint is written.
# Confirm whether `save=save` / `pretrain_model=pretrain_model` were intended.
pretrain_model = "base_model/LN_LSTM_word_prediction.ckpt"
save = "benchmark_model/LN_LSTM_sentiment.ckpt"

t = time.time()
losses = train_network(g, data, num_epochs)
# Report the epoch count that was actually used instead of a hard-coded number.
print("It took", time.time() - t, "seconds to train for", num_epochs, "epochs.")
print("The average loss on the final epoch was:", losses[-1])

Average training loss for Epoch 0 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 1 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 2 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 3 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 4 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 5 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 6 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 7 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 8 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 9 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 10 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 11 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 12 : nan
difference of rnn_ouputs: nan
Average training loss for Epoch 13 : nan
difference of rnn_ouputs: nan
Average training