In [1]:
import tensorflow as tf
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2.2.2
Num GPUs Available:  2


In [2]:
import os
use_gpu = True
if use_gpu:
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID";
    # The GPU id to use, usually either "0" or "1";
    os.environ["CUDA_VISIBLE_DEVICES"] = "0";

In [3]:
from han.model import HAN

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils.np_utils import to_categorical
from nltk import tokenize

In [6]:
#Importing the dataset

dataset = pd.read_csv('../IMDB Review - LSTM with Attention/imdb_master.csv', encoding = "ISO-8859-1")
dataset.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [7]:
EMB_SIZE = 300
MAX_FEATURES = 100000 # how many unique words to use (i.e num rows in embedding vector)
MAX_LEN = 100 # Maximum length for texts
MAX_SENT = 10
EMBEDDING_FILE = '../BIGRU-Attention_visualized/glove.840B.300d/glove.840B.300d.txt'

In [8]:
#Splitting into training and test set
dataset = dataset.drop(['Unnamed: 0', 'file'], axis = 1)
dataset = dataset[dataset.label != 'unsup']
dataset['label'] = dataset['label'].map({'pos': 1, 'neg': 0})
dataset_test = dataset[dataset['type'] == 'test']
dataset_train = dataset[dataset['type'] == 'train']

#X_test = dataset_test.iloc[:, 1:2].values
#y_test = dataset_test.iloc[:, 2].values
#X_train = dataset_train.iloc[:, 1:2].values
#y_train = dataset_train.iloc[:, 2].values

In [10]:
texts = []
reviews = []
labels = []
for idx in dataset_train.index:
    texts.append(dataset_train.review[idx])
    sentences = tokenize.sent_tokenize(dataset_train.review[idx])
    reviews.append(sentences)
    labels.append(dataset_train.label[idx])

In [11]:
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(texts)

In [12]:
data = np.zeros((len(texts), MAX_SENT, MAX_LEN), dtype='int32')

In [13]:
for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < MAX_SENT:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                if k < MAX_LEN and tokenizer.word_index[word] < MAX_FEATURES:
                    data[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

In [14]:
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(labels))
print('Shape of reviews (data) tensor:', data.shape)
print('Shape of sentiment (label) tensor:', labels.shape)

Total 88334 unique tokens.
Shape of reviews (data) tensor: (25000, 10, 100)
Shape of sentiment (label) tensor: (25000, 2)


In [15]:
validation_split = 0.2

In [18]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(validation_split * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in training and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Number of positive and negative reviews in training and validation set
[ 9979. 10021.]
[2521. 2479.]


In [19]:
glove_dir = '../BIGRU-Attention_visualized/glove.840B.300d/'
EMBEDDING_FILE = 'glove.840B.300d.txt'

In [20]:
embeddings_index = {}
f = open(os.path.join(glove_dir, EMBEDDING_FILE))
for line in f:
    try:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        pass
f.close()

In [21]:
embedding_dim = 300
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
embedding_matrix.shape

(88335, 300)

In [22]:
han = HAN(embedding_matrix, max_sent_length=MAX_LEN, max_sent_num=15)
han.print_summary()

Word Level
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 300)          26500500  
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 100)          105600    
_________________________________________________________________
time_distributed (TimeDistri (None, 100, 100)          10100     
_________________________________________________________________
word_attention (Attention)   (None, 100)               10200     
Total params: 26,626,400
Trainable params: 125,900
Non-trainable params: 26,500,500
_________________________________________________________________
Sentence Level
Model: "model_1"
_________________________________________________________________


In [24]:
checkpoint_path = './'

In [25]:
han.train_model(checkpoint_path, x_train, y_train, x_val, y_val)

Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.52192, saving model to ./
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ./assets
Epoch 2/10
Epoch 00002: val_loss improved from 0.52192 to 0.41436, saving model to ./
INFO:tensorflow:Assets written to: ./assets
Epoch 3/10
Epoch 00003: val_loss improved from 0.41436 to 0.38043, saving model to ./
INFO:tensorflow:Assets written to: ./assets
Epoch 4/10
Epoch 00004: val_loss improved from 0.38043 to 0.36082, saving model to ./
INFO:tensorflow:Assets written to: ./assets
Epoch 5/10
Epoch 00005: val_loss improved from 0.36082 to 0.36009, saving model to ./
INFO:tensorflow:Assets written to: ./assets
Epoch 6/10

KeyboardInterrupt: 

In [64]:
def word_att_to_df(sent_tokenized_review, word_att):
    """Convert the word attention arrays into pandas dataframe.

    Args:
        sent_tokenized_review: sentence tokenized review, which means sent_tokenize(review)
            has to be executed beforehand. And only one review is allowed, since it's
            on word attention level, and also it's the required input size in
            self.show_word_attention, but review can contain multiple sentences.
        word_att: attention weights obtained from self.show_word_attention.

    Returns:
        df: pandas.DataFrame, contains original reviews column and word_att column,
            and word_att column is a list of dictionaries in which word as key while
            corresponding weight as value.
    """
    # remove the trailing dot
    ori_sents = [i.rstrip('.') for i in sent_tokenized_review]
    # split sentences into words
    ori_words = [x.split() for x in ori_sents]
    # truncate attentions to have equal size of number of words per sentence
    truncated_att = [i[:len(k)] for i, k in zip(word_att, ori_words)]

    # create word attetion pair as dictionary
    word_att_pair = []
    for i, j in zip(truncated_att, ori_words):
        word_att_pair.append(dict(zip(j, i)))

    return pd.DataFrame([(x, y) for x, y in zip(word_att_pair, ori_words)],
                        columns=['word_att', 'review'])

In [84]:
line=4
X = x_train[line:line+1]
han.model.predict(X), y_train[line]

(array([[0.9332636 , 0.06673647]], dtype=float32),
 array([1., 0.], dtype=float32))

In [85]:
X = x_train[line:line+1]
sent_att = han.show_sent_attention(X)
sent_tokenized_reviews = [tokenizer.sequences_to_texts(X[0])]
res = han.sent_att_to_df(sent_tokenized_reviews, sent_att)
res['sent_att'][0]


The following Variables were used a Lambda layer's call (lambda_18), but
are not present in its tracked objects:
  <tf.Variable 'sent_attention/W:0' shape=(100, 100) dtype=float32>
  <tf.Variable 'sent_attention/bias:0' shape=(100,) dtype=float32>
  <tf.Variable 'sent_attention/context_vector:0' shape=(100,) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.


[{'sometimes you wonder how some people get funding to create a movie as bad as this one': 0.33137226},
 {'you can only stand about 5 minutes of this utter piece of garbage before you stomp back into blockbuster and demand your money back': 0.33253688},
 {'i will now look at michael clarke duncan with apprehension why he lent his name to this vermin': 0.26695547},
 {'': 0.017948981},
 {'': 0.010750748},
 {'': 0.008599101},
 {'': 0.007722782},
 {'': 0.0074265976},
 {'': 0.007684304},
 {'': 0.009002919}]

In [86]:
X = x_train[line]
word_att = han.show_word_attention(X)
sent_tokenized_review = tokenizer.sequences_to_texts(X)

res = word_att_to_df(sent_tokenized_review, word_att)
res

The following Variables were used a Lambda layer's call (lambda_19), but
are not present in its tracked objects:
  <tf.Variable 'word_attention/W:0' shape=(100, 100) dtype=float32>
  <tf.Variable 'word_attention/bias:0' shape=(100,) dtype=float32>
  <tf.Variable 'word_attention/context_vector:0' shape=(100,) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.


Unnamed: 0,word_att,review
0,"{'sometimes': 0.0970492, 'you': 0.05971423, 'w...","[sometimes, you, wonder, how, some, people, ge..."
1,"{'you': 0.031306367, 'can': 0.032320075, 'only...","[you, can, only, stand, about, 5, minutes, of,..."
2,"{'i': 0.101176225, 'will': 0.038010504, 'now':...","[i, will, now, look, at, michael, clarke, dunc..."
3,{},[]
4,{},[]
5,{},[]
6,{},[]
7,{},[]
8,{},[]
9,{},[]


In [87]:
res['word_att'][0]

{'sometimes': 0.0970492,
 'you': 0.05971423,
 'wonder': 0.07688511,
 'how': 0.065937005,
 'some': 0.06308151,
 'people': 0.049403578,
 'get': 0.027143007,
 'funding': 0.022601131,
 'to': 0.018076822,
 'create': 0.025095986,
 'a': 0.048973765,
 'movie': 0.087236926,
 'as': 0.04986466,
 'bad': 0.1333325,
 'this': 0.039167803,
 'one': 0.023914803}

In [63]:
# remove the trailing dot
ori_sents = [i.rstrip('.') for i in sent_tokenized_review]
ori_words = [x.split() for x in ori_sents]
# truncate attentions to have equal size of number of words per sentence
truncated_att = [i[:len(k)] for i, k in zip(word_att, ori_words)]
truncated_att

[array([0.05813286, 0.03906306, 0.05714861, 0.04202345, 0.01552521,
        0.02846979, 0.02747771, 0.02220716, 0.03549959, 0.03695996,
        0.03711698, 0.04200925, 0.0235565 , 0.04255753, 0.0676116 ,
        0.07111941, 0.05248561, 0.0175731 , 0.0228399 , 0.02068773,
        0.01858273, 0.04009108, 0.03199178, 0.02384928, 0.04577547,
        0.01633245, 0.02111959, 0.01378319], dtype=float32),
 array([0.07990089, 0.09992402, 0.11308374, 0.1032338 , 0.13684615,
        0.14657366, 0.0914412 , 0.06709768, 0.03685163, 0.0169361 ,
        0.01950104, 0.02044013], dtype=float32),
 array([0.06071807, 0.06673838, 0.08979163, 0.06033099, 0.03629679,
        0.05039718, 0.0693971 , 0.08074046, 0.07927234, 0.05090144,
        0.02532975, 0.04707265, 0.03145685, 0.04224804, 0.06088854,
        0.04207875, 0.03435449, 0.01727689], dtype=float32),
 array([0.21620734, 0.21242759, 0.13868198, 0.0753084 , 0.08488233],
       dtype=float32),
 array([0.07894012, 0.11042175, 0.05704086, 0.05062545, 0

In [None]:
        # split sentences into words
        
        

        # create word attetion pair as dictionary
        word_att_pair = []
        for i, j in zip(truncated_att, ori_words):
            word_att_pair.append(dict(zip(j, i)))

        return pd.DataFrame([(x, y) for x, y in zip(word_att_pair, ori_words)],
                            columns=['word_att', 'review'])