In [None]:
# Revised data_prepare for attention visualization

In [1]:
# The following code is packaged as a .py file that can be imported in other Python scripts.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import nltk
import string
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
# function for text cleaning

# Contraction rewrites, applied in order. The specific "...'s" forms must come
# before the generic "'s" rule, otherwise they can never match.
_CONTRACTIONS = (
    ("won't", "will not"), ("cannot", "can not"), ("can't", "can not"),
    ("n't", " not"), ("what's", "what is"), ("it's", "it is"),
    ("'ve", " have"), ("i'm", "i am"), ("'re", " are"),
    ("he's", "he is"), ("she's", "she is"),
    ("how's", "how is"), ("where's", "where is"),
    ("'s", " own"), ("'ll", " will"),
)

# A real translation table: every ASCII punctuation character becomes a space,
# so that e.g. "well-known" yields two words instead of one glued token.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))

_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = set(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def clean_text(text):
    """Normalise one article and return both the cleaned text and its raw tokens.

    Steps: lower-case, normalise typographic apostrophes, expand contractions,
    tokenize (these raw tokens are returned for visualisation), then strip
    punctuation, keep purely alphabetic words, and drop English stop words and
    words shorter than three characters.

    NOTE: the previous ``text.translate(string.punctuation)`` call did not
    remove punctuation (a plain string is not a translation table) and turned
    control characters such as ``"\\n"`` into punctuation. The cleaned text can
    therefore differ slightly from what earlier runs produced; re-validate
    models or tokenizers fitted on the old output.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    (str, list of str)
        The cleaned words joined by single spaces, and the ``word_tokenize``
        tokens of the contraction-expanded (but not punctuation-stripped) text.
    """
    # replace non-readable apostrophes
    text = text.lower().replace("′", "'").replace("’", "'")
    # replace contractions with their expanded form
    for short, expanded in _CONTRACTIONS:
        text = text.replace(short, expanded)

    # raw tokens, punctuation included, kept for the attention visualisation
    token = word_tokenize(text)

    # remove punctuation, then keep alphabetic words only
    tokens2 = word_tokenize(text.translate(_PUNCT_TO_SPACE))
    words = [word for word in tokens2 if word.isalpha()]

    # remove stop words and very short words
    stops = _english_stopwords()
    word_filter = [w for w in words if w not in stops and len(w) >= 3]
    text_return = " ".join(word_filter)
    return text_return, token
# function for data loading
def load_data(dataset, data_size, encoding="utf-8"):
    """Read the last ``data_size`` lines of ``dataset`` and clean each one.

    Parameters
    ----------
    dataset : str
        Path to a text file holding one article per line.
    data_size : int
        Number of trailing lines to load (``lines[-data_size:]``).
    encoding : str, optional
        Text encoding of the file. Defaults to UTF-8 so that the typographic
        apostrophes handled by ``clean_text`` decode the same way on every
        platform instead of depending on the locale default.

    Returns
    -------
    (list of str, list of list of str)
        The cleaned article strings and, per article, the raw token list
        returned by ``clean_text``.
    """
    articles = []
    token_all = []
    # read everything first so the file handle is released before the slow cleaning step
    with open(dataset, encoding=encoding) as f:
        lines = f.readlines()
    for count, item in enumerate(lines[-data_size:], start=1):
        seq, token = clean_text(item)
        articles.append(seq)
        token_all.append(token)
        # progress indicator every 100 articles
        if count % 100 == 0:
            print(str(count))
    return articles, token_all
# function for labels loading
def load_labels(labelset,data_size):
    """Return the last ``data_size`` integer labels of ``labelset`` as a NumPy array.

    The file is expected to hold one integer label per line.
    """
    with open(labelset) as handle:
        tail = handle.readlines()[-data_size:]
        # the text before the first newline is the label
        parsed = [int(row.split("\n")[0]) for row in tail]
    return np.asarray(parsed)
# function for tokenization and padding
def tokenize(articles,vocabulary_size,sequence_length,load_model,tokenizer):
    """Turn cleaned articles into a fixed-width matrix of word indices.

    When ``load_model == False`` a new ``Tokenizer`` limited to
    ``vocabulary_size`` words is fitted on ``articles``; otherwise the
    ``tokenizer`` passed in is used as is. Sequences are left-padded
    (``padding='pre'``) and right-truncated (``truncating='post'``) to
    ``sequence_length``.

    Returns the padded index matrix and the tokenizer that produced it.
    """
    needs_fitting = (load_model == False)
    if needs_fitting:
        tokenizer = Tokenizer(num_words=vocabulary_size)
        # builds the word -> index dictionary from the articles
        tokenizer.fit_on_texts(articles)
    index_sequences = tokenizer.texts_to_sequences(articles)
    padded = pad_sequences(
        index_sequences,
        maxlen=sequence_length,
        padding='pre',
        truncating='post',
    )
    return padded, tokenizer
# function for data splitting
def split_data(data,label_array,train_size):
    """Split rows into a leading train part and a trailing test part.

    The first ``train_size`` rows of ``data`` / entries of ``label_array`` form
    the training set; everything after them forms the test set.

    Returns ``X_train, y_train, X_test, y_test``.
    """
    train_rows = slice(0, train_size)
    test_rows = slice(train_size, None)
    return (
        data[train_rows, :],
        label_array[train_rows],
        data[test_rows, :],
        label_array[test_rows],
    )
# run the whole preparation pipeline: load, tokenize, pad and split
def data_ready(dataset, labelset,data_size,vocabulary_size,sequence_length,train_size,load_model,tokenizer):
    """Load articles and labels, index and pad them, then split train/test.

    Returns ``X_train, y_train, X_test, y_test, tokenizer``. The raw tokens
    produced by ``load_data`` are not needed here and are discarded.
    """
    cleaned_articles, _raw_tokens = load_data(dataset, data_size)
    padded, tokenizer = tokenize(
        cleaned_articles, vocabulary_size, sequence_length, load_model, tokenizer
    )
    parts = split_data(padded, load_labels(labelset, data_size), train_size)
    return parts + (tokenizer,)

# function for pre-trained word embedding loading
# tokenizer is needed here to map the words in the w2v file to the words in the articles.
def load_w2v(w2v_file, binary, vocabulary_size, embedding_dim,tokenizer):
    """Build an embedding matrix for the tokenizer's vocabulary from a word2vec file.

    Parameters
    ----------
    w2v_file : str
        Path to a file in word2vec format.
    binary : bool
        Whether ``w2v_file`` is in the binary word2vec format.
    vocabulary_size : int
        Number of rows of the returned matrix; words whose tokenizer index is
        ``>= vocabulary_size`` are skipped.
    embedding_dim : int
        Dimensionality of the vectors (number of columns).
    tokenizer : object
        Fitted tokenizer exposing a ``word_index`` mapping of word -> index.

    Returns
    -------
    numpy.ndarray
        ``(vocabulary_size, embedding_dim)`` matrix; row ``i`` holds the vector
        of the word with index ``i``, or zeros when the word has no vector.

    Raises
    ------
    ValueError
        If the vectors in ``w2v_file`` do not have ``embedding_dim`` dimensions.
    """
    word_vectors = KeyedVectors.load_word2vec_format(w2v_file, binary=binary)
    if word_vectors.vector_size != embedding_dim:
        raise ValueError(
            "embedding_dim is %d but %s holds %d-dimensional vectors"
            % (embedding_dim, w2v_file, word_vectors.vector_size)
        )

    embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
    # Look up only the words that are needed. Copying the whole pre-trained
    # vocabulary into a dict first costs a lot of time and memory, and relied
    # on the `.vocab` attribute, which newer gensim releases no longer provide.
    for word, index in tokenizer.word_index.items():
        if index > vocabulary_size - 1:
            continue
        if word in word_vectors:
            embedding_matrix[index] = word_vectors[word]
    return embedding_matrix
# function for word transform
# to get the vector of all words in articles of batch_size
def trans2input_batch_first(batch_data, batch_size, sequence_length,embedding_dim, embedding_matrix):
    """Look up the embedding of every word index in a batch (batch-first layout).

    Parameters
    ----------
    batch_data : array-like of int
        2-D array of word indices, one row per article. Index 0 is treated as
        padding and always maps to the zero vector.
    batch_size : int
        Number of rows of the output. When ``batch_data`` has fewer rows (a
        short final batch) the remaining output rows stay zero instead of
        raising ``IndexError``.
    sequence_length : int
        Number of positions per article in the output.
    embedding_dim : int
        Size of each embedding vector.
    embedding_matrix : numpy.ndarray
        Matrix whose row ``i`` is the vector of word index ``i``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(batch_size, sequence_length, embedding_dim)``.
    """
    input_data = np.zeros((batch_size, sequence_length, embedding_dim))
    ids = np.asarray(batch_data)
    rows = min(batch_size, ids.shape[0])
    ids = ids[:rows, :sequence_length]
    # integer-array indexing returns a copy, so zeroing the padding positions
    # below never touches embedding_matrix itself
    vectors = np.asarray(embedding_matrix)[ids]
    vectors[ids == 0] = 0
    input_data[:rows, :ids.shape[1], :] = vectors
    return input_data
# to get the vector of all words in articles of batch_size (sequence-first layout)
def trans2input_batch_second(batch_data, batch_size, sequence_length,embedding_dim, embedding_matrix):
    """Look up the embedding of every word index in a batch (sequence-first layout).

    Parameters
    ----------
    batch_data : array-like of int
        2-D array of word indices, one row per article. Index 0 is treated as
        padding and always maps to the zero vector.
    batch_size : int
        Size of the batch axis of the output. When ``batch_data`` has fewer
        rows (a short final batch) the remaining entries stay zero instead of
        raising ``IndexError``.
    sequence_length : int
        Number of positions per article in the output.
    embedding_dim : int
        Size of each embedding vector.
    embedding_matrix : numpy.ndarray
        Matrix whose row ``i`` is the vector of word index ``i``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(sequence_length, batch_size, embedding_dim)``.
    """
    input_data = np.zeros((sequence_length, batch_size, embedding_dim))
    ids = np.asarray(batch_data)
    rows = min(batch_size, ids.shape[0])
    ids = ids[:rows, :sequence_length]
    # integer-array indexing returns a copy, so zeroing the padding positions
    # below never touches embedding_matrix itself
    vectors = np.asarray(embedding_matrix)[ids]
    vectors[ids == 0] = 0
    # swap the (article, position) axes to get the sequence-first layout
    input_data[:ids.shape[1], :rows, :] = vectors.transpose(1, 0, 2)
    return input_data

Using TensorFlow backend.


In [18]:
# The following code is packaged as a .py file that can be imported in other Python scripts.
# This user-defined package is named "test".
import torch
# define a function for model test with pre-trained word2vec
def test_with_w2v(test_set,test_labels,data_size,vocabulary_size,sequence_length,load_model,tokenizer,batch_size,embedding_dim,embedding_matrix,model,batch_first):
    """Evaluate ``model`` on a test set embedded with pre-trained word vectors.

    Parameters
    ----------
    test_set, test_labels : str
        Paths to the article file and the label file (one item per line).
    data_size : int
        Number of trailing lines to load from both files.
    vocabulary_size, sequence_length, load_model, tokenizer
        Passed through to ``tokenize``.
    batch_size : int
        Number of articles per forward pass; the final batch may be smaller.
    embedding_dim : int
        Size of each word vector.
    embedding_matrix : numpy.ndarray
        Word index -> vector matrix.
    model : torch.nn.Module
        Module whose call returns ``(scores, attention_weights)`` for a batch.
    batch_first : bool
        Selects ``trans2input_batch_first`` or ``trans2input_batch_second``.

    Returns
    -------
    tuple
        ``(acc, output_dic, X, token, all_weights, all_true_label, pred_labels)``
        where ``acc`` is the accuracy in percent, ``output_dic`` is a
        JSON-serialisable dict of metrics, ``X`` is the padded index matrix,
        ``token`` holds the raw tokens per article, ``all_weights`` holds one
        attention tensor per article, and the last two are lists of int labels.
    """
    # load data and labels
    articles, token = load_data(test_set, data_size)
    labels = load_labels(test_labels, data_size)
    # tokenize and transform to matrix
    X, tokenizer = tokenize(articles, vocabulary_size, sequence_length, load_model, tokenizer)

    # never index past what was actually loaded
    n_samples = min(data_size, len(X), len(labels))
    to_input = trans2input_batch_first if batch_first else trans2input_batch_second

    all_weights = []
    all_true_label = []
    pred_labels = []
    correct = 0
    total = 0

    # evaluate in inference mode (no dropout, no autograd graph) and restore
    # the caller's training flag afterwards
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for start in range(0, n_samples, batch_size):  # mini batch process
                batch = X[start:min(start + batch_size, n_samples), :]
                rows = batch.shape[0]  # smaller than batch_size for the last batch
                input_x = to_input(batch, rows, sequence_length, embedding_dim, embedding_matrix)
                b_x = torch.from_numpy(input_x).float()

                test_output, attn_weights = model(b_x)  # model output
                all_weights.extend(attn_weights.detach().cpu())

                y_true = labels[start:start + rows]
                all_true_label.extend(y_true.tolist())

                # index of the highest score per article; reshape keeps it 1-D
                # even when the batch holds a single article
                pred_y = torch.max(test_output, 1)[1].cpu().numpy().reshape(-1)
                pred_labels.extend(pred_y.tolist())

                total += rows
                correct += int((pred_y == y_true).sum())
    finally:
        if was_training:
            model.train()

    acc = 100.00 * float(correct) / float(total) if total else 0.0

    # NOTE(review): `output_dic` was returned without ever being defined in this
    # function, so it only worked when a global of that name happened to exist.
    # The metrics below treat label 1 as the positive class — confirm that these
    # are the keys downstream consumers expect.
    pairs = list(zip(all_true_label, pred_labels))
    tp = sum(1 for t, p in pairs if t == 1 and p == 1)
    fp = sum(1 for t, p in pairs if t != 1 and p == 1)
    fn = sum(1 for t, p in pairs if t == 1 and p != 1)
    precision = tp / float(tp + fp) if (tp + fp) else 0.0
    recall = tp / float(tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    output_dic = {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
    return acc, output_dic, X, token, all_weights, all_true_label, pred_labels

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from time import time
import pickle
import json
torch.manual_seed(1)    # fixed seed so runs are reproducible

# --- input files and saved artefacts ---
test_set, test_label = "test_article.txt", "test_label.txt"
load_model_path = "./pytorch_lstmattn_2"
w2v_file = "GoogleNews-vectors-negative300.bin"
w2v_bi = True           # the word2vec file is in binary format

# --- runtime switches ---
cuda_gpu = False        # set to True to move the model to the GPU
batch_first = True
data_size = 300         # number of articles in the dataset

# --- hyper parameters ---
batch_size = 5          # articles per batch
vocabulary_size = 40000 # number of unique words kept
sequence_length = 436   # number of words per article
embedding_dim = 300     # dimension of the word embedding
hidden_dim = 500        # number of units in a hidden layer
num_layers = 1          # number of hidden layers
dropout = 0.0           # dropout rate
output_size = 2         # number of output classes
lr = 0.001              # learning rate

# don't change: testing always loads a saved model and tokenizer
load_model = True
# build an LSTM classifier with dot-product attention over its hidden states
class AttentionLSTM(nn.Module):
    """LSTM encoder followed by attention pooling and a linear classifier.

    Parameters
    ----------
    embedding_dim : int
        Size of each input word vector.
    hidden_dim : int
        Number of hidden units of the LSTM.
    num_layers : int
        Number of stacked LSTM layers.
    output_size : int
        Number of output classes.
    dropout : float
        Dropout probability applied to the attention-pooled vector.
    """

    def __init__(self, embedding_dim, hidden_dim,num_layers,output_size,dropout):
        super(AttentionLSTM,self).__init__() # don't forget to call this!

        self.encoder = nn.LSTM(input_size=embedding_dim, # size of each input vector
                               hidden_size=hidden_dim,
                               num_layers=num_layers,
                               batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(hidden_dim, output_size)
        # Not used in forward(); kept so that state dicts saved from this class
        # (which contain hidden2out.* entries) still load with strict key matching.
        self.hidden2out = nn.Linear(hidden_dim, output_size)
        # explicit dim: the implicit-dimension form of LogSoftmax is deprecated
        self.softmax = nn.LogSoftmax(dim=1)

    def attention_net(self, lstm_output, final_state):
        """Pool the LSTM outputs with attention against the final hidden state.

        ``lstm_output`` is ``(batch, seq_len, hidden_dim)`` and ``final_state``
        is ``(num_layers, batch, hidden_dim)``. The alignment score of every
        time step is its dot product with the last layer's final hidden state;
        the scores are soft-maxed over time and used to average the outputs.

        Returns the pooled state ``(batch, hidden_dim)`` and the attention
        weights ``(batch, seq_len)``.
        """
        # Use only the top layer's final state. Permuting the whole stacked
        # state yields (batch, hidden, num_layers), which breaks the products
        # below as soon as num_layers > 1.
        hidden = final_state[-1].unsqueeze(2)
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)
        soft_attn_weights = F.softmax(attn_weights, dim=1)
        new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return new_hidden_state, soft_attn_weights

    def forward(self, x):
        """Return ``(log_probabilities, attention_weights)`` for a batch-first input ``x``."""
        output, (final_hidden_state, final_cell_state) = self.encoder(x, None)

        attn_output, soft_attn_weights = self.attention_net(output, final_hidden_state)

        attn_output = self.dropout(attn_output)
        fc_output = self.fc1(attn_output)
        output = self.softmax(fc_output) # log-probabilities over the classes
        return output, soft_attn_weights

# build the network once; move it to the GPU only when requested
lstmattn = AttentionLSTM(embedding_dim, hidden_dim,num_layers,output_size,dropout)
if cuda_gpu:
    lstmattn = lstmattn.cuda()

# restore the trained weights and switch to inference mode so that dropout is
# disabled whatever value `dropout` is configured with
lstmattn.load_state_dict(torch.load(load_model_path, map_location='cpu'))
lstmattn.eval()

# NOTE: pickle.load can execute arbitrary code; only load tokenizer files from a trusted source
with open('./output/lstmattn_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
# using google news w2v as word embedding model
embedding_matrix = load_w2v(w2v_file, w2v_bi, vocabulary_size, embedding_dim,tokenizer) # use the google w2v vectors as the embedding layer
# run the test function and collect accuracy, metrics, attention weights and labels
acc,output_dic,X,token,all_weights,all_true_label,pred_labels = test_with_w2v(test_set,test_label,data_size,vocabulary_size,sequence_length,load_model,tokenizer,batch_size,embedding_dim,embedding_matrix,lstmattn,batch_first)
print("accuracy",acc)
# store results
with open('./output/test_output_lstmattn.json', 'w') as outfile2:
    json.dump(output_dic, outfile2)

100
200
300




accuracy 86.66666666666667


In [66]:
# Map the padded index sequence of the second test article (row 1 of X) back to words.
# The index -> word lookup is built here so the cell does not depend on a
# variable that is only defined in a later cell.
index_word = {v: k for k, v in tokenizer.word_index.items()}
seq = X[1].tolist()
word = ["{PAD}" if item == 0 else index_word[item] for item in seq]
print(word)

['details', 'national', 'missing', 'unidentified', 'persons', 'system', 'hard', 'use', 'meanwhile', 'local', 'officials', 'often', 'strapped', 'funds', 'run', 'autopsies', 'dna', 'tests', 'faces', 'bureaucratic', 'hurdles', 'getting', 'dna', 'samples', 'family', 'members', 'back', 'home', 'countries', 'missing', 'missing', 'person', 'reports', 'filed', 'foreign', 'consulates', 'often', 'make', 'system', 'date', 'lab', 'identified', 'three', 'four', 'brooks', 'county', 'location', 'series', 'mass', 'graves', 'told', 'remains', 'people', 'recovered', 'talking', 'people', 'voice', 'alive', 'even', 'less', 'dead', 'baker', 'said', 'reel', 'took', 'idea', 'publisher', 'could', 'observer', 'fundraise', 'build', 'searchable', 'tool', 'would', 'allow', 'families', 'view', 'items', 'found', 'remains', 'make', 'connection', 'missing', 'loved', 'ones', 'rosary', 'among', 'belongings', 'held', 'baylor', 'university', 'laboratory', 'forensic', 'anthropologists', 'seeking', 'identify', 'migrants', '

In [72]:
# Example of tokens: `token` is the raw token output returned by test_with_w2v
# (the word_tokenize result collected in clean_text, punctuation included).
print(token)

['a', 'nixonian', 'end', 'to', 'the', '2016', 'sleaze', 'sweepstakes', '?', 'opinion', 'modal', 'trigger', 'hillary', 'clinton', 'as', 'the', 'presidential', 'campaigns', 'sink', 'to', 'the', 'challenge', 'of', 'demonstrating', 'that', 'there', 'is', 'no', 'such', 'thing', 'as', 'rock', 'bottom', ',', 'remember', 'this', ':', 'when', 'the', 'clintons', 'decamped', 'from', 'washington', 'in', 'january', '2001', ',', 'they', 'took', 'some', 'white', 'house', 'furnishings', 'that', 'were', 'public', 'property', '.', 'they', 'also', 'finished', 'accepting', 'more', 'than', '$', '190,000', 'in', 'gifts', ',', 'including', 'two', 'coffee', 'tables', 'and', 'two', 'chairs', ',', 'a', '$', '7,375', 'gratuity', 'from', 'denise', 'rich', ',', 'whose', 'fugitive', 'former', 'husband', 'had', 'been', 'pardoned', 'in', 'president', 'clinton', 'own', 'final', 'hours', '.', 'a', 'washington', 'post', 'editorial', '(', '“', 'count', 'the', 'spoons', '”', ')', 'identified', '“', 'the', 'clintons', "'",

In [14]:
# visualize attention weights on texts by html
import html

index_word = {v: k for k, v in tokenizer.word_index.items()}
output_path = "visualization_manual.html"
# true label -> (display name, RGB colour of the highlight)
label_styles = {
    1: ("Hyperpartisan", "255, 0, 0"),
    0: ("Non-hyperpartisan", "0, 0, 255"),
}
# only articles that have tokens, weights and both labels can be rendered
n_articles = min(len(X), len(token), len(all_weights), len(all_true_label), len(pred_labels))

with open(output_path, "w", encoding="utf-8") as html_file:
    # declare the encoding so browsers render non-ASCII tokens correctly
    html_file.write('<meta charset="utf-8">\n')
    for i in range(n_articles):
        html_file.write('<br><br><br>')
        label = all_true_label[i]
        pre = pred_labels[i]
        if label not in label_styles:
            continue
        label_name, rgb = label_styles[label]

        # words the model actually saw, in sequence order (padding marked)
        word = ["{PAD}" if item == 0 else index_word[item] for item in X[i].tolist()]

        # scale the weights so the strongest position is fully opaque
        weight = all_weights[i].detach().cpu().numpy()
        weight = weight / weight.max()

        html_file.write('<p>Label is %s, Predict as %s</p>' % (label_name, pre))
        match_ind = 0
        for w in token[i]:
            # Align the raw token with the next occurrence of the same word in
            # the model input. Searching from just after the previous match
            # lets repeated words map to successive positions rather than
            # always to the first one.
            try:
                hit = word.index(w, match_ind)
            except ValueError:
                alpha = 0.0  # token was filtered out or truncated: no highlight
            else:
                alpha = weight[hit]
                match_ind = hit + 1
            # escape the token so characters such as < or & cannot break the markup
            html_file.write('<font style="background: rgba(%s, %f)">%s</font>\n' % (rgb, alpha, html.escape(w)))
print('\nOpen %s to checkout the attention coefficients visualization.' % output_path)


Open visualization.html to checkout the attention coefficients visualization.
