In [16]:
import sys
import os
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from tqdm import tqdm
from collections import deque
import torch.optim as optim

In [31]:
def generate_data(batch):
    """Convert a DataFrame of tokenised sentences into model-ready tensors.

    The rows are ordered by decreasing ``counts`` (sentence length) so that
    the batch can be fed to ``pack_padded_sequence``, and every column that
    is padding for *all* rows of the batch is trimmed away.

    The input frame is not modified: sorting is done on a copy.

    Args:
        batch: DataFrame with the columns ``"tokens"`` and ``"mask"``
            (equal-length arrays per row), ``"counts"`` (sort key) and
            ``"label"``.

    Returns:
        Tuple ``(batch_x_, batch_m_, batch_y_)``: int64 token ids, float32
        mask and int64 labels. All three are moved to the GPU when the
        module-level ``args.cuda`` flag is truthy.

    Raises:
        ValueError: if ``batch`` has no rows.
    """
    if len(batch) == 0:
        raise ValueError("generate_data() needs a batch with at least one row")

    # Longest sentence first ("rnn happiness"); sort a copy so the caller's
    # DataFrame keeps its original row order.
    ordered = batch.sort_values("counts", ascending=False)

    x_mask = np.stack(ordered["mask"].tolist(), axis=0)
    x_mat = np.stack(ordered["tokens"].tolist(), axis=0)
    y_vec = np.stack(ordered["label"].tolist(), axis=0)

    # Keep only the columns in which at least one row has a non-zero mask
    # entry; the same column selection is applied to tokens and mask.
    keep_cols = np.any(x_mask != 0, axis=0)
    x_mask = x_mask[:, keep_cols]
    x_mat = x_mat[:, keep_cols]

    batch_x_ = torch.from_numpy(x_mat).to(torch.int64)
    batch_m_ = torch.from_numpy(x_mask).type(torch.FloatTensor)
    batch_y_ = torch.from_numpy(y_vec).to(torch.int64)

    if args.cuda:
        batch_x_ = batch_x_.cuda()
        batch_m_ = batch_m_.cuda()
        batch_y_ = batch_y_.cuda()

    return batch_x_, batch_m_, batch_y_
    
# --- Paths -------------------------------------------------------------
# Pre-trained GloVe vectors (one "word v1 v2 ..." entry per line).
glove_path = os.path.join("..", "datasets", "glove.6B.100d.txt")
# Directory that holds the stsa.binary.* files.
DATA_FOLDER = "../../sentiment_dataset/data/"

# --- DataFrame column names -------------------------------------------
LABEL_COL, TEXT_COL = "label", "sentence"

# --- Tokenisation settings --------------------------------------------
# A word is kept only if its corpus count is strictly greater than this;
# otherwise it is replaced by <UNK>.
COUNT_THRESH = 3
# Length every token/mask row is padded to.
TOKEN_CUTOFF = 70

def generate_tokens_glove(word_vocab, text):
    """Encode one whitespace-tokenised sentence as fixed-length ids + mask.

    A word is mapped to its vocabulary index only when it is in
    ``word_vocab`` and its entry in the module-level ``counts`` dict is
    strictly greater than ``COUNT_THRESH``; every other word (rare or
    unknown) becomes ``word_vocab["<UNK>"]``.

    Sentences longer than ``TOKEN_CUTOFF`` words are truncated so that all
    rows have exactly ``TOKEN_CUTOFF`` entries and can be stacked later.

    Args:
        word_vocab: dict mapping word -> integer index; must contain the
            ``"<PAD>"`` key, and ``"<UNK>"`` whenever a word has to be
            replaced.
        text: sentence string, split on whitespace.

    Returns:
        Tuple ``(token_ids, mask)`` of NumPy arrays of length
        ``TOKEN_CUTOFF``; ``mask`` is 1 for real tokens and 0 for padding.
    """
    # Truncate first: without this a long sentence yields a negative pad
    # length and a row that is longer than every other row.
    words = text.split()[:TOKEN_CUTOFF]

    indexed_text = [
        word_vocab[word]
        if word in word_vocab and counts.get(word, 0) > COUNT_THRESH
        else word_vocab["<UNK>"]
        for word in words
    ]

    pad_length = TOKEN_CUTOFF - len(indexed_text)
    mask = [1] * len(indexed_text) + [0] * pad_length
    indexed_text = indexed_text + [word_vocab["<PAD>"]] * pad_length

    return np.array(indexed_text), np.array(mask)

def get_all_tokens_glove(data):
    """Tokenise every sentence in ``data`` and tabulate the results.

    Each sentence is passed, together with the module-level ``word_vocab``,
    to ``generate_tokens_glove``.

    Args:
        data: iterable of sentence strings.

    Returns:
        DataFrame with one row per sentence and the columns ``"tokens"``
        (id array), ``"mask"`` (mask array) and ``"counts"`` (sum of the
        mask for that sentence).
    """
    encoded = [generate_tokens_glove(word_vocab, sentence) for sentence in data]

    token_rows = [pair[0] for pair in encoded]
    mask_rows = [pair[1] for pair in encoded]
    # Sum of the mask = number of non-padding positions in the row.
    lengths = [np.sum(row_mask) for row_mask in mask_rows]

    return pd.DataFrame({"tokens": token_rows, "mask": mask_rows, "counts": lengths})

def build_vocab(df):
    """Build the word <-> index maps and word frequencies for a corpus.

    Indices 0 and 1 are reserved for ``"<PAD>"`` and ``"<UNK>"``; every
    other word receives the next free index in order of first appearance.

    Args:
        df: DataFrame whose ``TEXT_COL`` column holds whitespace-separated
            sentences.

    Returns:
        Tuple ``(d, reverse_d, counts)``:
            d         -- dict word -> index
            reverse_d -- dict index -> word
            counts    -- dict word -> number of occurrences in ``df``
    """
    d = {"<PAD>": 0, "<UNK>": 1}
    counts = {}
    # Iterate the column directly instead of building a row Series per
    # sentence with df.iloc[i].
    for sentence in df[TEXT_COL]:
        for word in sentence.split():
            if word not in d:
                d[word] = len(d)
            # .get() keeps this safe for the reserved tokens, which are in
            # `d` from the start but have no entry in `counts` yet.
            counts[word] = counts.get(word, 0) + 1
    reverse_d = {v: k for k, v in d.items()}
    return d, reverse_d, counts

def initial_embedding(word_vocab, embedding_size, embedding_path=None):
    """Create the initial embedding matrix for a vocabulary.

    Every row starts as Gaussian noise scaled by 0.1, row 0 (the ``<PAD>``
    slot) is zeroed, and rows of words found in the embedding file are
    overwritten with the vectors stored there.

    Args:
        word_vocab: dict mapping word -> row index.
        embedding_size: number of columns of the matrix.
        embedding_path: optional path to a text file with one
            ``word v1 v2 ... vN`` entry per line (space separated). When
            it is missing or not a file, the matrix stays random.

    Returns:
        float32 NumPy array of shape ``(len(word_vocab), embedding_size)``.

    Raises:
        ValueError: if a vector component of an in-vocabulary word cannot
            be parsed as a float.
    """
    vocab_size = len(word_vocab)

    embeddings = 0.1 * np.random.randn(vocab_size, embedding_size).astype(np.float32)

    # replace the <PAD> embedding by all zero (nothing to zero in an empty vocab)
    if vocab_size > 0:
        embeddings[0, :] = np.zeros(embedding_size, dtype=np.float32)

    if embedding_path and os.path.isfile(embedding_path):
        counter = 0
        # `with` guarantees the file is closed even if parsing fails.
        with open(embedding_path, "r", encoding="utf8") as f:
            for line in f:
                data = line.strip().split(" ")
                word = data[0].strip()
                # Check membership before parsing so floats are only
                # converted for words that are actually needed.
                if word not in word_vocab:
                    continue
                values = data[1:]
                # Skip entries whose width does not match the matrix
                # (e.g. header lines or vectors of another dimensionality)
                # instead of aborting the whole load.
                if len(values) != embedding_size:
                    continue
                embeddings[word_vocab[word], :] = list(map(np.float32, values))
                counter += 1
        print("%d words has been switched."%counter)
    else:
        print("embedding is initialized fully randomly.")

    return embeddings

def load_data(fpath):
    """Read a ``<label> <sentence>`` text file into a DataFrame.

    Each non-blank line is expected to start with a single-digit label in
    column 0, followed by one separator character and then the sentence.
    Blank lines are skipped, and the line terminator is not stored as part
    of the sentence.

    Args:
        fpath: path of the text file to read.

    Returns:
        DataFrame with the columns ``LABEL_COL`` (int) and ``TEXT_COL``
        (str), one row per non-blank line, in file order.

    Raises:
        ValueError: if the first character of a non-blank line is not a
            digit.
    """
    label_start = 0
    sentence_start = 2

    labels = []
    sentences = []
    with open(fpath, 'r') as f:
        for line in f:
            line = line.rstrip("\r\n")
            # A blank (or whitespace-only) line carries no example.
            if not line.strip():
                continue
            labels.append(int(line[label_start]))
            sentences.append(line[sentence_start:])

    return pd.DataFrame.from_dict({LABEL_COL: labels, TEXT_COL: sentences})


# Load the two splits (train first, then test).
df_train, df_test = (
    load_data(os.path.join(DATA_FOLDER, split_file))
    for split_file in ("stsa.binary.train", "stsa.binary.test")
)

# The vocabulary and word counts are built over train + test together.
df_all = pd.concat([df_train, df_test])
word_vocab, reverse_word_vocab, counts = build_vocab(df_all)

# 100-dimensional embedding matrix, seeded from the GloVe file when present.
embeddings = initial_embedding(word_vocab, 100, glove_path)

# Raw inputs and labels for each split.
X_train, y_train = df_train[TEXT_COL], df_train[LABEL_COL]
X_test, y_test = df_test[TEXT_COL], df_test[LABEL_COL]

# Append the "tokens" / "mask" / "counts" columns to each split.
df_train = pd.concat([df_train, get_all_tokens_glove(X_train)], axis=1)
df_test = pd.concat([df_test, get_all_tokens_glove(X_test)], axis=1)


15403 words has been switched.


In [53]:
class Argument():
    """Container for the model hyper-parameters.

    Attributes (defaults in parentheses):
        cell_type ('GRU')    -- RNN cell used by RnnModel, 'GRU' or 'LSTM'
        embedding_dim (100)  -- word-embedding width
        num_labels (2)       -- number of output classes
        hidden_dim (400)     -- encoder output width (both directions)
        mlp_hidden_dim (50)  -- read by ClassifierModule.__init__
        layer_num (1)        -- number of stacked RNN layers
        cuda                 -- True when a CUDA device is available

    Any of these can be overridden by keyword, e.g.
    ``Argument(hidden_dim=200, cuda=False)``.
    """
    def __init__(self, **overrides):
        self.cell_type = 'GRU'
        self.embedding_dim = 100
        self.num_labels = 2
        self.hidden_dim = 400
        # ClassifierModule reads args.mlp_hidden_dim; without this attribute
        # constructing the classifier fails on a fresh kernel.
        self.mlp_hidden_dim = 50
        self.layer_num = 1
        # Only request the GPU when one is actually present.
        self.cuda = torch.cuda.is_available()

        for name, value in overrides.items():
            # Reject misspelled names instead of silently adding attributes.
            if not hasattr(self, name):
                raise AttributeError("unknown argument: %r" % name)
            setattr(self, name, value)

# classes needed for Rationale3Player
class RnnModel(nn.Module):
    """Bidirectional GRU/LSTM encoder over a sequence of word embeddings."""

    def __init__(self, args, input_dim):
        """
        args.hidden_dim -- total output width; each direction uses hidden_dim // 2
        args.layer_num -- number of RNN layers
        args.cell_type -- type of RNN cells, 'GRU' or 'LSTM'
        input_dim -- width of the input embeddings

        Raises ValueError for any other args.cell_type.
        """
        super(RnnModel, self).__init__()

        self.args = args

        rnn_classes = {'GRU': nn.GRU, 'LSTM': nn.LSTM}
        if args.cell_type not in rnn_classes:
            # Fail here rather than with a missing `rnn_layer` attribute
            # on the first forward pass.
            raise ValueError("unsupported cell_type %r, expected 'GRU' or 'LSTM'"
                             % (args.cell_type,))

        self.rnn_layer = rnn_classes[args.cell_type](input_size=input_dim,
                                                     hidden_size=args.hidden_dim//2,
                                                     num_layers=args.layer_num,
                                                     bidirectional=True)

    def forward(self, embeddings, mask=None):
        """
        Inputs:
            embeddings -- sequence of word embeddings, (batch_size, sequence_length, embedding_dim)
            mask -- a float tensor of masks, (batch_size, sequence_length); the row sums
                    are used as sequence lengths and must all be greater than 0
        Outputs:
            hiddens -- sentence embedding tensor, (batch_size, hidden_dim, sequence_length);
                       with a mask, positions past each sequence's length are zero
        """
        embeddings_ = embeddings.transpose(0, 1) #(sequence_length, batch_size, embedding_dim)

        if mask is None:
            hidden, _ = self.rnn_layer(embeddings_) #(sequence_length, batch_size, hidden_dim)
        else:
            seq_lengths = [int(n) for n in torch.sum(mask, dim=1).tolist()]
            # enforce_sorted=False: the batch does not have to arrive sorted
            # by length; the original row order is restored on unpacking.
            packed = torch.nn.utils.rnn.pack_padded_sequence(embeddings_, seq_lengths,
                                                             enforce_sorted=False)
            packed_hidden, _ = self.rnn_layer(packed)
            # total_length pads back to the input length so the output always
            # lines up with `mask`, even if the longest row is shorter.
            hidden, _ = torch.nn.utils.rnn.pad_packed_sequence(
                packed_hidden, total_length=embeddings_.size(0)) #(sequence_length, batch_size, hidden_dim)

        return hidden.permute(1, 2, 0) #(batch_size, hidden_dim, sequence_length)

class ClassifierModule(nn.Module):
    '''
    classifier for both E and E_anti models provided with RNP paper code

    Pipeline: embedding lookup -> element-wise rationale masking ->
    RnnModel encoder -> masked max-pooling over time -> linear predictor.
    '''
    def __init__(self, embeddings, args):
        """
        embeddings -- NumPy array (vocab_size, embedding_dim) used to initialise the embedding layer
        args -- object providing num_labels, hidden_dim, embedding_dim and the fields
                RnnModel reads; mlp_hidden_dim is optional and defaults to 50
        """
        super(ClassifierModule, self).__init__()
        self.args = args
        self.num_labels = args.num_labels
        self.hidden_dim = args.hidden_dim
        # Fall back to 50 so an args object without this field still works.
        self.mlp_hidden_dim = getattr(args, 'mlp_hidden_dim', 50)
        self.input_dim = args.embedding_dim

        self.encoder = RnnModel(self.args, self.input_dim)
        self.predictor = nn.Linear(self.hidden_dim, self.num_labels)

        # Added to excluded positions before max-pooling so they never win.
        self.NEG_INF = -1.0e6

        self.vocab_size, self.embedding_dim = embeddings.shape
        self.embed_layer = nn.Embedding(self.vocab_size, self.embedding_dim)
        # copy_ keeps the layer's own float32 storage: the weights do not
        # alias the caller's NumPy array and a float64 input is cast.
        self.embed_layer.weight.data.copy_(torch.from_numpy(embeddings))
        self.embed_layer.weight.requires_grad = True #TODO try false?

    def forward(self, x, z, mask):
        """
        Inputs:
            x -- token ids, (batch_size, length)
            z -- rationale (batch_size, length)
            mask -- torch tensor in shape of (batch_size, length)
        Outputs:
            predict -- (batch_size, num_label)
        """
        word_embeddings = self.embed_layer(x) #(batch_size, length, embedding_dim)

        masked_input = word_embeddings * z.unsqueeze(-1)
        hiddens = self.encoder(masked_input, mask) #(batch_size, hidden_dim, length)

        # Positions where mask * z == 0 are pushed to NEG_INF before the max.
        max_hidden = torch.max(hiddens + (1 - mask * z).unsqueeze(1) * self.NEG_INF, dim=2)[0]

        predict = self.predictor(max_hidden)
        return predict

    def test(self, df_test, batch_size=1000):
        """
        Estimate accuracy on `batch_size` rows sampled with replacement from df_test.
        Puts the module in eval mode and returns the fraction of correct predictions.
        """
        self.eval()
        batch = df_test.sample(batch_size, replace=True)
        batch_x_, batch_m_, batch_y_ = generate_data(batch)
        # All-ones rationale (every token selected), created on the same
        # device as the batch instead of a hard-coded CUDA tensor type.
        z = torch.ones_like(batch_x_, dtype=torch.float)
        # No gradients are needed for evaluation.
        with torch.no_grad():
            _, predict = torch.max(self(batch_x_, z, batch_m_), dim=1)
        accuracy = (predict == batch_y_).sum().item() / batch_size

        return accuracy

    def fit(self, df_train, df_test, num_iters=5000, batch_size=40,
            lr=0.001, momentum=0.9, eval_every=100):
        """
        Train with SGD + cross-entropy on batches sampled with replacement from df_train.

        Inputs:
            df_train, df_test -- DataFrames accepted by generate_data
            num_iters -- number of optimisation steps
            batch_size -- rows sampled per step
            lr, momentum -- SGD settings
            eval_every -- accuracy is sampled and printed every `eval_every` steps
        Outputs:
            (test_acc, train_acc) -- lists of the sampled accuracies
        """
        train_acc = []
        test_acc = []
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(self.parameters(), lr=lr, momentum=momentum)

        for i in range(num_iters):
            # self.test() switches to eval mode, so re-enable train mode each step.
            self.train()
            batch = df_train.sample(batch_size, replace=True)
            batch_x_, batch_m_, batch_y_ = generate_data(batch)

            z = torch.ones_like(batch_x_, dtype=torch.float)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = self(batch_x_, z, batch_m_)
            loss = criterion(outputs, batch_y_)
            loss.backward()
            optimizer.step()

            # print statistics
            if i % eval_every == 0:
                print("")
                print("Iteration ", i)
                test_accuracy = self.test(df_test)
                train_accuracy = self.test(df_train)

                print("Test accuracy: ", test_accuracy)
                print("Train accuracy: ", train_accuracy)
                test_acc.append(test_accuracy)
                train_acc.append(train_accuracy)

        print('Finished Training')
        return test_acc, train_acc

In [54]:
# Build the classifier from the default hyper-parameters and train it.
args = Argument()
cls = ClassifierModule(embeddings, args)
# generate_data() moves batches to the GPU only when args.cuda is set, so the
# model follows the same flag to stay on the same device as its inputs.
if args.cuda:
    cls.cuda()
test_acc, train_acc = cls.fit(df_train, df_test)


Iteration  0
Accuracy:  0.504

Iteration  100
Accuracy:  0.511

Iteration  200
Accuracy:  0.61

Iteration  300
Accuracy:  0.604

Iteration  400
Accuracy:  0.582

Iteration  500
Accuracy:  0.634

Iteration  600
Accuracy:  0.649

Iteration  700
Accuracy:  0.619

Iteration  800
Accuracy:  0.695

Iteration  900
Accuracy:  0.726

Iteration  1000
Accuracy:  0.604

Iteration  1100
Accuracy:  0.706

Iteration  1200
Accuracy:  0.702

Iteration  1300
Accuracy:  0.715

Iteration  1400
Accuracy:  0.699

Iteration  1500
Accuracy:  0.741

Iteration  1600
Accuracy:  0.738

Iteration  1700
Accuracy:  0.755

Iteration  1800
Accuracy:  0.767

Iteration  1900
Accuracy:  0.762

Iteration  2000
Accuracy:  0.775

Iteration  2100
Accuracy:  0.784

Iteration  2200
Accuracy:  0.774

Iteration  2300
Accuracy:  0.769

Iteration  2400
Accuracy:  0.774

Iteration  2500
Accuracy:  0.809

Iteration  2600
Accuracy:  0.779

Iteration  2700
Accuracy:  0.783

Iteration  2800
Accuracy:  0.781

Iteration  2900
Accuracy: 