In [2]:
import numpy as np
import pandas as pd
import os

from rationale_3players_sentence_classification_models import ClassifierModule, HardRationale3PlayerClassificationModel
from rationale_3players_for_emnlp import HardRationale3PlayerClassificationModelForEmnlp

import torch
from transformers import *
from torch.utils import data
from torch.autograd import Variable

from collections import deque

from tqdm import tqdm

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


## Specify arguments for the model and data processing

In [5]:
# load the data
DATA_FOLDER = os.path.join("../../sentiment_dataset/data/")
LABEL_COL = "label"
TEXT_COL = "sentence"
TO_LOWER = True
MAX_LEN = 150
BATCH_SIZE_PRED = 512
TRAIN_SIZE = 0.6
batch_size = 150
TOKEN_CUTOFF = 75 - 18

class Argument():
    def __init__(self):
        self.model_type = 'RNN'
        self.cell_type = 'GRU'
        self.hidden_dim = 400
        self.embedding_dim = 768
        self.kernel_size = 5
        self.layer_num = 1
        self.fine_tuning = False
        self.z_dim = 2
        self.gumbel_temprature = 0.1
        self.cuda = True
        self.batch_size = 150
        self.mlp_hidden_dim = 50
        self.dropout_rate = 0.4
        self.use_relative_pos = True
        self.max_pos_num = 20
        self.pos_embedding_dim = -1
        self.fixed_classifier = True
        self.fixed_E_anti = True
        self.lambda_sparsity = 1.0
        self.lambda_continuity = 1.0
        self.lambda_anti = 1.0
        self.lambda_pos_reward = 0.1
        self.exploration_rate = 0.05
        self.highlight_percentage = 0.3
        self.highlight_count = 8
        self.count_tokens = 8
        self.count_pieces = 4
        self.lambda_acc_gap = 1.2
        self.label_embedding_dim = 400
        self.game_mode = '3player'
        self.margin = 0.2
#         self.lm_setting = 'single'
        self.lm_setting = 'multiple'
#         self.lambda_lm = 100.0
        self.lambda_lm = 1.0
        self.ngram = 4
        self.with_lm = False
        self.batch_size_ngram_eval = 5
        self.lr=0.001
        self.working_dir = '/dccstor/yum-dbqa/Rationale/structured_rationale/game_model_with_lm/beer_single_working_dir'
        self.model_prefix = 'tmp.%s.highlight%.2f.cont%.2f'%(self.game_mode, 
                                                                             self.highlight_percentage, 
                                                                             self.lambda_continuity)
        self.pre_trained_model_prefix = 'pre_trained_cls.model'

        self.save_path = os.path.join("..", "models")
        self.model_prefix = "sst2rnpmodel"
        self.save_best_model = True
        self.num_labels = 2
        
args = Argument()


# Embedding Layer

We want to use the pre-trained BERT embeddings, which generates embedded word vectors from word tokens.

#### Process for a single sentence
1.) generate_tokens() takes the BERT tokenizer and a sentence and tokenizes this text, to a limit of TOKEN_CUTOFF tokens. If the number of tokens is less than TOKEN_CUTOFF, it pads the tokens with the BERT pad symbol. It also provides a mask that can be used to ignore any pad tokens in classifier models down the road.<br>
2.) embedding_func() takes the tokens from generate_tokens and uses them to make a corresponding embedding.

#### Multiple sentences
get_all_tokens takes a pandas dataframe, and adds columns tokens and mask into that dataframe.

In [1]:
pretrained_weights = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')
# Put the model in "evaluation" mode, meaning feed-forward operation.
if args.cuda:
    model.cuda()
model.eval()

def generate_tokens(tokenizer, text):
    tokenized_text = tokenizer.tokenize(text)
    tokenized_text = tokenized_text[:TOKEN_CUTOFF - 2]
    tokenized_text = ["[CLS]"] + tokenized_text + ["[SEP]"]
    pad_length = TOKEN_CUTOFF - len(tokenized_text)
    mask = [1] * len(tokenized_text) + [0] * pad_length
    
    tokenized_text = tokenized_text + ["[PAD]"] * pad_length
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
    return np.array(indexed_tokens), np.array(mask)
    
def embedding_func(tokens):
    ones_mask = Variable(torch.from_numpy(np.ones((len(tokens), TOKEN_CUTOFF))))
    if args.cuda:
        ones_mask = ones_mask.cuda()
    with torch.no_grad():
        embeddings = model(tokens, ones_mask)[0]
    return embeddings

def get_all_tokens(data):
    l = []
    m = []
    for sentence in data:
        token_list, mask = generate_tokens(tokenizer, sentence)
        l.append(token_list)
        m.append(mask)
    tokens = pd.DataFrame({"tokens": l, "mask": m})
    return tokens


NameError: name 'BertTokenizer' is not defined

In [38]:
glove_path = os.path.join("..", "datasets", "glove.6B.100d.txt")
CNT_THRESH = 10

def generate_tokens_glove(word_vocab, text):
    indexed_text = [word_vocab[word] if (counts[word] > CNT_THRESH) else word_vocab["<UNK>"] for word in text.split()]
    pad_length = TOKEN_CUTOFF - len(indexed_text)
    mask = [1] * len(indexed_text) + [0] * pad_length
    
    indexed_text = indexed_text + [word_vocab["<PAD>"]] * pad_length
    
    return np.array(indexed_text), np.array(mask)

def get_all_tokens_glove(data):
    l = []
    m = []
    for sentence in data:
        token_list, mask = generate_tokens_glove(word_vocab, sentence)
        l.append(token_list)
        m.append(mask)
    tokens = pd.DataFrame({"tokens": l, "mask": m})
    return tokens

def build_vocab(df):
    d = {"<PAD>":0, "<UNK>":1}
    counts = {}
    for i in range(len(df)):
        sentence = df.iloc[i][TEXT_COL]
        for word in sentence.split():
            if word not in d:
                d[word] = len(d)
                counts[word] = 1
            else:
                counts[word] += 1
    return d, counts

def initial_embedding(word_vocab, embedding_size, embedding_path=None): 
    vocab_size = len(word_vocab)
    # initialize a numpy embedding matrix 
    
    embeddings = 0.1*np.random.randn(vocab_size, embedding_size).astype(np.float32)
    
    # replace the <PAD> embedding by all zero
    embeddings[0, :] = np.zeros(embedding_size, dtype=np.float32)

    if embedding_path and os.path.isfile(embedding_path):
        f = open(embedding_path, "r", encoding="utf8")
        counter = 0
        for line in f:
            data = line.strip().split(" ")
            word = data[0].strip()
            embedding = data[1::]
            embedding = list(map(np.float32, embedding))
            if word in word_vocab:
                embeddings[word_vocab[word], :] = embedding
                counter += 1
        f.close()
        print("%d words has been switched."%counter)
    else:
        print("embedding is initialized fully randomly.")

    return embeddings

def load_data(fpath):
    df_dict = {LABEL_COL: [], TEXT_COL: []}
    with open(fpath, 'r') as f:
        label_start = 0
        sentence_start = 2
        for line in f:
            label = int(line[label_start])
            sentence = line[sentence_start:]
            df_dict[LABEL_COL].append(label)
            df_dict[TEXT_COL].append(sentence)
    return pd.DataFrame.from_dict(df_dict)


df_train = load_data(os.path.join(DATA_FOLDER, 'stsa.binary.test'))

# word_vocab, counts = build_vocab(df_test)
# embeddings = initial_embedding(word_vocab, 100, glove_path)


# df_test = load_data(os.path.join(DATA_FOLDER, 'stsa.binary.test'))

# create training and testing labels
y_train = df_train[LABEL_COL]
# y_test = df_test[LABEL_COL]

# create training and testing inputs
X_train = df_train[TEXT_COL]
# X_test = df_test[TEXT_COL]

df_train = pd.concat([df_train, get_all_tokens_glove(X_train)], axis=1)
# df_test = pd.concat([df_test, get_all_tokens(X_test)], axis=1)



In [43]:
df_train

Unnamed: 0,label,sentence,tokens,mask
0,0,"no movement , no yuks , not much of anything .\n","[2, 1, 4, 2, 1, 4, 6, 7, 8, 9, 10, 0, 0, 0, 0,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ..."
1,0,"a gob of drivel so sickly sweet , even the eag...","[11, 1, 8, 1, 14, 1, 1, 4, 17, 18, 1, 1, 8, 1,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,0,"gangs of new york is an unapologetic mess , wh...","[1, 8, 34, 1, 36, 37, 1, 39, 4, 40, 41, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,0,"we never really feel involved with the story ,...","[51, 52, 53, 54, 1, 56, 18, 57, 4, 58, 59, 8, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1,this is one of polanski 's best films .\n,"[65, 36, 66, 8, 1, 22, 68, 69, 10, 0, 0, 0, 0,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
1816,0,"an often-deadly boring , strange reading of a ...","[37, 1, 790, 4, 1, 1, 8, 11, 1, 40, 1, 2532, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1817,0,the problem with concept films is that if the ...,"[18, 1, 56, 1, 69, 36, 44, 120, 18, 1, 36, 11,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1818,0,"safe conduct , however ambitious and well-inte...","[1, 1, 4, 1, 1, 104, 1, 4, 1, 111, 1, 18, 1461...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1819,0,"a film made with as little wit , interest , an...","[11, 163, 1166, 56, 58, 109, 1, 4, 1, 4, 104, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


## Load the Data

We read the data from files that already have it split between test and train sets.


In [9]:
def load_data(fpath):
    df_dict = {LABEL_COL: [], TEXT_COL: []}
    with open(fpath, 'r') as f:
        label_start = 0
        sentence_start = 2
        for line in f:
            label = int(line[label_start])
            sentence = line[sentence_start:]
            df_dict[LABEL_COL].append(label)
            df_dict[TEXT_COL].append(sentence)
    return pd.DataFrame.from_dict(df_dict)

df_train = load_data(os.path.join(DATA_FOLDER, 'stsa.binary.test'))
df_test = load_data(os.path.join(DATA_FOLDER, 'stsa.binary.test'))

# create training and testing labels
y_train = df_train[LABEL_COL]
y_test = df_test[LABEL_COL]

# create training and testing inputs
X_train = df_train[TEXT_COL]
X_test = df_test[TEXT_COL]

df_train = pd.concat([df_train, get_all_tokens(X_train)], axis=1)
df_test = pd.concat([df_test, get_all_tokens(X_test)], axis=1)

In [53]:
df_train["mask"]

0       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...
1       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
2       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
3       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
4       [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...
                              ...                        
1816    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
1817    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
1818    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
1819    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
1820    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
Name: mask, Length: 1821, dtype: object

## Set up the model

In [6]:
args = Argument()

classification_model = HardRationale3PlayerClassificationModelForEmnlp(embedding_func, args)

if args.cuda:
    classification_model.cuda()

classification_model.init_optimizers()
classification_model.init_C_model()

args.fixed_E_anti = False
classification_model.fixed_E_anti = args.fixed_E_anti
args.with_lm = False
args.lambda_lm = 1.0

train_losses = []
train_accs = []
dev_accs = [0.0]
dev_anti_accs = [0.0]
dev_cls_accs = [0.0]
test_accs = [0.0]
test_anti_accs = [0.0]
test_cls_accs = [0.0]
best_dev_acc = 0.0
best_test_acc = 0.0
num_iteration = 100
display_iteration = 1
test_iteration = 1

eval_accs = [0.0]
eval_anti_accs = [0.0]

queue_length = 200
z_history_rewards = deque(maxlen=queue_length)
z_history_rewards.append(0.)

classification_model.init_optimizers()
classification_model.init_rl_optimizers()
classification_model.init_reward_queue()

old_E_anti_weights = classification_model.E_anti_model.predictor._parameters['weight'][0].cpu().data.numpy()



### Utilization Functions

In [7]:
def generate_data(batch):
    x_mat = np.stack(batch["tokens"], axis=0)
    x_mask = np.stack(batch["mask"], axis=0)
    y_vec = np.stack(batch["label"], axis=0)
    
    batch_x_ = Variable(torch.from_numpy(x_mat)).to(torch.int64)
    batch_m_ = Variable(torch.from_numpy(x_mask)).type(torch.FloatTensor)
    batch_y_ = Variable(torch.from_numpy(y_vec)).to(torch.int64)

    if args.cuda:
        batch_x_ = batch_x_.cuda()
        batch_m_ = batch_m_.cuda()
        batch_y_ = batch_y_.cuda()

    return batch_x_, batch_m_, batch_y_

def _get_sparsity(z, mask):
    mask_z = z * mask
    seq_lengths = torch.sum(mask, dim=1)

    sparsity_ratio = torch.sum(mask_z, dim=-1) / seq_lengths #(batch_size,)
#     sparsity_count = torch.sum(mask_z, dim=-1)

    return sparsity_ratio

def _get_continuity(z, mask):
    mask_z = z * mask
    seq_lengths = torch.sum(mask, dim=1)
    
    mask_z_ = torch.cat([mask_z[:, 1:], mask_z[:, -1:]], dim=-1)
        
    continuity_ratio = torch.sum(torch.abs(mask_z - mask_z_), dim=-1) / seq_lengths #(batch_size,) 
#     continuity_count = torch.sum(torch.abs(mask_z - mask_z_), dim=-1)
    
    return continuity_ratio

def display_example(x, m, z):
    seq_len = int(m.sum().item())
    ids = x[:seq_len]
    tokens = tokenizer.convert_ids_to_tokens(ids)
    
    final = ""
    for i in range(1, len(tokens) - 1):
        if z[i]:
            final += "[" + tokens[i] + "]"
        else:
            final += tokens[i]
        final += " "
    print(final)

def test():
    classification_model.eval()
    
    test_size = 200
    
    test_batch = df_test.sample(test_size)
    batch_x_, batch_m_, batch_y_ = generate_data(test_batch)
    predict, anti_predict, z, neg_log_probs = classification_model(batch_x_, batch_m_)
    
    # do a softmax on the predicted class probabilities
    _, y_pred = torch.max(predict, dim=1)
    _, anti_y_pred = torch.max(anti_predict, dim=1)
    
    # calculate sparsity
    print("Test sparsity: ", _get_sparsity(z, batch_m_).sum().item() / batch_size)
    
    accuracy = (y_pred == batch_y_).sum().item() / test_size
    anti_accuracy = (anti_y_pred == batch_y_).sum().item() / test_size
    train_accs.append((accuracy, anti_accuracy))
    print("Test accuracy: ", accuracy, "% Anti-accuracy: ", anti_accuracy)

    # display an example
    print("Gold Label: ", batch_y_[0].item(), " Pred label: ", y_pred[0].item())
    display_example(batch_x_[0], batch_m_[0], z[0])

## Train

In [8]:
test_freq = 50
save_freq = 999

for iteration in range(10000):
    classification_model.train()

    # sample a batch of data
    batch = df_train.sample(batch_size, replace=True)
    batch_x_, batch_m_, batch_y_ = generate_data(batch)

    losses, predict = classification_model.train_cls_one_step(batch_x_, batch_y_, batch_m_)

    # calculate classification accuarcy
    _, y_pred = torch.max(predict, dim=1)

    acc = np.float((y_pred == batch_y_).sum().cpu().data.item()) / args.batch_size
    train_accs.append(acc)
    
    if iteration % save_freq == 0:
        torch.save(classification_model.state_dict(), "iteration_" + str(iteration) + ".pth")
    if iteration % test_freq == 0:
        test()

Test sparsity:  0.3383092498779297
Test accuracy:  0.45 % Anti-accuracy:  0.525
Gold Label:  0  Pred label:  0
it does n ' t do the original [any] particular dish ##ono ##r , but neither does it ex [##ude] [any] [charm] [or] [personality] . 


RuntimeError: The size of tensor a (48) must match the size of tensor b (57) at non-singleton dimension 2

In [14]:
df_test.iloc[0]['mask']
import torch.nn as nn


In [153]:
class RnnModel(nn.Module):

    def __init__(self, args, input_dim):
        """
        args.hidden_dim -- dimension of filters
        args.embedding_dim -- dimension of word embeddings
        args.layer_num -- number of RNN layers   
        args.cell_type -- type of RNN cells, GRU or LSTM
        """
        super(RnnModel, self).__init__()
        
        self.args = args
 
        if args.cell_type == 'GRU':
            self.rnn_layer = nn.GRU(input_size=input_dim, 
                                    hidden_size=args.hidden_dim//2, 
                                    num_layers=args.layer_num, bidirectional=True)
        elif args.cell_type == 'LSTM':
            self.rnn_layer = nn.LSTM(input_size=input_dim, 
                                     hidden_size=args.hidden_dim//2, 
                                     num_layers=args.layer_num, bidirectional=True)
    
    def forward(self, embeddings, mask=None):
        """
        Inputs:
            embeddings -- sequence of word embeddings, (batch_size, sequence_length, embedding_dim)
            mask -- a float tensor of masks, (batch_size, length)
        Outputs:
            hiddens -- sentence embedding tensor, (batch_size, hidden_dim, sequence_length)
        """
        embeddings_ = embeddings.transpose(0, 1) #(sequence_length, batch_size, embedding_dim)
        
        if mask is not None: #TODO change
            seq_lengths = list(torch.sum(mask, dim=1).cpu().data.numpy())
            seq_lengths = list(map(int, seq_lengths))
            inputs_ = torch.nn.utils.rnn.pack_padded_sequence(embeddings_, seq_lengths, enforce_sorted=False)
        else:
            inputs_ = embeddings_
        
        hidden, _ = self.rnn_layer(inputs_) #(sequence_length, batch_size, hidden_dim (* 2 if bidirectional))
        
        if mask is not None: #TODO change
            hidden, _ = torch.nn.utils.rnn.pad_packed_sequence(hidden) #(length, batch_size, hidden_dim)
        
        return hidden.permute(1, 2, 0) #(batch_size, hidden_dim, sequence_length)

class ClassifierModule(nn.Module):
    '''
    classifier for both E and E_anti models
    '''
    def __init__(self, args):
        super(ClassifierModule, self).__init__()
        self.args = args
        
        self.num_labels = args.num_labels
        self.hidden_dim = args.hidden_dim
        self.mlp_hidden_dim = args.mlp_hidden_dim #50
        
        self.input_dim = args.embedding_dim
        
        self.encoder = RnnModel(self.args, self.input_dim)
        self.predictor = nn.Linear(self.hidden_dim, self.num_labels)
        
        self.NEG_INF = -1.0e6
        

    def forward(self, word_embeddings, z, mask):
        """
        Inputs:
            word_embeddings -- torch Variable in shape of (batch_size, length, embed_dim)
            z -- rationale (batch_size, length)
            mask -- torch Variable in shape of (batch_size, length)
        Outputs:
            predict -- (batch_size, num_label)
        """        

        masked_input = word_embeddings * z.unsqueeze(-1)
        hiddens = self.encoder(masked_input, mask)
        
        max_hidden = torch.max(hiddens + (1 - mask * z).unsqueeze(1) * self.NEG_INF, dim=2)[0]
        
        predict = self.predictor(max_hidden)

        return predict
    
class Generator(nn.Module):
    
    def __init__(self, args, input_dim):
        """        
        args.z_dim -- rationale or not, always 2
        args.model_type -- "CNN" or "RNN"

        if CNN:
            args.hidden_dim -- dimension of filters
            args.embedding_dim -- dimension of word embeddings
            args.kernel_size -- kernel size of the conv1d
            args.layer_num -- number of CNN layers        
        if use RNN:
            args.hidden_dim -- dimension of filters
            args.embedding_dim -- dimension of word embeddings
            args.layer_num -- number of RNN layers   
            args.cell_type -- type of RNN cells, "GRU" or "LSTM"
        """
        super(Generator, self).__init__()
        
        self.args = args
        self.z_dim = args.z_dim
        
        if args.model_type == "CNN":
            self.generator_model = CnnModel(args, input_dim)
        elif args.model_type == "RNN":
            self.generator_model = RnnModel(args, input_dim)
        self.output_layer = nn.Linear(args.hidden_dim, self.z_dim)
        
        
    def forward(self, x, mask=None):
        """
        Given input x in shape of (batch_size, sequence_length) generate a 
        "binary" mask as the rationale
        Inputs:
            x -- input sequence of word embeddings, (batch_size, sequence_length, embedding_dim)
        Outputs:
            z -- output rationale, "binary" mask, (batch_size, sequence_length)
        """
        
        #(batch_size, sequence_length, hidden_dim)
        hiddens = self.generator_model(x, mask).transpose(1, 2).contiguous() 
        scores = self.output_layer(hiddens) # (batch_size, sequence_length, 2)

        return scores

import torch.nn.functional as F
    
rnn = RnnModel(args, 768)
rnn.cuda()

gen = Generator(args, 768)
gen.cuda()

cls = ClassifierModule(args)
cls.cuda()

ClassifierModule(
  (encoder): RnnModel(
    (rnn_layer): GRU(768, 200, bidirectional=True)
  )
  (predictor): Linear(in_features=400, out_features=2, bias=True)
)

In [137]:
test_batch = df_test.iloc[0:1]
batch_x_, batch_m_, batch_y_ = generate_data(test_batch)
embeds = embedding_func(batch_x_)
embeds.cuda()

z = Variable(torch.from_numpy(np.array([[ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1,  1]]))).to(torch.int64)
z.cuda()

hiddens = rnn(embeds, batch_m_)

In [97]:
max_hidden = torch.max(hiddens + (1 - batch_m_ * z.cuda()).unsqueeze(1).cuda() * 1e-6, dim=2)[0]

RuntimeError: The size of tensor a (14) must match the size of tensor b (57) at non-singleton dimension 2

In [110]:
torch.max((1 - batch_m_ * z.cuda()).unsqueeze(1).cuda() * 1e-6, dim=2)

torch.return_types.max(
values=tensor([[1.0000e-06]], device='cuda:0'),
indices=tensor([[16]], device='cuda:0'))

In [135]:
embeds * z.unsqueeze(-1).cuda()

RuntimeError: expected device cuda:0 but got device cpu

In [158]:
def _generate_rationales(z_prob_):
    '''
    Input:
        z_prob_ -- (num_rows, length, 2)
    Output:
        z -- (num_rows, length)
    '''        
    z_prob__ = z_prob_.view(-1, 2) # (num_rows * length, 2)

    # sample actions
    sampler = torch.distributions.Categorical(z_prob__)
    if True:
        z_ = sampler.sample() # (num_rows * p_length,)
    else:
        z_ = torch.max(z_prob__, dim=-1)[1]

    #(num_rows, length)
    z = z_.view(z_prob_.size(0), z_prob_.size(1))

    if True == True:
        z = z.type(torch.cuda.FloatTensor)
    else:
        z = z.type(torch.FloatTensor)

    # (num_rows * length,)
    neg_log_probs_ = -sampler.log_prob(z_)
    # (num_rows, length)
    neg_log_probs = neg_log_probs_.view(z_prob_.size(0), z_prob_.size(1))

    return z, neg_log_probs

z_scores_ = gen(embeds, batch_m_)
z_scores_[:, :, 1] = z_scores_[:, :, 1] + (1 - batch_m_).cuda() * -1e6

z_probs_ = F.softmax(z_scores_, dim=-1)

z_probs_temp_ = (batch_m_.unsqueeze(-1).cuda() * ( ((1 - .005) * z_probs_ + .005).cuda() / z_probs_.size(-1) ) )
z_probs_ = z_probs_ + ((1 - batch_m_.unsqueeze(-1)).cuda() * z_probs_temp_)

z, neg_log_probs = _generate_rationales(z_probs_)

In [159]:
z

tensor([[0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1.]],
       device='cuda:0')

In [160]:
cls(embeds, z, batch_m_)

RuntimeError: The size of tensor a (57) must match the size of tensor b (14) at non-singleton dimension 1

In [150]:
batch_m_ = Variable(torch.from_numpy(np.array([[ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1,  1]]))).to(torch.int64)

In [161]:
z.shape, batch_m_.shape

(torch.Size([1, 14]), torch.Size([1, 14]))

In [162]:
embeds.shape

torch.Size([1, 57, 768])