In [1]:
import os
import time
import pickle
import numpy as np
from collections import Counter

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

%matplotlib inline

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
def rougeScores(genSummary, refSummary):
    genTotal, refTotal, intersection = 0., 0., 0.
    for token in list(set(list(refSummary.keys()) + list(genSummary.keys()) )):
        intersection += min(refSummary[token], genSummary[token])
        refTotal += refSummary[token]
        genTotal += genSummary[token]

    recall = intersection / refTotal if refTotal > 0. else 0.
    prec   = intersection / genTotal if genTotal > 0. else 0.
    f1 = (2. * recall * prec) / (recall + prec) if (recall + prec) > 0. else 0.
    
    return recall, prec, f1

In [19]:
def buildPredSummary(df, summary, sentence_emb, curr_pred_emb, action, select_index, sent_index):
    if action.select(1, select_index).tolist()[0] == 1:
        sum_len =  curr_pred_emb.size()[0]
        summary = summary + ' ' + df['sentence'][sent_index]
        curr_pred_emb = torch.cat([curr_pred_emb[1:sum_len, :], sentence_emb], 0)
        return curr_pred_emb, summary
    
    else:
        return curr_pred_emb, summary

In [26]:
#sdf = pd.read_csv('/home/francisco/GitHub/DQN-Event-Summarization/data/sif/train_000_0.csv')
#sdf = pd.read_csv('/Users/franciscojavierarceo/GitHub/DQN-Event-Summarization/data/sif/train_000_0.csv')
#true_summary = sdf['summary'][0]
#ts_tokenized = Counter(true_summary.split(" "))

res = pickle.load(open('/home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_0.pkl', 'rb'))
sdf = res['data']
true_summary = res['embeddings']['true_summary']
ts_tokenized = Counter(true_summary.split(" "))
nsentences = sdf.index.max()

# Initializing stuff
SKIP = 0
SELECT = 1

nepochs = 500
lrate = 0.01
momentum_rate = 0.8

outputdim = 2
batchsize = 1
sent_size = 1
sif_emb_d = 300
nhiddenlayers = 1
nhiddenunits = 200
predsummary_size = 2

rand_rate = 1.0
decay_rate = 0.05

criterion = nn.MSELoss()

lossf = {
    'loss': [], 
    'sent_idx': [], 
    'epoch': [], 
    'action': [], 
    'f1': [], 
    'recall': [],
    'precision': [],
    'optpred': [], 
    'rouge_delta':[]
}

In [108]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [109]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()

with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

tensor([[-1.0658, -0.7839, -1.6148],
        [-1.1096, -0.7908, -1.5287],
        [-1.1140, -0.8332, -1.4392],
        [-1.1054, -0.8327, -1.4522],
        [-1.1459, -0.7619, -1.5357]])


In [198]:
inputsv2 = torch.tensor(np.random.rand(6,), dtype=torch.float)
temp_lstm = nn.LSTM(6, 2)

o, h = temp_lstm(inputsv2.view(1, 1, -1))

o

tensor([[[0.0445, 0.2234]]], grad_fn=<CatBackward>)

In [230]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_dim), torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return embeds, tag_space, tag_scores

class LSTMTaggerv2(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTaggerv2, self).__init__()
        self.hidden_dim = hidden_dim
        #self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hiddenOther = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim), torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        #embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(sentence.view(1, 1, -1), self.hidden)
        #lstm_out, self.hidden = self.lstm(sentence.view(len(sentence), 1, -1), self.hidden)
        #tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
modelv2 = LSTMTaggerv2(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))

modelv2(inputsv2)

tensor([[[0., 0., 0.]]], grad_fn=<LogSoftmaxBackward>)

In [231]:
inputs_len6 = torch.tensor([0, 1, 2, 3, 4, 5], dtype=torch.long)

In [232]:
model(inputs_len6)[0].shape

torch.Size([6, 6])

In [233]:
embeds_tmp, tag_space_tmp, tag_scores_tmp = model(inputs)

In [222]:
embeds_tmp.shape, tag_space_tmp.shape, tag_scores_tmp.shape

(torch.Size([5, 6]), torch.Size([5, 3]), torch.Size([5, 3]))

In [223]:
inputs

tensor([0, 1, 2, 3, 4])

In [234]:
# 5 records on the bottom
embeds_tmp, embeds_tmp.shape

(tensor([[-0.3333, -1.0985,  0.0903, -1.6879,  0.7927,  2.2558],
         [ 0.6505,  0.2082,  0.4561,  0.7229,  0.1207,  0.0498],
         [ 0.2246, -0.9083,  0.2103, -1.3785, -1.9157,  0.2116],
         [-0.9674,  1.5726,  0.7261,  1.0171,  0.8188,  0.5618],
         [ 0.7490,  0.9222, -0.2654, -1.6319, -0.9896, -0.2716]],
        grad_fn=<EmbeddingBackward>), torch.Size([5, 6]))

In [214]:
tag_space_tmp

tensor([[-0.1110, -0.3197,  0.2363],
        [-0.1899, -0.2857,  0.2853],
        [-0.2019, -0.2547,  0.3657],
        [-0.1311, -0.2109,  0.2809],
        [-0.1281, -0.2692,  0.2805]], grad_fn=<ThAddmmBackward>)

In [215]:
tag_scores_tmp

tensor([[-1.1715, -1.3802, -0.8242],
        [-1.2576, -1.3533, -0.7824],
        [-1.3117, -1.3645, -0.7442],
        [-1.2335, -1.3133, -0.8214],
        [-1.2158, -1.3569, -0.8072]], grad_fn=<LogSoftmaxBackward>)

In [204]:
temp = model(inputs)

In [225]:
temp[0].shape, temp[1].shape, temp[2].shape

(torch.Size([5, 6]), torch.Size([5, 3]), torch.Size([5, 3]))

In [226]:
class simpleLSTM(nn.Module):  # inheriting from nn.Module!
    # calls the init function of nn.Module.  Dont get confused by syntax, always do it in an nn.Module
    def __init__(self, input_size, outputsize, nunits, nlayers, sent_size, batch_size):        
        super(simpleLSTM, self).__init__()
        self.input_size = input_size
        self.nunits = nunits
        self.sent_size = sent_size
        self.batch_size = batch_size
        self.lstm0 = nn.LSTM(input_size, nunits)
        self.outputlayer = nn.Linear(nunits, outputsize)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (autograd.Variable(torch.zeros(1, self.batch_size, self.nunits)))

    def forward(self, sent_embed):
        lstm_out, self.hidden = self.lstm0(sent_embed.view(self.batch_size, self.input_size, -1), self.hidden)
        #summary_hl = self.linearlayer1(lstm_out0[-1].view(self.sent_size, -1))
        return lstm_out #self.outputlayer(summary_hl)
        #return self.outputlayer(summary_hl)

model = simpleLSTM(input_size=sif_emb_d, outputsize=2, nunits=nhiddenunits,
                      nlayers=nhiddenlayers, sent_size=(nsentences+1), batch_size=batchsize)

# swap to ADAM
optimizer = optim.Adam(model.parameters(), lr=lrate)

In [227]:
predsummary_size

2

In [228]:
autograd.Variable(sent_emb).shape

torch.Size([122, 300])

In [229]:
rouge_preds = model(autograd.Variable(sent_emb))

RuntimeError: input.size(-1) must be equal to input_size. Expected 300, got 122

In [42]:
class lstmRegressor(nn.Module):  # inheriting from nn.Module!
    # calls the init function of nn.Module.  Dont get confused by syntax, always do it in an nn.Module
    def __init__(self, input_size, outputsize, nunits, nlayers, sent_size, batch_size):        
        super(lstmRegressor, self).__init__()
        self.input_size = input_size
        self.nunits = nunits
        self.nlayers = nlayers
        self.sent_size = sent_size
        self.batch_size = batch_size
        #self.lstmlayer = nn.LSTM(nunits, nunits, num_layers=self.nlayers)
        self.lstm0 = nn.LSTM(input_size, nunits)
        self.linearlayer0 = nn.Linear(input_size, nunits)
        self.linearlayer1 = nn.Linear(nunits, nunits)
        self.dense1_bn = nn.BatchNorm1d(nunits * 3)
        self.outputlayer = nn.Linear(nunits * 2, outputsize)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, self.batch_size, self.nunits)))

    def forward(self, query_embed, sent_embed, summ_embed):
        query_hl = self.linearlayer0(query_embed)
        sentence_hl = self.linearlayer0(sent_embed)
        lstm_out, self.hidden = self.lstm0(sent_embed.view(sent_embed, self.batch_size, -1), self.hidden)
        lstm_out0, self.hidden = self.lstm0(summ_embed.view(self.sent_size, self.batch_size, -1), self.hidden)
        summary_hl = self.linearlayer1(lstm_out0[-1].view(self.sent_size, -1))
        # Concatenates the layers into (nunit * 3, 2)
        catlayer = torch.cat((query_hl, sentence_hl, summary_hl), 1)
        return self.outputlayer(catlayer)
        #return self.outputlayer(self.dense1_bn(catlayer))

model = lstmRegressor(input_size=sif_emb_d, outputsize=2, nunits=nhiddenunits,
                      nlayers=nhiddenlayers, sent_size=predsummary_size, batch_size=batchsize)

# swap to ADAM
optimizer = optim.Adam(model.parameters(), lr=lrate)

rouge_preds = model(
        autograd.Variable(query_emb),
        autograd.Variable(sent_emb),
        autograd.Variable(predsummary_emb)#.view(predsummary_size, batchsize, -1))
)

RuntimeError: Expected hidden[0] size (1, 1, 200), got (1, 200)

# Single Iteration

In [5]:
res['embeddings']['query_embeddings']
res['embeddings']['sentence_embeddings']

dict_keys(['query_embeddings', 'sentence_embeddings', 'true_summary'])

In [8]:
predsummary_size

2

In [20]:
f1_t0 = 0.
curr_summary = ''
predsummary_emb = torch.from_numpy(np.zeros((predsummary_size, sif_emb_d))).float()
sent_index = 0
#for sent_index in range(nsentences):        
model.zero_grad()

# The embeddings start on the 5th column (index 4)
sent_emb = torch.FloatTensor(
    #sdf[sdf.columns[4:]].values[sent_index, :].reshape(1, sif_emb_d)
    res['embeddings']['sentence_embeddings'].values.reshape(nsentences + 1, sif_emb_d)
)

query_emb = torch.FloatTensor(
    res['embeddings']['query_embeddings'].values.reshape(1, sif_emb_d)
)

train_ys = torch.from_numpy(np.asarray([0]).reshape(1, 1)).float()
action = torch.from_numpy(np.asarray([0, 0]).reshape(1, 2)).int()

In [21]:
rouge_preds = model(
        autograd.Variable(query_emb),
        autograd.Variable(sent_emb),
        autograd.Variable(predsummary_emb.view(predsummary_size, batchsize, -1))
)

RuntimeError: Expected hidden[0] size (1, 1, 300), got (2, 1, 300)

In [None]:
qMax, qIndx = rouge_preds.max(dim=1)

if np.random.uniform() < rand_rate and rand_rate > 0:
    # Randomly choosing either 0 or 1 some percent of the time
    qIndx = np.random.randint(0, 2, 1)[0]

action[:, qIndx.data[0]] = 1
action[:, abs(qIndx.data[0] - 1)] = 0

predQonActions = torch.masked_select(rouge_preds, autograd.Variable(action.byte()))
# building the summary and capturing the embedding
# without a history model doesn't make a lot of sense
# not clear what's happening...
# concatenate summary embedding to input or try separate joining layer like before
# Worth looking at rougue of each sentence...calculate f1 for each sentence to find out
# might help figure out what's going on...
predsummary_emb, curr_summary = buildPredSummary(
                                     sdf, 
                                     curr_summary, 
                                     sent_emb, 
                                     predsummary_emb, 
                                     action, 
                                     SELECT, 
                                     sent_index
                                )
recall, prec, f1 = rougeScores(ts_tokenized, Counter(curr_summary.split(" ")))



In [None]:
t0 = time.clock()
for epoch in range(nepochs):
    # reset embeddings and summary at the start of training
    f1_t0 = 0.
    curr_summary = ''
    predsummary_emb = torch.from_numpy(np.zeros((predsummary_size, sif_emb_d))).float()
    
    for sent_index in range(nsentences):        
        model.zero_grad()
        
        # The embeddings start on the 5th column (index 4)
        sent_emb = torch.FloatTensor(
            sdf[sdf.columns[4:]].values[sent_index, :].reshape(1, sif_emb_d)
        )

        train_ys = torch.from_numpy(np.asarray([0]).reshape(1, 1)).float()
        action = torch.from_numpy(np.asarray([0, 0]).reshape(1,2)).int()

        rouge_preds = model(
                autograd.Variable(sent_emb), 
                autograd.Variable(predsummary_emb.view(predsummary_size, batchsize, -1)) 
        )

        qMax, qIndx = rouge_preds.max(dim=1)

        if np.random.uniform() < rand_rate and rand_rate > 0:
            # Randomly choosing either 0 or 1 some percent of the time
            qIndx = np.random.randint(0, 2, 1)[0]

        action[:, qIndx.data[0]] = 1
        action[:, abs(qIndx.data[0] - 1)] = 0
        
        predQonActions = torch.masked_select(rouge_preds, autograd.Variable(action.byte()))
        # building the summary and capturing the embedding
        # without a history model doesn't make a lot of sense
        # not clear what's happening...
        # concatenate summary embedding to input or try separate joining layer like before
        # Worth looking at rougue of each sentence...calculate f1 for each sentence to find out
        # might help figure out what's going on...
        predsummary_emb, curr_summary = buildPredSummary(
                                             sdf, 
                                             curr_summary, 
                                             sent_emb, 
                                             predsummary_emb, 
                                             action, 
                                             SELECT, 
                                             sent_index
                                        )
        recall, prec, f1 = rougeScores(ts_tokenized, Counter(curr_summary.split(" ")))
        
        # Backward part
        rouge_delta = f1 - f1_t0
        train_ys[0] = rouge_delta
        loss = criterion(predQonActions, autograd.Variable(train_ys))
        
        loss.backward()
        optimizer.step()
        
        lossf['loss'].append(loss.data[0])
        lossf['sent_idx'].append(sent_index)
        lossf['epoch'].append(epoch)
        lossf['action'].append(qIndx.data[0])
        lossf['f1'].append(f1)
        lossf['recall'].append(recall)
        lossf['precision'].append(prec)
        lossf['optpred'].append(predQonActions.data[0])
        lossf['rouge_delta'].append(rouge_delta)

        # Storing last round
        f1_t0 = f1
        
    if rand_rate > 0:
        rand_rate -= decay_rate
        
    if (epoch % 100) == 0:
        print('epoch %i; f1 = %.3f' % (epoch, f1))

print("training complete...runtime = %.2f minutes" %  ( (time.clock() - t0)/(60. * 60)) ) 

# Making the performance data a dataframe
perf = pd.DataFrame(lossf)

In [None]:
perf[perf['sent_idx'] == perf['sent_idx'].max()].plot(
    x='epoch', y='loss', c='red',
    grid=True,
    figsize=(12, 6),
    title='Loss value across training'
)
plt.show()

In [None]:
perf[perf['sent_idx'] == perf['sent_idx'].max()].plot(
    x='epoch', y=['f1', 'precision', 'recall'],
    grid=True,
    figsize=(12, 6),
    ylim=[0, 1],
    title='f1-score across training'
)
plt.show()

In [None]:
lead3 = ' '.join(sdf['sentence'][0:3])

finalsummary = rougeScores(ts_tokenized, Counter(curr_summary.split(" ")))
baseline = rougeScores(ts_tokenized, Counter(lead3.split(" ")))

print("lead-3  recall = %.3f; precision = %.3f; f1-score = %.3f " % (baseline[0], baseline[1], baseline[2]))

print("learned recall = %.3f; precision = %.3f; f1-score = %.3f " % (finalsummary[0], finalsummary[1], finalsummary[2]))

In [None]:
lead3

In [None]:
true_summary

In [None]:
curr_summary

In [None]:
sdf.head()

# END