In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [6]:
import csv
from csv import reader
from sklearn.model_selection import train_test_split

# Load the Albanian summarization dataset
#dataset = load_dataset('csv', data_files={'train': 'train.csv', 'validation': 'validation.csv'})

# Read every data row of the CSV export into memory.
# newline='' lets the csv module handle line breaks embedded in quoted fields;
# the explicit encoding keeps decoding independent of the platform locale
# (assumes the export is UTF-8 -- TODO confirm).
with open('/content/final.csv', 'r', newline='', encoding='utf-8') as read_obj:
    csv_reader = reader(read_obj)
    # Skip the header row; the default avoids StopIteration on an empty file
    next(csv_reader, None)
    # Each remaining row is a list of column strings
    allNews = list(csv_reader)

# Column 4 is used as the article body; columns 2 and 3 are concatenated to
# form the reference summary.
# NOTE(review): the two summary columns are joined without a separator -- confirm this is intended.
articleDataset = [news[4] for news in allNews]
summaryDataset = [news[2] + news[3] for news in allNews]

# Split articles and summaries together so every article stays paired with its
# summary by construction (80/20 train/test, then 80/20 train/validation).
train_article, test_article, train_summary, test_summary = train_test_split(
    articleDataset, summaryDataset, test_size=0.2, random_state=42)
train_article, validation_article, train_summary, validation_summary = train_test_split(
    train_article, train_summary, test_size=0.2, random_state=42)


UTILS

In [7]:
import tensorflow as tf
from tensorflow import keras

class Vocab(object):
    """Two-way lookup table between vocabulary words and integer ids.

    Id 0 is reserved for '<PAD>'. The remaining ids are handed out
    consecutively, in the iteration order of ``tokenizer.index_word``, until
    the table holds ``max_size`` entries.
    """
    def __init__(self,tokenizer,max_size):
        # The padding token always occupies id 0
        self._word_to_id = {'<PAD>': 0}
        self._id_to_word = {0: '<PAD>'}
        self._count = 1 # number of entries stored so far
        for word in tokenizer.index_word.values():
            next_id = self._count
            self._word_to_id[word] = next_id
            self._id_to_word[next_id] = word
            self._count = next_id + 1
            # The size check runs after the insert, so at least one word is added
            if self._count >= max_size:
                break

    def word2id(self, word):
        """Look up the id of `word`; out-of-vocabulary words map to the id of '<UNK>'."""
        key = word if word in self._word_to_id else '<UNK>'
        return self._word_to_id[key]

    def id2word(self, word_id):
        """Look up the word stored under `word_id`; raises ValueError for an unknown id."""
        if word_id in self._id_to_word:
            return self._id_to_word[word_id]
        raise ValueError('Id not found in vocab: %d' % word_id)

    def decode_seq(self,seq):
        """Turn a sequence of ids into a single space-separated string of words."""
        return " ".join(map(self._id_to_word.__getitem__, seq))

    def size(self):
        """Number of entries in the vocabulary, including '<PAD>'."""
        return self._count

def text2seq(text,tokenizer,vocab):
    """Convert a string or list of strings to padded sequences of vocabulary ids

    Every sequence is wrapped in the start ('<s>') and end ('<\\s>') token ids
    and then padded at the end to the length of the longest sequence in this call.

    Args:
    text(string or list of strings): text input
    tokenizer(object): tokenizer object providing texts_to_sequences
    vocab(object): vocabulary object holding the start and end token ids

    Returns:
    seqs_padded(2-D integer array): one row per input text, as produced by pad_sequences

    Raises:
    ValueError: if no text is given
    KeyError: if the start or end token is missing from the vocabulary
    """
    # A bare string would otherwise be iterated (and tokenized) character by character
    if isinstance(text, str):
        text = [text]

    start_id = vocab._word_to_id['<s>']
    # '<\\s>' is the same runtime string as before, written with a valid escape
    end_id = vocab._word_to_id['<\\s>']

    seqs = [[start_id] + seq + [end_id] for seq in tokenizer.texts_to_sequences(text)]
    if not seqs:
        raise ValueError('text2seq received no texts to convert')

    max_len_seq = max(len(s) for s in seqs)
    seqs_padded = keras.preprocessing.sequence.pad_sequences(seqs, maxlen=max_len_seq, padding="post")
    return seqs_padded

def greedy_search(encoder_input,model,vocab,max_len_sum = 30):
    """Generate a summary by always picking the most probable next token.

    Args:
    encoder_input(int tensor: (1,seq_length) ): a single sequence of vocabulary ids
    model(object): object exposing `encoder`, `attention_model` and `decoder`
    vocab(object): vocabulary holding the ids of '<s>' and '<\\s>'
    max_len_sum(int): maximum number of ids in the result, start token included

    Returns:
    summary(list of int): generated ids, beginning with the start token id;
    generation stops after the end token or once max_len_sum ids were produced
    """
    start_id = vocab._word_to_id['<s>']
    # '<\\s>' is the same runtime string as before, written with a valid escape
    end_id = vocab._word_to_id['<\\s>']

    encoder_init_states = [tf.zeros((1, model.encoder.hidden_units)) for _ in range(2)]
    encoder_output, encoder_states = model.encoder(encoder_input,encoder_init_states)
    # The decoder starts from the first of the two encoder states
    decoder_state = encoder_states[0]

    summary = [start_id]
    coverage_vector = tf.zeros((1,encoder_input.shape[1]))
    while summary[-1] != end_id and len(summary) < max_len_sum:
        # Feed the previously emitted token back in as an integer (1,1) tensor
        decoder_input_t = tf.constant([[summary[-1]]], dtype=tf.int64)
        context_vector, attention_weights, coverage_vector = model.attention_model(decoder_state, encoder_output,coverage_vector)
        p_vocab, decoder_state = model.decoder(decoder_input_t,decoder_state,encoder_output,context_vector)
        summary.append(int(tf.argmax(p_vocab,axis=1)[0].numpy()))
    return summary

def beam_search(encoder_input,model,vocab,beam_size=4,n_keep=4,max_len_sum=30):
    """Generate candidate summaries with beam search.

    Args:
    encoder_input(int tensor: (1,seq_length) ): a single sequence of vocabulary ids
    model(object): object exposing `encoder`, `attention_model` and `decoder`
    vocab(object): vocabulary holding the ids of '<s>' and '<\\s>'
    beam_size(int): number of continuations expanded per unfinished candidate
    n_keep(int): number of candidates kept after every step
    max_len_sum(int): the search stops once the longest candidate reaches this length

    Returns:
    candidates(list): entries of the form [score, token_ids, [decoder_state, coverage_vector]],
    sorted best first by score divided by the number of ids; score is the sum of
    the decoder outputs of the chosen tokens
    """
    start_id = vocab._word_to_id['<s>']
    # '<\\s>' is the same runtime string as before, written with a valid escape
    end_id = vocab._word_to_id['<\\s>']

    encoder_init_states = [tf.zeros((1, model.encoder.hidden_units)) for _ in range(2)]
    encoder_output, encoder_states = model.encoder(encoder_input,encoder_init_states)
    decoder_state = encoder_states[0]

    coverage_vector = tf.zeros((1,encoder_input.shape[1]))
    candidates = [[0,[start_id],[decoder_state,coverage_vector]]]
    not_terminated = True
    longest_sum = 0
    while not_terminated and longest_sum<max_len_sum:
        new_candidates = []
        for cand in candidates:
            score, token_ids, (decoder_state, coverage_vector) = cand
            if token_ids[-1] == end_id:
                # Finished candidates are carried over unchanged
                new_candidates.append(cand)
                continue
            decoder_input_t = tf.constant([[token_ids[-1]]], dtype=tf.int64)
            context_vector, attention_weights, coverage_vector = model.attention_model(decoder_state, encoder_output,coverage_vector)
            p_vocab, decoder_state = model.decoder(decoder_input_t,decoder_state,encoder_output,context_vector)
            values,indices = tf.math.top_k(p_vocab,k=beam_size)
            for val,idx in zip(values.numpy()[0],indices.numpy()[0]):
                # Store plain Python numbers so callers never see numpy scalars
                new_candidates.append([score + float(val), token_ids + [int(idx)], [decoder_state, coverage_vector]])
        # Rank by length-normalised score and keep the best n_keep
        candidates = sorted(new_candidates,key=lambda x:x[0]/len(x[1]),reverse=True)[:n_keep]
        not_terminated = any(cand[1][-1] != end_id for cand in candidates)
        longest_sum = max(len(cand[1]) for cand in candidates)

    return candidates

def masked_nll_loss(p_vocab,target):
    """Negate the given values and zero them wherever the target is padding (id 0).

    Args:
    p_vocab: values to negate; the training step passes the log-probability of each target token
    target: target ids, with 0 marking padding

    Returns:
    The element-wise masked loss (no reduction is applied)
    """
    nll = -p_vocab
    is_real_token = tf.math.logical_not(tf.math.equal(target, 0))
    return nll * tf.cast(is_real_token, dtype=nll.dtype)

def coverage_loss(attention_weights,coverage_vector,target):
    """Sum the element-wise minimum of attention and coverage, masked where the target is padding.

    Args:
    attention_weights: rank-3 tensor; it is concatenated with the expanded coverage along axis 2
    coverage_vector: rank-2 tensor; it gets a trailing axis before the concatenation
    target: target ids, with 0 marking padding

    Returns:
    One value per row: the minimum over axis 2 summed over axis 1, zeroed where target is 0
    """
    paired = tf.concat([attention_weights, tf.expand_dims(coverage_vector,axis=2)],axis=2)
    per_example = tf.reduce_sum(tf.reduce_min(paired,axis=2),axis=1)
    is_real_token = tf.math.logical_not(tf.math.equal(target, 0))
    return per_example * tf.cast(is_real_token, dtype=per_example.dtype)

MODELS

In [8]:
import tensorflow as tf
from tensorflow import keras

class Encoder(keras.Model):
    """Embedding layer followed by a bidirectional GRU that encodes the input sequence."""
    def __init__(self, vocab_size, embedding_dim, hidden_units,embedding_matrix):
        super().__init__()

        self.hidden_units = hidden_units
        # Start from the given embedding matrix when one is supplied
        pretrained = {} if embedding_matrix is None else {'weights': [embedding_matrix]}
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim, **pretrained)
        recurrent_layer = keras.layers.GRU(
            hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.bi_gru = keras.layers.Bidirectional(recurrent_layer)

    def call(self,encoder_input,encoder_states):
        """Run the encoder over a batch of id sequences

        Args:
        encoder_input(int tensor: (batch_size,seq_length) ): sequence(s) of vocabulary ids
        encoder_states(list, len=2): initial forward and backward GRU states

        Returns:
        encoder_output(float tensor: (batch_size,seq_length,features) ): per-position encoding;
            features is presumably 2*hidden_units with the default merge of Bidirectional -- confirm
        encoder_states(list, len=2): final forward and backward GRU states
        """
        embedded = self.embedding(encoder_input)
        encoder_output, forward_state, backward_state = self.bi_gru(embedded,initial_state=encoder_states)
        return encoder_output, [forward_state, backward_state]

class BahdanauAttention(keras.Model):
    """Additive attention (Neural Machine Translation by Jointly Learning to Align and Translate)
    with an optional coverage term (Get To The Point: Summarization with Pointer-Generator Networks).

    The coverage vector accumulates the attention given to every encoder position
    over the decoding steps and is fed back into the attention score.
    """
    def __init__(self, hidden_units,is_coverage=False):
        super().__init__()

        self.Wh = keras.layers.Dense(hidden_units) # weight matrix for encoder hidden state
        self.Ws = keras.layers.Dense(hidden_units) # weight matrix for decoder state
        self.V = keras.layers.Dense(1)
        self.coverage = bool(is_coverage)
        # wc maps the scalar coverage of ONE encoder position to a hidden_units vector,
        # so its weights do not depend on the sequence length
        if self.coverage:
            self.wc = keras.layers.Dense(hidden_units, use_bias=False)
        else:
            # Frozen all-zero weights: the coverage term contributes nothing
            self.wc = keras.layers.Dense(hidden_units, use_bias=False, kernel_initializer='zeros')
            self.wc.trainable = False

    def call(self, decoder_state, encoder_output,coverage_vector):
        """Forward pass of attention layer
        Args:
        decoder_state(float tensor: (batch_size,hidden_dim) )
        encoder_output(float tensor: (batch_size,seq_length,encoder_dim) )
        coverage_vector(float tensor: (batch_size,seq_length) ): attention accumulated before this step

        Returns:
        context_vector(float tensor: (batch_size,encoder_dim) ): attention-weighted sum of encoder_output
        attention_weights(float tensor: (batch_size,seq_length,1) ): softmax over the sequence axis
        coverage_vector(float tensor: (batch_size,seq_length) ): input coverage plus this step's
            attention when coverage is enabled, otherwise the input coverage unchanged
        """

        # calculate attention scores
        decoder_state = tf.expand_dims(decoder_state, 1)        # broadcast over positions
        coverage_feature = tf.expand_dims(coverage_vector, 2)   # one scalar per position
        score = self.V(tf.nn.tanh(
                        self.Wh(encoder_output) +
                        self.Ws(decoder_state) +
                        self.wc(coverage_feature)
                        ))

        attention_weights = tf.nn.softmax(score, axis=1)
        if self.coverage:
            # Squeeze only the trailing axis so batches or sequences of length 1 keep their shape
            coverage_vector = coverage_vector + tf.squeeze(attention_weights, axis=2)
        context_vector = tf.reduce_sum(attention_weights * encoder_output, axis=1)

        return context_vector, attention_weights, coverage_vector

class Decoder(keras.Model):
    """GRU decoder followed by two dense layers that yield log-probabilities over the vocabulary"""
    def __init__(self, vocab_size, embedding_dim, hidden_units,embedding_matrix):
        super().__init__()

        self.hidden_units = hidden_units
        # Start from the given embedding matrix when one is supplied
        pretrained = {} if embedding_matrix is None else {'weights': [embedding_matrix]}
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim, **pretrained)
        self.gru = keras.layers.GRU(
            hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.W1 = keras.layers.Dense(hidden_units)
        self.W2 = keras.layers.Dense(vocab_size)


    def call(self, decoder_input, decoder_state, encoder_output,context_vector):
        """Run one decoding step

        Args:
        decoder_input(int tensor: (batch_size,1) ): id of the previous token
        decoder_state(float tensor: (batch_size,hidden_dim) ): initial GRU state for this step
        encoder_output: accepted for interface compatibility; not used in this method
        context_vector(float tensor): attention context, concatenated with the new state on the last axis

        Returns:
        p_vocab(float tensor: (batch_size,vocab_size) ): log-softmax over the vocabulary
        decoder_state(float tensor: (batch_size,hidden_dim) ): GRU state after this step
        """
        embedded = self.embedding(decoder_input)
        # Only the final state is needed; the per-step GRU output is discarded
        _, decoder_state = self.gru(embedded,initial_state=decoder_state)
        features = tf.concat([context_vector,decoder_state], axis=-1)
        features = tf.reshape(features, (-1, features.shape[1]))
        logits = self.W2(self.W1(features))

        return tf.nn.log_softmax(logits), decoder_state

MAIN

In [None]:
import os
import argparse
import tensorflow as tf
from tensorflow import keras
import numpy as np
from tqdm import tqdm

class PointerGenerator:
    """Encoder / attention / decoder summarisation model with its training and evaluation loops.

    Hyper-parameters are hard-coded in __init__. Training data is read from the
    module-level variables train_article, train_summary, validation_article and
    validation_summary.
    """
    def __init__(self):
        #self.datapath = args.DATAPATH
        #python your_script.py --DATAPATH custom_path --batch_size 32 --epochs 20 --edim 512 --hdim 256 --vdim 30000

        self.batch_size = 32
        self.n_epochs = 20
        self.vocab_size = 30000
        self.embedding_dim = 512
        self.hidden_dim = 256
        self.encoder = Encoder(self.vocab_size+2, self.embedding_dim, self.hidden_dim, embedding_matrix=None) #+2 on vocab size due to start and end token
        self.attention_model = BahdanauAttention(self.hidden_dim,is_coverage=True)
        self.decoder = Decoder(self.vocab_size+2, self.embedding_dim, self.hidden_dim, embedding_matrix=None)
        self.optimizer = keras.optimizers.Adam()

    @tf.function
    def train_step(self,encoder_input, decoder_target):
        """Run one teacher-forced optimisation step on a single batch

        Args:
        encoder_input(int tensor: (batch_size,seq_length) ): article ids
        decoder_target(int tensor: (batch_size,target_length) ): summary ids, 0 marks padding

        Returns:
        batch_loss(float scalar): per-sequence loss divided by the non-padded length, averaged over the batch
        """
        loss = tf.zeros(self.batch_size)
        lambda_cov = 1 # weight of the coverage loss relative to the NLL loss
        with tf.GradientTape() as tape:
            encoder_init_states = [tf.zeros((self.batch_size, self.hidden_dim)) for _ in range(2)]
            encoder_output, encoder_states = self.encoder(encoder_input,encoder_init_states)
            decoder_state = encoder_states[0] # alternative interpolate between forward and backward state
            coverage_vector = tf.zeros((self.batch_size,encoder_input.shape[1]))
            for t in range(decoder_target.shape[1]-1):
                decoder_input_t = decoder_target[:,t]
                decoder_target_t = decoder_target[:,t+1]
                # The coverage loss compares this step's attention with the coverage
                # accumulated BEFORE this step; the updated coverage always contains
                # the current attention, which would make the minimum trivial.
                previous_coverage = coverage_vector
                context_vector, attention_weights, coverage_vector = self.attention_model(decoder_state, encoder_output,coverage_vector)
                p_vocab,decoder_state = self.decoder(tf.expand_dims(decoder_input_t,1),decoder_state,encoder_output,context_vector)
                # for each batch entry pick the log-probability of the target word at time t+1
                p_vocab_target = tf.gather(p_vocab, decoder_target_t, batch_dims=1)
                # calculate the loss at each time step t and add to current loss
                loss += masked_nll_loss(p_vocab_target,decoder_target_t) + lambda_cov*coverage_loss(attention_weights,previous_coverage,decoder_target_t)

            # get the non-padded length of each sequence in the batch
            seq_len_mask = tf.cast(tf.math.logical_not(tf.math.equal(decoder_target, 0)),tf.float32)
            batch_seq_len = tf.reduce_sum(seq_len_mask,axis=1)
            batch_loss = tf.reduce_mean(loss/batch_seq_len)

        # The attention layer has trainable weights too and must be optimised with the rest
        variables = (self.encoder.trainable_variables
                     + self.attention_model.trainable_variables
                     + self.decoder.trainable_variables)
        gradients = tape.gradient(batch_loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

        return batch_loss


    def train(self):
        """Build the vocabulary and datasets, train for n_epochs and print sample summaries after every epoch"""
        # load text data
        body_data_train = train_article
        target_data_train = train_summary
        body_data_valid = validation_article
        target_data_valid = validation_summary

        # define vocabulary and tokenizer (the tokenizer is fitted on the training articles only)
        tokenizer = keras.preprocessing.text.Tokenizer(num_words=self.vocab_size, oov_token='<UNK>')
        tokenizer.fit_on_texts(body_data_train)
        tokenizer.index_word[self.vocab_size] = '<s>' # add sentence start token
        tokenizer.index_word[self.vocab_size+1] = '<\\s>' # add sentence end token (same runtime string, valid escape)
        vocab = Vocab(tokenizer,self.vocab_size+2)

        # create datasets
        body_seqs_train = text2seq(body_data_train,tokenizer,vocab)
        target_seqs_train = text2seq(target_data_train,tokenizer,vocab)
        body_seqs_valid = text2seq(body_data_valid,tokenizer,vocab)
        target_seqs_valid = text2seq(target_data_valid,tokenizer,vocab)
        train_dataset = tf.data.Dataset.from_tensor_slices((body_seqs_train,target_seqs_train))
        # drop_remainder keeps every batch at exactly batch_size, which train_step relies on
        train_dataset = train_dataset.shuffle(len(body_seqs_train)).batch(self.batch_size, drop_remainder=True)
        valid_dataset = tf.data.Dataset.from_tensor_slices((body_seqs_valid,target_seqs_valid))
        valid_dataset = valid_dataset.shuffle(len(body_seqs_valid)).batch(1, drop_remainder=True)

        # run one batch through model to initialize parameters
        encoder_input, decoder_target = next(iter(train_dataset))
        encoder_init_states = [tf.zeros((self.batch_size, self.hidden_dim)) for _ in range(2)]
        encoder_output, encoder_states = self.encoder(encoder_input,encoder_init_states)
        decoder_state = encoder_states[0]
        coverage_vector = tf.zeros((self.batch_size,encoder_input.shape[1]))
        decoder_input_t = decoder_target[:,0]
        context_vector, attention_weights, coverage_vector = self.attention_model(decoder_state, encoder_output,coverage_vector)
        p_vocab,decoder_state = self.decoder(tf.expand_dims(decoder_input_t,1),decoder_state,encoder_output,context_vector)

        # training loop
        n_batches = len(body_seqs_train) // self.batch_size
        epoch_loss = keras.metrics.Mean()
        for epoch in range(self.n_epochs):
            epoch_loss.reset_states()

            with tqdm(total=n_batches) as batch_progress:
                batch_progress.set_description(f'Epoch {epoch + 1}')
                for batch, (encoder_input, decoder_target) in enumerate(train_dataset):
                    batch_loss = self.train_step(encoder_input, decoder_target)
                    epoch_loss(batch_loss)

                    # Advance once per batch so the bar reaches its total;
                    # only refresh the displayed loss every 10 batches
                    batch_progress.update(1)
                    if (batch % 10) == 0:
                        batch_progress.set_postfix(Batch=batch, Loss=batch_loss.numpy())

            print(f'Epoch {epoch + 1} mean loss: {float(epoch_loss.result()):.4f}')
            self.eval(vocab, valid_dataset)

    def eval(self,vocab,valid_dataset):
        """Print greedy and beam-search summaries for one validation example next to its target

        Args:
        vocab(object): vocabulary used to decode ids back into words
        valid_dataset(tf.data.Dataset): dataset yielding (encoder_input, decoder_target) batches
        """
        encoder_input, decoder_target = next(iter(valid_dataset))
        encoder_input_sum = tf.expand_dims(encoder_input[0,:],0)
        greedy_summary = greedy_search(encoder_input_sum,self,vocab)
        beam_summaries = beam_search(encoder_input_sum,self,vocab,beam_size=4,n_keep=4)
        # drop the padding ids before decoding the reference summary
        target_summary = [d for d in decoder_target.numpy()[0] if d!=0]

        print("Greedy search:"+vocab.decode_seq(greedy_summary))
        print("Top 4 beam search: \n","\n".join([vocab.decode_seq(summary[1]) for summary in beam_summaries]))
        print("Target:"+vocab.decode_seq(target_summary))



# Disabled command-line interface: the parser below is wrapped in a string
# literal, so it is evaluated as a no-op expression and never runs.
# Its defaults (batch size 16, 10 epochs, edim 256, hdim 128, vdim 20000)
# differ from the values hard-coded in PointerGenerator.__init__.
'''parser = argparse.ArgumentParser()
parser.add_argument('--DATAPATH',default="title-gen-5m-tok",type=str)
parser.add_argument('--batch_size', default=16, type=int)
parser.add_argument('--epochs', default=10, type=int)
parser.add_argument('--edim', default=256, type=int)
parser.add_argument('--hdim', default=128, type=int)
parser.add_argument('--vdim', default=20000, type=int)
args = parser.parse_args()'''

# Build the model and start training as soon as this cell runs
pointgen = PointerGenerator()
pointgen.train()




  0%|          | 0/115 [00:00<?, ?it/s]