In [1]:
import math
from typing import List

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
def pad_sents(sents, pad_token):
    """ Pad list of sentences according to the longest sentence in the batch.

    The input is left untouched: every returned sentence is a newly built list,
    so the caller's sentences are never lengthened as a side effect.

    @param sents (list[list[str]]): list of sentences, where each sentence
                                    is represented as a list of words
    @param pad_token (str): padding token
    @returns sents_padded (list[list[str]]): list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token (appended at
        the end), such that each sentence in the batch now has equal length.
        An empty batch yields an empty list.
    """
    sents_padded = []

    ### YOUR CODE HERE (~6 Lines)

    # `default=0` keeps an empty batch from raising ValueError inside max().
    max_len = max((len(sent) for sent in sents), default=0)

    for sent in sents:
        # Build a fresh list instead of appending to `sent`; appending would
        # modify the caller's data and make repeated calls order-dependent.
        n_missing = max_len - len(sent)
        sents_padded.append(list(sent) + [pad_token] * n_missing)

    ### END YOUR CODE
    return sents_padded

Going to build a test for this: 

- if we have two sentences, one of length 3 and one of length 4, then what is expected output? 

    - "my test sentence"
    - "my second test sentence"

- expected output: 

    - "my test sentence <pad_token>" (does it matter whether the padding goes at the end or the beginning? The test below expects it to be appended at the end.)
    - "my second test sentence"

In [4]:
pad_token_test = '<PADDING>'

# Ragged input batch: sentence lengths are 3, 4, 5 and 5 tokens.
test_sents = [
    ['my', 'test', 'sentence'],
    ['my', 'second', 'test', 'sentence'],
    ['my', 'longer', 'third', 'test', 'sentence'],
    ['my', 'longer', 'fourth', 'test', 'sentence'],
]

# Expected batch: every sentence right-padded up to the longest length (5).
test_ans = [
    ['my', 'test', 'sentence'] + [pad_token_test] * 2,
    ['my', 'second', 'test', 'sentence'] + [pad_token_test],
    ['my', 'longer', 'third', 'test', 'sentence'],
    ['my', 'longer', 'fourth', 'test', 'sentence'],
]

# Pad the batch and compare against the hand-written expectation.
test_output = pad_sents(test_sents, pad_token_test)

assert test_output == test_ans


In [5]:
# NOTE(review): this is a second definition of `pad_sents` in the same notebook;
# once this cell runs it replaces the earlier one.
def pad_sents(sents, pad_token):
    """ Pad list of sentences according to the longest sentence in the batch.

    Each returned sentence is a new list (`sent + padding`), so the input
    sentences are not modified.

    @param sents (list[list[str]]): list of sentences, where each sentence
                                    is represented as a list of words
    @param pad_token (str): padding token
    @returns sents_padded (list[list[str]]): list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token (appended at
        the end), such that each sentence in the batch now has equal length.
        An empty batch yields an empty list.
    """
    ### YOUR CODE HERE (~6 Lines)

    # `default=0` avoids the ValueError that max() raises on an empty batch.
    longest = max((len(sent) for sent in sents), default=0)
    sents_padded = [sent + [pad_token] * (longest - len(sent)) for sent in sents]
    ### END YOUR CODE

    return sents_padded

In [6]:
# Pad the same test batch again and keep the result under a separate name for comparison.
test_output_2 = pad_sents(sents = test_sents, pad_token = pad_token_test)

In [7]:
# Display the padded batch stored by the first pad_sents call.
test_output

[['my', 'test', 'sentence', '<PADDING>', '<PADDING>'],
 ['my', 'second', 'test', 'sentence', '<PADDING>'],
 ['my', 'longer', 'third', 'test', 'sentence'],
 ['my', 'longer', 'fourth', 'test', 'sentence']]

In [8]:
# Display the padded batch stored by the second pad_sents call.
test_output_2

[['my', 'test', 'sentence', '<PADDING>', '<PADDING>'],
 ['my', 'second', 'test', 'sentence', '<PADDING>'],
 ['my', 'longer', 'third', 'test', 'sentence'],
 ['my', 'longer', 'fourth', 'test', 'sentence']]

In [9]:
# Check that both pad_sents calls produced the same padded batch.
# NOTE(review): this comparison is only meaningful if the first call did not
# modify `test_sents` in place -- verify by padding a fresh copy of the input.
test_output == test_output_2

True

In [2]:
def read_corpus(file_path, source):
    """ Read file, where each sentence is delineated by a newline.

    Each line is stripped of surrounding whitespace and split on single spaces.

    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    @returns data (list[list[str]]): one token list per line of the file; when
        source is "tgt" each list is wrapped in '<s>' ... '</s>' markers
    """
    data = []
    # The context manager guarantees the file handle is closed, even if
    # reading fails part-way through (the bare open() call leaked it).
    with open(file_path) as corpus_file:
        for line in corpus_file:
            sent = line.strip().split(' ')
            # only append <s> and </s> to the target sentence
            if source == 'tgt':
                sent = ['<s>'] + sent + ['</s>']
            data.append(sent)

    return data


def batch_iter(data, batch_size, shuffle=False):
    """ Generate (src_sents, tgt_sents) batches; within each batch the pairs are
    ordered by source-sentence length, longest first.
    @param data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (int): batch size
    @param shuffle (boolean): whether to randomly shuffle the dataset
    """
    total_batches = math.ceil(len(data) / batch_size)
    order = list(range(len(data)))

    # Shuffle the visiting order of the examples; `data` itself is not reordered.
    if shuffle:
        np.random.shuffle(order)

    for batch_no in range(total_batches):
        start = batch_no * batch_size
        batch = [data[pos] for pos in order[start:start + batch_size]]

        # Longest source sentence first; ties keep their incoming order.
        batch.sort(key=lambda pair: len(pair[0]), reverse=True)

        yield [pair[0] for pair in batch], [pair[1] for pair in batch]