In [1]:
import random
import torch
import numpy as np
from collections import defaultdict

In [2]:
## Prepare data

class VocabEntry(object):
    """Two-way mapping between symbols (words or characters) and integer ids.

    When built from scratch, ids 0-3 are reserved for the special symbols
    ``<pad>``, ``<s>``, ``</s>`` and ``<unk>``. Looking a symbol up with
    ``vocab[symbol]`` never raises: unknown symbols map to the ``<unk>`` id.
    """
    def __init__(self, word2id=None):
        """
        Args:
            word2id: optional existing symbol -> id mapping; it must contain
                the key '<unk>'. When omitted (or empty) a fresh vocabulary
                holding only the four special symbols is created.
        """
        super(VocabEntry, self).__init__()

        if word2id:
            self.word2id = word2id
            self.unk_id = word2id['<unk>']
        else:
            self.word2id = dict()
            self.unk_id = 3
            self.word2id['<pad>'] = 0
            self.word2id['<s>'] = 1
            self.word2id['</s>'] = 2
            self.word2id['<unk>'] = self.unk_id

        # reverse table (id -> symbol); kept in sync by `add`
        self.id2word_ = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        """Return the id of `word`, or the `<unk>` id when it is not known."""
        # `.get` is used on purpose: it never triggers a default factory if
        # `word2id` happens to be a defaultdict, so lookups cannot grow the vocab.
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """True when `word` already has an id."""
        return word in self.word2id

    def __len__(self):
        """Number of symbols in the vocabulary, special symbols included."""
        return len(self.word2id)

    def add(self, word):
        """Insert `word` if it is new and return its id.

        New symbols receive the next free id (the current vocabulary size).
        """
        if word in self:
            return self[word]

        wid = self.word2id[word] = len(self)
        # update the reverse *table* (`id2word_`); `id2word` is the accessor method
        self.id2word_[wid] = word
        return wid

    def id2word(self, wid):
        """Return the symbol stored under id `wid` (KeyError if `wid` is unknown)."""
        return self.id2word_[wid]

    def decode_sentence(self, sentence):
        """Map a sequence of ids back to a list of symbols.

        Args:
            sentence: iterable of ids; elements may be scalar tensors / numpy
                scalars (anything exposing ``.item()``) or plain ints.
        Returns:
            list of symbols, one per id.
        """
        decoded_sentence = []
        for wid_t in sentence:
            # tensor / numpy scalars are unwrapped, plain ints are used as they are
            wid = wid_t.item() if hasattr(wid_t, 'item') else wid_t
            decoded_sentence.append(self.id2word_[wid])
        return decoded_sentence


    @staticmethod
    def from_corpus(fname):
        """Build a vocabulary from the whitespace-separated tokens of a text file.

        Args:
            fname: path of the corpus file.
        Returns:
            VocabEntry containing the special symbols plus every token in the
            file, numbered in order of first appearance.
        """
        vocab = VocabEntry()
        with open(fname) as fin:
            for line in fin:
                for word in line.split():
                    vocab.add(word)

        return vocab


In [3]:
class MonoTextData(object):
    """Monolingual corpus encoded as lists of symbol ids, with batching helpers.

    Attributes:
        data: list of sentences, each a list of integer ids (without the
            start / stop symbols, which are added at batching time).
        vocab: VocabEntry used to encode the corpus.
        dropped: number of input lines that were skipped while reading.
        labels: list of label strings when ``label=True``, otherwise None.
    """
    def __init__(self, fname, label=False, max_length=None, vocab=None):
        """
        Args:
            fname: path of the corpus file, one example per line.
            label: when True each line is ``<label>\\t<text>`` and the text is
                split on whitespace; when False only the first tab-separated
                field is used and it is encoded symbol by symbol (characters).
            max_length: if given, examples with more symbols are dropped.
            vocab: existing vocabulary to encode with; a new one is built
                from the file when omitted.
        """
        super(MonoTextData, self).__init__()

        self.data, self.vocab, self.dropped, self.labels = self._read_corpus(fname, label, max_length, vocab)

    def __len__(self):
        """Number of examples kept after filtering."""
        return len(self.data)

    def _read_corpus(self, fname, label, max_length, vocab):
        """Read and encode the corpus file.

        Returns: (data, vocab, dropped, labels)
            data: list of id lists.
            vocab: a VocabEntry (the one passed in, or a newly built one).
            dropped: count of skipped lines (empty, malformed or too long).
            labels: list of labels if `label` is set, else None.
        """
        data = []
        labels = [] if label else None
        dropped = 0
        if not vocab:
            # every unseen symbol receives the next free id on first lookup
            vocab = defaultdict(lambda: len(vocab))
            vocab['<pad>'] = 0
            vocab['<s>'] = 1
            vocab['</s>'] = 2
            vocab['<unk>'] = 3

        with open(fname) as fin:
            for line in fin:
                # strip the line terminator first so that it can never end up
                # as a symbol of the (character-level) vocabulary
                fields = line.rstrip('\n').split('\t')
                if label:
                    if len(fields) < 2:
                        # blank line or missing text column: skip, do not crash
                        dropped += 1
                        continue
                    lb = fields[0]
                    split_line = fields[1].split()
                else:
                    # a plain string: iterating over it below yields characters
                    split_line = fields[0]
                if len(split_line) < 1:
                    dropped += 1
                    continue

                if max_length:
                    if len(split_line) > max_length:
                        dropped += 1
                        continue

                if label:
                    labels.append(lb)
                data.append([vocab[word] for word in split_line])

        if isinstance(vocab, VocabEntry):
            return data, vocab, dropped, labels

        return data, VocabEntry(vocab), dropped, labels

    def _to_tensor(self, batch_data, batch_first, device):
        """pad a list of sequences, and transform them to tensors
        Args:
            batch_data: a batch of sentences (list) that are composed of
                word ids.
            batch_first: If true, the returned tensor shape is
                (batch, seq_len), otherwise (seq_len, batch)
            device: torch.device
        Returns: Tensor, Int list
            Tensor: Tensor of the batch data after padding
            Int list: a list of integers representing the length
                of each sentence (including start and stop symbols)
        """
        # the special ids do not change inside the loops below: look them up once
        bos_id = self.vocab['<s>']
        eos_id = self.vocab['</s>']
        pad_id = self.vocab['<pad>']

        # pad stop symbol
        batch_data = [sent + [eos_id] for sent in batch_data]

        sents_len = [len(sent) for sent in batch_data]

        max_len = max(sents_len)

        batch_size = len(sents_len)

        # rows are time steps; the first row is the start symbol for every sentence
        sents_new = [[bos_id] * batch_size]
        for i in range(max_len):
            sents_new.append([sent[i] if len(sent) > i else pad_id
                              for sent in batch_data])

        sents_ts = torch.tensor(sents_new, dtype=torch.long,
                                requires_grad=False, device=device)

        if batch_first:
            sents_ts = sents_ts.permute(1, 0).contiguous()

        # +1 accounts for the start symbol prepended above
        return sents_ts, [length + 1 for length in sents_len]


    def data_iter(self, batch_size, device, batch_first=False, shuffle=True):
        """pad data with start and stop symbol, and pad to the same length
        Yields:
            batch_data: LongTensor with shape (seq_len, batch_size), or
                        (batch_size, seq_len) when batch_first is set
            sents_len: list of data length, this is the data length
                       after counting start and stop symbols
        """
        index_arr = np.arange(len(self.data))

        if shuffle:
            np.random.shuffle(index_arr)

        # ceiling division, so the final partial batch is yielded as well
        batch_num = (len(index_arr) + batch_size - 1) // batch_size
        for i in range(batch_num):
            batch_ids = index_arr[i * batch_size : (i+1) * batch_size]
            batch_data = [self.data[index] for index in batch_ids]

            # longest sentence first inside each batch (the order expected by
            # packed-sequence utilities when lengths vary)
            batch_data.sort(key=lambda e: -len(e))

            batch_data, sents_len = self._to_tensor(batch_data, batch_first, device)

            yield batch_data, sents_len



    def create_data_batch(self, batch_size, device, batch_first=False):
        """pad data with start and stop symbol, batching is performed w.r.t.
        the sentence length, so that each returned batch has the same length,
        no further pack sequence function (e.g. pad_packed_sequence) is required
        Returns: List
            List: a list of batched data, each element is a tensor with shape
                (seq_len, batch_size), or (batch_size, seq_len) when
                batch_first is set
        """
        sents_len = np.array([len(sent) for sent in self.data])
        # stable sort: equal-length sentences keep their corpus order, which
        # makes the batch composition reproducible
        sort_idx = np.argsort(sents_len, kind='stable')
        sort_len = sents_len[sort_idx]

        # record the locations where length changes
        change_loc = []
        for i in range(1, len(sort_len)):
            if sort_len[i] != sort_len[i-1]:
                change_loc.append(i)
        change_loc.append(len(sort_len))

        batch_data_list = []
        total = 0
        curr = 0
        for idx in change_loc:
            # never cross `idx`, so a batch only holds sentences of one length
            while curr < idx:
                stop = min(curr + batch_size, idx)
                batch_data = [self.data[sort_idx[id_]] for id_ in range(curr, stop)]
                curr = stop
                batch_data, batch_lens = self._to_tensor(batch_data, batch_first, device)
                batch_data_list.append(batch_data)

                # count sentences through the length list: the tensor's first
                # dimension is the batch only when batch_first is set
                total += len(batch_lens)
                assert(batch_lens == ([batch_lens[0]] * len(batch_lens)))

        assert(total == len(self.data))
        return batch_data_list

In [4]:
# Encode the training file; because no vocabulary is passed in, one is built
# from this file and then shared through `vocab`.
train_data = MonoTextData(fname='dataset/turkish-train-low.txt')
vocab = train_data.vocab
#val_data = MonoTextData('turkish-dev', vocab=vocab)
#tst_data = MonoTextData('turkish-test', vocab=vocab)

In [14]:
token = vocab.word2id  # symbol -> id mapping (character vocab)
# `word2id` is a plain mapping and has no `id2word` attribute; the reverse
# (id -> symbol) table is stored on the VocabEntry itself as `id2word_`.
vocab.id2word_

In [6]:
# Show the symbol -> id table as a plain dict (drops the defaultdict wrapper from the repr)
{symbol: wid for symbol, wid in token.items()}

{'<pad>': 0,
 '<s>': 1,
 '</s>': 2,
 '<unk>': 3,
 'o': 4,
 't': 5,
 'u': 6,
 'z': 7,
 ' ': 8,
 'b': 9,
 'i': 10,
 'r': 11,
 'c': 12,
 'h': 13,
 'e': 14,
 'y': 15,
 'k': 16,
 'ö': 17,
 'ü': 18,
 'l': 19,
 'ş': 20,
 'm': 21,
 'a': 22,
 'n': 23,
 'ı': 24,
 'd': 25,
 's': 26,
 'ğ': 27,
 'p': 28,
 'ç': 29,
 'v': 30,
 'g': 31,
 'f': 32,
 'L': 33,
 'j': 34}

In [7]:
# Use the GPU when one is visible, otherwise fall back to the CPU so that the
# notebook still runs end to end on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Pre-build length-homogeneous batches, each of shape (batch, seq_len).
train_data_batch = train_data.create_data_batch(batch_size=32, device=device, batch_first=True)

In [None]:
## Create model (with transformers)
# TODO: placeholder -- no model is constructed yet; `model` stays None until one is defined here.
model = None

In [19]:
## Train
epochs = 50
for epoch in range(epochs):
    # per-epoch accumulators, reset at the start of every epoch
    loss, epoch_acc, epoch_tokens = 0, 0, 0
    # visit the pre-built batches in a fresh random order
    batch_order = np.random.permutation(len(train_data_batch))
    for idx, i in enumerate(batch_order):
        batch_data = train_data_batch[i]
        batch_size, word_len = batch_data.shape
        # calculate model loss with batch_data and update gradients
        print(f"{idx}: batch_data: {batch_data}, batch_size: {batch_size}, word_len: {word_len}")
    # stop after the first epoch (only the batch printout above is exercised for now)
    break

0: batch_data: tensor([[ 1, 28, 14, 23, 10, 26,  2],
        [ 1, 26, 22, 21,  6, 11,  2],
        [ 1, 26,  4, 23, 11, 22,  2],
        [ 1, 30, 22, 19, 10,  7,  2],
        [ 1, 29, 24, 16,  5, 24,  2],
        [ 1, 17, 28, 21, 14, 16,  2],
        [ 1, 16, 17, 28, 18, 16,  2],
        [ 1,  7, 10,  9, 10, 19,  2],
        [ 1, 10, 29,  8, 22, 27,  2],
        [ 1,  4,  5,  4, 12,  6,  2],
        [ 1, 22, 19, 29, 22, 16,  2],
        [ 1, 26,  4, 16, 22, 16,  2],
        [ 1,  4, 16, 10, 15, 22,  2],
        [ 1, 25, 22, 19, 22, 16,  2],
        [ 1, 16, 14, 32, 22, 19,  2],
        [ 1, 12, 10, 13, 22,  7,  2],
        [ 1, 16, 14, 32, 22, 19,  2],
        [ 1,  5,  4, 13,  6, 21,  2],
        [ 1, 22, 19,  5, 24, 23,  2],
        [ 1,  9, 22, 13, 29, 14,  2],
        [ 1, 15, 22, 26, 22, 16,  2],
        [ 1, 13, 14, 15, 14,  5,  2],
        [ 1, 16, 22,  9,  6, 16,  2]], device='cuda:0'), batch_size: 23, word_len: 7
1: batch_data: tensor([[ 1, 12, 22, 23, 19, 22, 23, 25, 24, 11, 