In [4]:
import io
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data import get_tokenizer
import re

def yield_tokens(file_path, tokenizer):
    """Lazily tokenize the usable lines of a UTF-8 text file.

    A line is used only if it contains at least one ASCII letter. Each used
    line is lower-cased, has newlines removed and surrounding whitespace
    stripped, and is then handed to ``tokenizer``.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Callable that receives the cleaned line (a ``str``).

    Yields:
        Whatever ``tokenizer`` returns for each used line, in file order.
    """
    has_letter = re.compile('[a-zA-Z]').search
    with open(file_path, encoding='utf-8') as handle:
        for raw_line in handle:
            if not has_letter(raw_line):
                # Blank, numeric-only or punctuation-only line: nothing to tokenize.
                continue
            cleaned = raw_line.lower().replace('\n', '').strip()
            yield tokenizer(cleaned)
                

In [116]:
# Corpus location and tokenizer shared by the vocabulary and sequence-building cells.
textfile = "data/discourses.mb.txt"
tokenizer = get_tokenizer("spacy")


def contains_chars(x):
    """Return a regex match (truthy) if ``x`` has at least one ASCII letter, else ``None``."""
    return re.search('[a-zA-Z]', x)


# Specials are listed first so they are available as dedicated tokens in the vocabulary.
vocab = build_vocab_from_iterator(
    yield_tokens(textfile, tokenizer),
    specials=["<unk>", "<eos>", "<sos>", "<pad>"],
)
# Unknown tokens resolve to "<unk>". The previous default of -1 is not a valid
# row index for an embedding table and would fail at lookup time.
vocab.set_default_index(vocab["<unk>"])

In [117]:
# Build the training prefixes. For a tokenized line [t0, t1, ..., tn] this emits
# [t0, t1], [t0, t1, t2], ..., [t0, ..., tn]; lines with fewer than two tokens
# contribute nothing.
input_sequences = []

# yield_tokens applies the same letter filter and lower/strip normalisation that
# was used to build the vocabulary, and streams the file line by line instead of
# loading it all with readlines().
for words in yield_tokens(textfile, tokenizer):
    tokens = [vocab[word] for word in words]
    input_sequences.extend(tokens[:end] for end in range(2, len(tokens) + 1))

In [118]:
# Model hyperparameters.
# num_tokens sizes the embedding table, so it must be the vocabulary size
# (one row per token id), not the number of n-gram training sequences.
num_tokens = len(vocab)
d_embedding = 32
d_model = 32
dropout = 0.1
seq_len = 1024

In [177]:
import torch.nn.functional as F

def padding(arr, seq_len, mode="right"):
    """Zero-pad a tensor along its last dimension up to ``seq_len`` elements.

    The pad width is ``seq_len - arr.nelement()``, i.e. it is derived from the
    total element count, so this is meant for 1-D tensors.

    Args:
        arr: Tensor to pad.
        seq_len: Target number of elements.
        mode: ``"right"`` appends the zeros, ``"left"`` prepends them.

    Returns:
        The padded tensor produced by ``F.pad``.

    Raises:
        AssertionError: If ``mode`` is neither ``"right"`` nor ``"left"``.
    """
    assert mode in ("right", "left"), "invalid padding mode"
    missing = seq_len - arr.nelement()
    # F.pad expects the widths as a single (left, right) tuple for the last
    # dimension; the third positional argument is the padding *mode*, not a width.
    if mode == "right":
        return F.pad(arr, (0, missing))
    return F.pad(arr, (missing, 0))

In [178]:
import torch
import torch.nn as nn

# Token-embedding lookup table with num_tokens rows of width d_embedding.
# (The previous `embedding_dim` name is not defined by any cell; the
# hyperparameter cell calls it `d_embedding`.)
luc = nn.Embedding(num_tokens, d_embedding)

In [181]:
example = input_sequences[-1]
# NOTE(review): padding() fills with 0, which is the id of "<unk>" given the
# specials order used to build the vocab, not "<pad>" — confirm this is intended.
example_padded = padding(torch.tensor(example), seq_len)


def embedding_func(x):
    """Embed token ids ``x`` (a list or a tensor) with the ``luc`` table."""
    return luc(torch.as_tensor(x, dtype=torch.long))


# One batched lookup replaces the per-token loop and list round-trip.
# .detach() keeps the previous behaviour of producing a plain tensor that is
# not attached to the autograd graph; drop it if gradients should reach `luc`.
example_embedding = embedding_func(example_padded).detach()
example_embedding.shape

torch.Size([1024, 32])

In [183]:
import numpy as np
import matplotlib.pyplot as plt
 
def positional_encoding(seq_len, d, n=10000):
    """Build a sinusoidal positional-encoding matrix.

    For position ``k`` and pair index ``i`` (``0 <= i < d // 2``):

        p[k, 2*i]     = sin(k / n**(2*i/d))
        p[k, 2*i + 1] = cos(k / n**(2*i/d))

    Args:
        seq_len: Number of positions (rows).
        d: Encoding width (columns). If ``d`` is odd the last column is left
            at zero because it has no sin/cos partner.
        n: Base of the geometric frequency progression.

    Returns:
        ``numpy.ndarray`` of shape ``(seq_len, d)`` and dtype float64.
    """
    half = d // 2
    # The denominators depend only on the pair index, so compute them once
    # instead of once per position.
    denominators = np.power(n, 2 * np.arange(half) / d)
    angles = np.arange(seq_len)[:, np.newaxis] / denominators[np.newaxis, :]

    p = np.zeros((seq_len, d))
    # Explicit stop at 2*half so an odd d does not select an extra even column.
    p[:, 0:2 * half:2] = np.sin(angles)
    p[:, 1:2 * half:2] = np.cos(angles)
    return p

# Sized from the shared hyperparameters so it always matches the token
# embeddings it is added to, instead of repeating the literals 1024 and 32.
# Cast to float32: the NumPy result is float64, which would otherwise promote
# the (default float32) nn.Embedding output when the two are summed.
p_embedding = torch.tensor(
    positional_encoding(seq_len=seq_len, d=d_embedding, n=10000),
    dtype=torch.float32,
)
p_embedding.shape

torch.Size([1024, 32])

In [188]:
# Element-wise sum of the token embeddings and the positional encodings.
rep = example_embedding + p_embedding
rep.shape

torch.Size([1024, 32])