In [1]:
import torch
import torch.nn as nn

getting spacy for the embeddings

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import spacy
nlp = spacy.load("en_core_web_sm")

failed to get a dataset, so doing my own corpora from sherlock holmes book

In [4]:
# !pip install torchtext
# from torchtext.datasets import IMBD

In [5]:
# #creating sample embeder because i cant import a real dataset aaaaaaaaa
# with open('words.txt', 'r') as f:
#     words = f.read().split('\n')

# # reducing the set size
# words = words[10_000: 20_000]
# # putting test common words
# words += ["hey", "how", "are"]

In [6]:
from string import ascii_letters

with open('sherlock.txt', 'r') as f:
    s = f.read()

allowed = ascii_letters + " " + ''.join([str(i) for i in range(10)])

sherlock = ''.join(filter(
    lambda w: w in allowed,
    s
))

In [7]:
corpora = ["<unk>"] + list(set(sherlock.split()))

In [8]:
# smaller word corpora
# words = ["hey", "how", "are"]

creating the embedding tensor

In [9]:
embedding = torch.zeros([len(corpora), len(nlp("a").vector)])
for i, w in enumerate(corpora):
    embedding[i] = torch.tensor(nlp(w).vector)

In [10]:
embedding.shape # 9000 words embedding

torch.Size([8987, 96])

In [11]:
def get_embed_id(word):
    try:
        return corpora.index(word)
    except:
        return 0

In [12]:
vocab = torch.tensor(
    list(map(get_embed_id, sherlock.split())),
    dtype=torch.long
)
vocab

tensor([3700, 6316, 6057,  ..., 6108, 8842, 2689])

In [13]:
def vocab_fn(words: str):
    """creates the vocab vector for the given phrase."""
    return torch.tensor([
        corpora.index(word) if word in corpora else 0
        for word in words.split()
    ])


In [None]:
print(vocab_fn("Sherlock was a fine man"))

In [22]:
class GPT(nn.Module):
    def __init__(self, d_embeddings, size_corpora, n_heads = 1, n_decoders = 1):
        super(GPT, self).__init__()

        self.n_heads = n_heads 
        self.d_embeddings = d_embeddings 
        self.size_corpora = size_corpora

        self.emb = nn.Embedding.from_pretrained(embedding) # non positional embedding
        self.pos_emb = nn.Embedding(self.size_corpora, self.d_embeddings)

        self.decoder_layer = nn.TransformerDecoderLayer(self.d_embeddings, self.n_heads)
        self.decoder = nn.TransformerDecoder(self.decoder_layer, n_decoders)

        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):

        # create the context vector
        U = torch.zeros([self.size_corpora])
        U[x] = 1
        print("U: ", U)

        # produces the first iteration
        h0 = torch.diag(U) @ self.emb.weight + self.pos_emb.weight
        # print("h0 b4: ", h0)

        # reshapping to simulate a batch size
        shape = h0.shape
        h0 = h0.reshape(shape[0], 1, shape[-1])
        # print("h0 after: ", h0)

        # creates a fake initial memory for the decoder
        initial_memory = torch.zeros_like(h0)

        # run trough the decoder block
        hn = self.decoder(h0, initial_memory)
        print("hn ", hn)

        words_logits = hn @ self.emb.weight.transpose(-1, 0)
        print("words: ", words_logits)
        # gets the softmax distribution
        # word_probability = self.softmax(words_logits)

        return words_logits

gpt = GPT(
    size_corpora=embedding.shape[0],
    d_embeddings=embedding.shape[1]
)

In [23]:
out = gpt(
    vocab_fn("Sherlock was a fine man")
)


U:  tensor([0., 0., 0.,  ..., 0., 0., 0.])
hn  tensor([[[ 0.1213, -1.6085, -1.5906,  ...,  0.3331,  0.9693,  0.2454]],

        [[ 1.2494, -2.7318,  0.8753,  ...,  0.8481,  0.4678,  1.1281]],

        [[ 0.5061, -0.0896,  2.7862,  ..., -0.3724, -0.1957, -0.3039]],

        ...,

        [[ 0.0418, -0.8855,  0.4363,  ...,  0.4515, -2.1305, -1.6950]],

        [[ 0.8093,  0.6005,  0.4486,  ...,  0.2748, -1.2516,  0.2926]],

        [[ 0.6007, -0.0911,  0.4704,  ..., -0.7744, -0.1482, -0.2809]]],
       grad_fn=<NativeLayerNormBackward>)
words:  tensor([[[-11.4843, -10.6799,   4.0268,  ...,  -9.0076,  -5.0612,  -8.0718]],

        [[ -6.9038,  -7.3724,  -4.0375,  ...,   5.9391,  -8.2051,  -0.4288]],

        [[  5.0732,   5.4410,   3.8891,  ...,   7.6700,  -1.3980,   8.5939]],

        ...,

        [[  2.6077,   3.2044,  -5.6825,  ...,  10.8484,   7.1765,  13.2359]],

        [[ -2.4267,  -5.1196,   6.2699,  ...,  -3.2290,  -0.7185,  -1.7215]],

        [[  2.3654,   3.5840,  -1.3256,  .

importing sherlock holmes book to "train"


In [24]:
out

tensor([[[-11.4843, -10.6799,   4.0268,  ...,  -9.0076,  -5.0612,  -8.0718]],

        [[ -6.9038,  -7.3724,  -4.0375,  ...,   5.9391,  -8.2051,  -0.4288]],

        [[  5.0732,   5.4410,   3.8891,  ...,   7.6700,  -1.3980,   8.5939]],

        ...,

        [[  2.6077,   3.2044,  -5.6825,  ...,  10.8484,   7.1765,  13.2359]],

        [[ -2.4267,  -5.1196,   6.2699,  ...,  -3.2290,  -0.7185,  -1.7215]],

        [[  2.3654,   3.5840,  -1.3256,  ...,   2.9127,  -2.1449,   0.9191]]],
       grad_fn=<UnsafeViewBackward>)

In [35]:
# i dont know if this is the right way to interpret the results
prob = out.sum(0).reshape(out.shape[0]).softmax(0)


tensor(4171)

In [None]:
loss_fn = nn.CrossEntropyLoss()
window = 5
x = vocab[0: window]
y = vocab[window + 1]

Y = torch.zeros(len(corpora))
Y[y] = 1

out = gpt(x)
loss = loss_fn(x, Y)

U:  tensor([0., 0., 0.,  ..., 0., 0., 0.])
hn  tensor([[[-0.2938, -1.2402,  1.2166,  ..., -0.7015, -0.2277,  1.1062]],

        [[-1.7482, -0.0586,  0.4162,  ...,  1.0023, -0.1650, -0.0482]],

        [[-1.3509,  1.8449, -1.3508,  ...,  0.8270, -1.3384, -1.9556]],

        ...,

        [[ 0.9983,  0.9896, -1.3176,  ...,  0.5009, -0.2629, -0.9276]],

        [[-1.6401,  1.1135,  0.9188,  ..., -0.3861,  0.4535,  0.7174]],

        [[ 1.2166, -0.7867,  0.1698,  ..., -0.4125,  0.8308,  0.1103]]],
       grad_fn=<NativeLayerNormBackward>)
words:  tensor([[[  7.1969,  -4.7808,   5.3483,  ...,   3.3195,  -2.6626,   5.5213]],

        [[ -3.0338,  -2.2731,   3.4430,  ...,   7.9760,   2.7903,   0.3384]],

        [[  9.1941,  -4.5115,  10.3757,  ...,   3.4800,  11.6178,   3.0097]],

        ...,

        [[ -1.7504,   7.0083,   5.1517,  ...,  -8.2203,  14.9707,  -1.0694]],

        [[  7.2394,  -6.8686,   0.2162,  ...,  -0.2132,  -2.2027,   2.7306]],

        [[-17.4245,   3.0328,  -3.1432,  .

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)