In [2]:
corpus = """
I drink and I know things.
When you play the game of thrones, your win or you die.
The true enemy won't wait out the storm, He brings the storm.
"""

print(corpus)


I drink and I know things.
When you play the game of thrones, your win or you die.
The true enemy won't wait out the storm, He brings the storm.



In [30]:
import unicodedata

text = ""
for c in unicodedata.normalize("NFD", "Día NIÑO Álvaro").lower():
    # print(unicodedata.category(c))
    if unicodedata.category(c) != "Mn":
        text += c
text

'dia nino alvaro'

In [None]:
import re

vocab = {}

for token in re.sub(r"\s+", " ", re.sub(r"[.,\n]", " ", corpus)).strip().split(" "):
    if token in vocab:
        vocab[token] += 1
    else:
        vocab[token] = 1

vocab

{'I': 2,
 'drink': 1,
 'and': 1,
 'know': 1,
 'things': 1,
 'When': 1,
 'you': 2,
 'play': 1,
 'the': 3,
 'game': 1,
 'of': 1,
 'thrones': 1,
 'your': 1,
 'win': 1,
 'or': 1,
 'die': 1,
 'The': 1,
 'true': 1,
 'enemy': 1,
 "won't": 1,
 'wait': 1,
 'out': 1,
 'storm': 2,
 'He': 1,
 'brings': 1}

In [24]:
import re

vocab = []

for token in re.sub(r"\s+", " ", re.sub(r"[.,\n]", " ", corpus)).strip().split(" "):
    if not token in vocab:
        vocab.append(token)

vocab

['I',
 'drink',
 'and',
 'know',
 'things',
 'When',
 'you',
 'play',
 'the',
 'game',
 'of',
 'thrones',
 'your',
 'win',
 'or',
 'die',
 'The',
 'true',
 'enemy',
 "won't",
 'wait',
 'out',
 'storm',
 'He',
 'brings']

In [45]:
import pandas
import numpy

report1 = pandas.DataFrame(vocab, columns=["Token"])
report1.index = numpy.array(report1.index) + 1
report1.T

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,16,17,18,19,20,21,22,23,24,25
Token,I,drink,and,know,things,When,you,play,the,game,...,die,The,True,enemy,won't,wait,out,storm,He,brings


In [53]:
def text2tokens(text):
    return re.sub(r"\s+", " ", re.sub(r"[.,\n]", " ", text)).strip().split(" ")

In [59]:
def tokens2vec(tokens):
    return [vocab.index(token) for token in tokens]

In [122]:
text = "When you play game of thrones know"
inputs_tokens = text2tokens(text)
inputs = tokens2vec(inputs_tokens)
inputs

[5, 6, 7, 9, 10, 11, 3]

In [123]:
numpy.random.seed(123)
embedding = numpy.random.rand(6, len(inputs))
embedding.round(2)

array([[0.7 , 0.29, 0.23, 0.55, 0.72, 0.42, 0.98],
       [0.68, 0.48, 0.39, 0.34, 0.73, 0.44, 0.06],
       [0.4 , 0.74, 0.18, 0.18, 0.53, 0.53, 0.63],
       [0.85, 0.72, 0.61, 0.72, 0.32, 0.36, 0.23],
       [0.29, 0.63, 0.09, 0.43, 0.43, 0.49, 0.43],
       [0.31, 0.43, 0.89, 0.94, 0.5 , 0.62, 0.12]])

In [124]:
pandas.DataFrame([inputs_tokens, inputs])

Unnamed: 0,0,1,2,3,4,5,6
0,When,you,play,game,of,thrones,know
1,5,6,7,9,10,11,3


In [125]:
pandas.DataFrame(embedding, columns=[f"e{i + 1}" for i in range(len(inputs))])

Unnamed: 0,e1,e2,e3,e4,e5,e6,e7
0,0.696469,0.286139,0.226851,0.551315,0.719469,0.423106,0.980764
1,0.68483,0.480932,0.392118,0.343178,0.72905,0.438572,0.059678
2,0.398044,0.737995,0.182492,0.175452,0.531551,0.531828,0.634401
3,0.849432,0.724455,0.611024,0.722443,0.322959,0.361789,0.228263
4,0.293714,0.630976,0.092105,0.433701,0.430863,0.493685,0.42583
5,0.312261,0.426351,0.893389,0.94416,0.501837,0.623953,0.115618


In [128]:
positional = numpy.zeros_like(embedding)

dim = 6
n = len(inputs)

for pos in range(n):
    for i in range(dim):
        if i % 2 == 0: # even
            # print(i, pos, numpy.sin(pos / (10_000 ** ((2 * i) / dim))))
            positional[i, pos] = numpy.sin(pos / (10_000 ** ((2 * i) / dim)))
        else: # odd
            # print(i, pos, numpy.cos(pos / (10_000 ** ((2 * i) / dim))))
            positional[i, pos] = numpy.cos(pos / (10_000 ** ((2 * i) / dim)))

positional.round(2)

array([[ 0.  ,  0.84,  0.91,  0.14, -0.76, -0.96, -0.28],
       [ 1.  ,  1.  ,  1.  ,  0.99,  0.98,  0.97,  0.96],
       [ 0.  ,  0.  ,  0.  ,  0.01,  0.01,  0.01,  0.01],
       [ 1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ]])

In [129]:
pandas.DataFrame(positional, columns=[f"p{i + 1}" for i in range(len(inputs))])

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7
0,0.0,0.841471,0.909297,0.14112,-0.756802,-0.958924,-0.279415
1,1.0,0.998923,0.995694,0.990321,0.982814,0.97319,0.96147
2,0.0,0.002154,0.004309,0.006463,0.008618,0.010772,0.012926
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.0,5e-06,9e-06,1.4e-05,1.9e-05,2.3e-05,2.8e-05
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [131]:
embedding_positional = embedding + positional

pandas.DataFrame(embedding_positional, columns=[f"ep{i + 1}" for i in range(len(inputs))])

Unnamed: 0,ep1,ep2,ep3,ep4,ep5,ep6,ep7
0,0.696469,1.12761,1.136149,0.692435,-0.037334,-0.535818,0.701349
1,1.68483,1.479855,1.387812,1.333499,1.711864,1.411762,1.021148
2,0.398044,0.74015,0.186801,0.181915,0.540169,0.5426,0.647327
3,1.849432,1.724455,1.611023,1.722443,1.322959,1.361789,1.228263
4,0.293714,0.630981,0.092114,0.433715,0.430881,0.493708,0.425858
5,1.312261,1.426351,1.893389,1.94416,1.501837,1.623953,1.115618
