<a href="https://colab.research.google.com/github/dmakarau/LLM_explore/blob/main/text2numbers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Create and visualize tokens**

In [None]:
import re
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# ask the inline matplotlib backend to emit figures in SVG format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# corpus: one sentence per list entry
text = [
  'All that we are is the result of what we have thought',
  'To be or not to be that is the question',
  'Be yourself everyone else is already taken',
]

In [None]:
# collect every lowercased word of every sentence (duplicates kept, original order),
# then reduce them to a sorted list of unique words
allwords = [word for sentence in text for word in re.split(r'\s', sentence.lower())]
vocab = sorted(set(allwords))
vocab

In [None]:
# create the encoder (word -> index) and decoder (index -> word) dictionaries
word2idx = {word: position for position, word in enumerate(vocab)}
idx2word = dict(enumerate(vocab))

word2idx

In [None]:
# create the encode function that takes text input and gives a list of integers as tokens
def encode(phrase):
  """Convert a phrase into a list of integer tokens.

  The phrase is lowercased and split on runs of whitespace, so leading,
  trailing, or repeated whitespace never produces empty-string words.

  Args:
    phrase: text to tokenize; every word must be a key of `word2idx`.

  Returns:
    A list with one integer per word, in order (empty for blank input).

  Raises:
    KeyError: if any word is missing from `word2idx`; the message lists
      the offending words.
  """
  # str.split() with no argument collapses consecutive whitespace and trims the ends
  words = phrase.lower().split()
  unknown = [word for word in words if word not in word2idx]
  if unknown:
    raise KeyError(f'words not in vocabulary: {unknown}')
  return [word2idx[word] for word in words]


In [None]:
# encode a sample phrase made of words from the sentences above and show its tokens
tokens = encode('we already are the result of what everyone else already thought')
print(tokens)

In [None]:
# create the decode function that takes a list of integer tokens and gives back the text
def decode(tokens_list):
  """Turn integer tokens back into a space-separated phrase.

  Args:
    tokens_list: iterable of integers, each a key of `idx2word`.

  Returns:
    The looked-up words joined by single spaces ('' for no tokens).

  Raises:
    KeyError: if a token is not a key of `idx2word`.
  """
  words = []
  for token in tokens_list:
    words.append(idx2word[token])
  return ' '.join(words)


In [None]:
# decode the tokens back into text and show the result
phrase = decode(tokens)
print(phrase)

In [None]:
print(vocab) # reminder of which words the vocabulary contains

In [None]:
# create a new phrase using only words from the vocabulary
another_phrase = 'we have thought what is the question'
# encode the text into tokens
tokens = encode(another_phrase)
print(tokens)

# decode the tokens back into text
phrase_decoded = decode(tokens)
print(phrase_decoded)

***Visualize the tokenized integers***

In [None]:
# join all sentences into one string and encode it into tokens
alltext = ' '.join(text)
tokens = encode(alltext)
print(tokens)

# create a figure; bind it to `fig` rather than `_`, because `_` is IPython's last-output variable
fig, ax = plt.subplots(1, figsize=(12,5))

# plot each token value against its position in the text
ax.plot(tokens, 'ks', markersize = 12, markerfacecolor = [.7, .7, .9])
ax.set(xlabel='Word index', ylabel='Token index', yticks=range(len(vocab)))
ax.grid(linestyle='--', axis='y')

# invisible twin axis: same data and ticks, used only to show the vocabulary words on the right-hand side
ax2 = ax.twinx()
ax2.plot(tokens, alpha=0)
ax2.set(yticks = range(len(vocab)), yticklabels = vocab)

plt.show()


***Explore context surrounding target tokens***

In [None]:
# positions in the flat word list at which the target word occurs
indexes = [position for position, token in enumerate(allwords) if token == 'to']
print(f' "to" appears at indices {indexes}')

In [None]:
# show each target word with one word of context on either side.
# The start is clamped at 0: for a target at position 0, `t - 1` would be -1, which
# counts from the end of the list and gives a wrong (usually empty) window.
for t in indexes:
  print(allwords[max(t - 1, 0): t + 2])

In [None]:
# every lowercased word of the corpus in order of appearance (duplicates kept)
print(allwords)

In [None]:
# the sorted list of unique words
print(vocab)

In [None]:
# one-hot matrix: one row per word position, one column per vocabulary entry.
# Row k of the identity matrix is the one-hot vector for vocabulary index k,
# so picking identity rows by token id builds the whole matrix at once.
token_ids = [word2idx[word] for word in allwords]
word_matrix = np.eye(len(vocab), dtype=int)[token_ids]
print(word_matrix)

In [None]:
# draw the one-hot matrix as an image: word positions along x, vocabulary indices along y
fig, ax = plt.subplots(1, figsize=(12,5))

# invert the values so that ones are drawn black and zeros white with the gray colormap
inverted = (1 - word_matrix).T
ax.imshow(inverted, cmap='gray', origin='lower', aspect='auto')

# x-axis label, one tick per vocabulary index, dashed horizontal grid lines
ax.set_xlabel('Word index')
ax.set_yticks(range(len(vocab)))
ax.grid(linestyle='--', axis='y')

plt.show()