<a href="https://colab.research.google.com/github/dmakarau/LLM_explore/blob/main/part1_text2num_preparingText4Tokens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import requests
import re

# Get the text from the web

In [None]:
# get the text from the internet
# (explicit timeout: requests waits indefinitely by default, which would hang the notebook)
book = requests.get('https://gutenberg.org/cache/epub/35/pg35.txt', timeout=30)

# stop here on an HTTP error (4xx/5xx) instead of tokenizing an error page
book.raise_for_status()

# get text from the response
text = book.text


In [None]:
# Substrings that get replaced by a single space, applied in this order.
# The 'â\x80\x..' entries are the three UTF-8 bytes of a typographic
# punctuation mark as they appear when decoded one byte per character.
# Order matters: the paragraph-break entry must come before the shorter
# entries ('\r\n' and the open quote) that it contains.
stringsToReplace = [
  '\r\n\r\nâ\x80\x9c',  # blank line followed by an opening double quote (new paragraph)
  'â\x80\x9c',          # opening double quote (U+201C)
  'â\x80\x9d',          # closing double quote (U+201D)
  '\r\n',               # line break
  'â\x80\x94',          # em dash (U+2014)
  'â\x80\x99',          # apostrophe / closing single quote (U+2019)
  'â\x80\x98',          # opening single quote (U+2018)
  '_',                  # underscore, used in the text to mark emphasis
]

In [None]:
# replace each unwanted substring with a space
# The entries are literal strings, not regex patterns, so use plain string
# replacement; compiling them unescaped would misbehave as soon as an entry
# contained a regex metacharacter. The loop variable is not called `string`
# so that it does not shadow the standard-library module of that name.
for target in stringsToReplace:
  text = text.replace(target, ' ')

# remove non ASCII chars (each run of them becomes one space)
text = re.sub(r'[^\x00-\x7F]+', ' ', text)

# remove numbers (each run of digits becomes one space)
text = re.sub(r'\d+', ' ', text)

# make everything lowercase
text = text.lower()

# show the beginning of the cleaned text as the cell output
text[:2000]

# Parsing the text into words

In [None]:
# split text by punctuation
import string

# string.punctuation contains regex metacharacters (\ ] ^ -), so it has to be
# escaped before it is placed inside a character class. Unescaped, its
# backslash is consumed as an escape for the following ']' and a literal
# backslash in the text is never treated as a separator.
puncts4re = rf'[{re.escape(string.punctuation)}\s]+'

# split on runs of punctuation and/or whitespace
words = re.split(puncts4re, text)

# remove empty strings (left by the split at the text edges) and
# single-character words; the pieces cannot contain whitespace because
# whitespace is itself a separator, so no strip() is needed
words = [item for item in words if len(item) > 1]


In [None]:
# build the vocabulary: every distinct word, in sorted order
vocab = list(set(words))
vocab.sort()

# convenience variables
words_length, vocab_length = len(words), len(vocab)

print(f'Number of words: {words_length}')
print(f'Number of unique words: {vocab_length}')

# Create token dictionaries and encoder/decoder functions

In [None]:
# reverse map (token index -> word) and forward map (word -> token index);
# a word's index is its position in the sorted vocabulary
idx2word = dict(enumerate(vocab))
word2idx = {word: index for index, word in idx2word.items()}

# peek at every 87th (word, index) pair among the first 10000 entries
for pair in list(word2idx.items())[:10000:87]:
  print(pair)


In [None]:
#encode
def encode(word_list, encode_dict):
  """Convert a sequence of words into an array of integer token indices.

  Args:
    word_list: sequence of words to encode.
    encode_dict: mapping from word to integer index.

  Returns:
    1-D numpy integer array with one index per word, in input order.

  Raises:
    KeyError: if a word is not a key of encode_dict.
  """
  # look every word up and collect the indices into an integer vector
  return np.array([encode_dict[word] for word in word_list], dtype=int)

# decode
def decode(idx, decode_dict):
  """Convert a sequence of token indices back into a space-separated string.

  Args:
    idx: iterable of token indices.
    decode_dict: mapping from token index to word.

  Returns:
    The looked-up words joined by single spaces ('' for an empty input).

  Raises:
    KeyError: if an index is not a key of decode_dict.
  """
  pieces = []
  for token in idx:
    pieces.append(decode_dict[token])
  return ' '.join(pieces)


In [None]:
# testing the encode
test_tokens = encode(['the', 'time', 'machine'], word2idx)
print(test_tokens)

# testing the decode on the indices produced above; hard-coded index values
# would only be correct for one exact vocabulary and would silently decode to
# other words (or raise KeyError) if the text or the cleaning steps change
print(decode(test_tokens, idx2word))

In [None]:
# round-trip check: encode, then decode, a random window of 10 consecutive words

# random start location, chosen so that 10 words fit after it
startidx = np.random.choice(words_length - 10)

# sequential word indices
idxs = np.arange(startidx, startidx + 10)
print("Word indices:")
print(idxs, end='\n\n')

# the words found at those positions
print("The words")
wordseq = words[startidx:startidx + 10]
print(wordseq, end='\n\n')

# words -> token indices
print('Token indices:')
tokenseq = encode(wordseq, word2idx)
print(tokenseq, end='\n\n')

# token indices -> words again
print('Decoded sentence:')
print(decode(tokenseq, idx2word))


# Exercise 2. A random walk through The Time Machine

In [None]:
# A Brief Aside of Brownian Noise
import matplotlib.pyplot as plt

# Brownian noise: running sum of 3000 random unit steps (-1 or +1)
steps = np.random.choice([-1, 1], 3000)
brownNoise = steps.cumsum()

# draw the walk as a black line on a wide, short figure
fig, ax = plt.subplots(figsize=(10,3))
ax.plot(brownNoise, 'k')
ax.set(xlim=[0, len(brownNoise)], xlabel='"Time" ?', ylabel='Signal amplitude')
plt.show()

In [None]:
# Brownian noise: running sum of 30 random unit steps (-1 or +1)
brownNoise = np.cumsum(np.random.choice([-1, 1], 30))
print(brownNoise)

# Shift the walk by a random starting token, chosen so that every step stays
# inside the valid index range [0, vocab_length - 1]. Drawing the start from
# the whole vocabulary range lets the walk step below 0 or past the last
# index, and decode() then fails with KeyError.
# (randint raises ValueError if the vocabulary is smaller than the span of the walk)
lowest_start = -brownNoise.min()
highest_start = vocab_length - 1 - brownNoise.max()
BrownianRandomTokens = brownNoise + np.random.randint(lowest_start, highest_start + 1)

# test with random token indices
print(f'Token indices: {BrownianRandomTokens}')
print(f'Decoded sentence: {decode(BrownianRandomTokens, idx2word)}')


# Exercise 3. Distribution of Word Lengths

In [None]:
import matplotlib.pyplot as plt

# length (number of characters) of every word, in text order
wordsCount = [len(word) for word in words]

# Create a list of indices for the words in the order they appear in the text
word_positions = list(range(len(words)))


# Create the scatter plot: word length against position in the text
plt.figure(figsize=(15, 6)) # wide figure so the many points spread out
plt.scatter(word_positions, wordsCount, s=5) # Use s to adjust marker size

# Add labels and title
plt.xlabel('Word Position in Text')
plt.ylabel('Word Length')
plt.title('Word Length by Position in Text')

# Display the plot
plt.show()

# Histogram of word lengths.
# Word lengths are integers, so use one bin per length, with bin edges on the
# half-integers. The default of 10 equal-width bins puts a different number of
# integer lengths into different bins, which distorts the shape.
length_bins = np.arange(min(wordsCount), max(wordsCount) + 2) - 0.5
plt.hist(wordsCount, bins=length_bins, rwidth=0.8)
plt.xlabel('Word Length')
plt.ylabel('Frequency')
plt.title('Histogram of Character Count Frequences')
plt.show()