In [3]:
import re
import string
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Read the entire training text into memory as one string.
with open("the_verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()
print("Total number of character:", len(raw_text))


Total number of character: 21937


In [5]:
def process_text(text):
    """Replace newlines with spaces and delete ASCII punctuation.

    Both real line breaks and the literal two-character sequence
    backslash + ``n`` are turned into a single space.  Every character
    listed in ``string.punctuation`` is then removed.  Letter case, digits
    and stop words are left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # The escaped form "\\n" only matches a literal backslash followed by
    # "n"; text read from a file contains real line breaks, so those must be
    # handled as well.  "\r\n" comes before "\n" so a Windows line ending
    # collapses to one space rather than leaving a stray "\r".
    for newline in ("\\n", "\r\n", "\n"):
        text = text.replace(newline, " ")
    # Deleting via a translation table drops all punctuation in one pass.
    return text.translate(str.maketrans("", "", string.punctuation))

In [6]:
# Split on punctuation, double dashes and whitespace. The capture group keeps
# the delimiters in the result so punctuation becomes its own token; empty and
# whitespace-only pieces are then discarded.
split_pieces = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [piece.strip() for piece in split_pieces if piece.strip()]
len(preprocessed)

4886

### Converting tokens into token IDs
Build the vocabulary by removing duplicate tokens. Then arrange all the unique tokens in alphabetical order and assign each one an integer ID.

In [7]:
# use set() method to have all the words unique
all_words= sorted(set(preprocessed))
vocab = {token:integer for integer,token in enumerate(all_words)}
len(all_words)

1266

## Encoding Decoding
Encoding means converting text into a sequence of integer IDs, one per token, using the vocabulary. Decoding means mapping those integer IDs back to their tokens and joining them into text.

In [8]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    ``vocab`` maps token strings to integer IDs.  The reverse mapping is
    built once in the constructor and used by ``decode``.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their IDs.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            token = piece.strip()
            if token:
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and tidy punctuation.

        Raises:
            KeyError: if an ID has no entry in the reverse mapping.
        """
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Remove the whitespace that the join placed in front of punctuation.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)
    

In [9]:
# Round-trip a sample sentence through the V1 tokenizer.
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," Mrs. Gisburn said"""
ids = tokenizer.encode(text)
regenerated_text = tokenizer.decode(ids)
# Show the token IDs first, then the decoded text, one per line.
print(ids, regenerated_text, sep="\n")

[1, 86, 2, 969, 1114, 694, 618, 853, 6, 1262, 687, 6, 1, 102, 8, 66, 970]
" It' s the last he painted, you know," Mrs. Gisburn said


## Adding special tokens: `<unk>` and `<|endoftext|>`
We add special tokens to a vocabulary to deal with certain contexts. For instance, we
add an `<unk>` token to represent new and unknown words that were not part of the training
data and thus not part of the existing vocabulary.
<br>
When working with multiple independent text sources, we add `<|endoftext|>` tokens
between these texts. These `<|endoftext|>` tokens act as markers, signaling the start or end of a particular segment, allowing for more effective processing and understanding by the LLM.

In [14]:
# Append the `<|endoftext|>` and `<unk>` special tokens after the sorted
# corpus tokens so they receive the two highest IDs.
all_tokens = sorted(set(preprocessed)) + ['<|endoftext|>', '<unk>']
vocab = dict(zip(all_tokens, range(len(all_tokens))))
vocab['<unk>']

1267

In [17]:
# Improving tokenizer class
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<unk>``.

    ``vocab`` maps token strings to integer IDs.  The reverse mapping is
    built once in the constructor and used by ``decode``.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their IDs.

        Tokens missing from the vocabulary are replaced by ``'<unk>'``
        before the lookup.

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                ``'<unk>'`` entry.
        """
        ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                continue
            if token not in self.str_to_int:
                token = '<unk>'
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and tidy punctuation.

        Raises:
            KeyError: if an ID has no entry in the reverse mapping.
        """
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Remove the whitespace that the join placed in front of punctuation.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [26]:
# Two unrelated snippets joined by the end-of-text marker; "Hello" and
# "palace" are absent from the vocabulary and therefore map to `<unk>`.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
tokenizer = SimpleTokenizerV2(vocab)
ids = tokenizer.encode(text)
regen_text = tokenizer.decode(ids)
# Show the token IDs first, then the decoded text, one per line.
print(ids, regen_text, sep="\n")

[1267, 6, 431, 1262, 724, 1099, 18, 1266, 84, 1114, 1080, 1110, 826, 1114, 1267, 8]
<unk>, do you like tea? <|endoftext|> In the sunlit terraces of the <unk>.
