## Creating Tokens

In [3]:
#read the txt file
with open('data/astu overview.txt','r',encoding='utf-8') as f:
    raw_text=f.read()
print(f'number of characters:',len(raw_text))

number of characters: 4358


In [38]:
import re

text='hello, ethiopia?. this, is nothing'
token=re.sub(r'([,.?]|\s)',r'\1',text)
token=' '.join(token)
print(token)

h e l l o ,   e t h i o p i a ? .   t h i s ,   i s   n o t h i n g


## removing whitespaces

In [6]:
#remove white space
#removing white space reduce memory and computing requirment
#however some data is sensetive to their structure example python code indentaation and spacing in this case keep the white space

result=[i for i in token if i.strip()]
print(result)

['hello', ',', 'ethiopia', '?', '.', 'this', ',', 'is', 'nothing']


In [77]:
#lets tokenize raw_text

processed=re.split(r'(/|\s)',raw_text)
processed=[i for i in processed if i.strip()]
print(processed[:99])

['University', 'Name:', 'Adama', 'Science', 'and', 'Technology', 'University', '(ASTU)', 'or', 'also', 'called', 'Astu', 'Alternative', 'Names:ASTU', 'Location:', 'City:', 'Adama', '/', 'Nazret', 'Region:', 'Oromia', 'Country:', 'Ethiopia', 'Approx.', 'Coordinates:', '8.53°N,', '39.28°E', 'Establishment', 'History:', '1993:', 'Founded', 'as', 'Nazareth', 'Technical', 'College', '(NTC).', '2003:', 'Upgraded', 'to', 'Nazareth', 'College', 'of', 'Technical', 'Teacher', 'Education', '(NCTTE).', '2005:', 'Further', 'upgraded', 'to', 'Adama', 'University', '(AU).', '2011:', 'Designated', 'by', 'the', 'Ethiopian', 'Ministry', 'of', 'Education', 'as', 'a', 'Federal', 'University', 'of', 'Technology', 'and', 'renamed', 'Adama', 'Science', 'and', 'Technology', 'University', '(ASTU).', 'Type:', 'Public', 'University', 'Federal', 'University', 'of', 'Science', 'and', 'Technology', 'Mission', '(Core', 'Focus):', 'To', 'produce', 'highly', 'qualified', 'graduates', 'in', 'science,', 'engineering,', 

## Creating Token ID

In [79]:
#vocabulary is a list of token which are sorted and map with id sometime unique

all_words=sorted(set(processed))
print(len(all_words))

347


In [80]:
vocab={token:indx for indx,token in enumerate(all_words)}
print(vocab['University'])

178


In [70]:
class TokenizerV1:
    def __init__(self,vocab):
        self.str_to_int=vocab
        self.int_to_str={int:token for token,int in vocab.items()}
    def encode(self,text):
        processed=sorted(set(re.split(r'(/|\s)',text)))
        processed=[token for token in processed if token.strip()]
        id=[self.str_to_int[token] for token in processed]

        return id
    def decode(self,ids):
        text=' '.join([self.int_to_str[i] for i in ids])
        #replace space before the specified punctuations
        token=re.sub(r'([,.?]|\s)',r'\1',text)
        return token


In [71]:
tokenize=TokenizerV1(vocab)

text='University'
id=tokenize.encode(text)
print(id)


[178]


In [72]:
#what if we give a word not present in the vocab
#one reson llm should train on large dataset is to expand its abbility to represent huge vocab
tokenize=TokenizerV1(vocab)

text='feleke'
id=tokenize.encode(text)
print(id)

KeyError: 'feleke'

## Adding special context token 
# what if the word is not in vocab

In [88]:
all_tokens=sorted(processed)
print(all_tokens)
all_tokens.extend(['<|endoftext|>','<|unk|>'])
vocab={token:int for int,token in enumerate(all_tokens)}
print(vocab['<|unk|>'])



['(ASTU)', '(ASTU).', '(AU).', '(BSc,', '(Core', '(Core', '(ICT)', '(MSc,', '(NCTTE).', '(NTC).', '(Note:', '(Schools', '(Science,', '(e.g.,', '(for', '(often', '(often', '/', '/', '/', '/', '/', '/', '1993:', '2003:', '2005:', '2011:', '39.28°E', '8.53°N,', 'Academic', 'Academic', 'Academic', 'Accommodation:', 'Adama', 'Adama', 'Adama', 'Adama', 'Adama', 'Adama', 'Address:', 'Administration,', 'Affairs,', 'Aligned', 'Alternative', 'Applied', 'Applied', 'Applied', 'Approx.', 'Architecture,', 'Areas', 'Astu', 'B.Tech)', 'Biology,', 'Biosciences', 'Biotechnology', 'Board.', 'Buildings:', 'Campus', 'Campus:', 'Centers:', 'Centers:', 'Central', 'Chemical', 'Chemistry,', 'City:', 'Civil', 'Collaborations:', 'Collaborative', 'College', 'College', 'Colleges', 'Communication', 'Computer', 'Computer', 'Computing', 'Construction', 'Contributes', 'Coordinates:', 'Country:', 'Designated', 'Dining', 'Dormitories.', 'Draws', 'Education', 'Education', 'Electrical', 'Energy', 'Engineering', 'Engineeri

In [84]:
class TokenizerV2:
    def __init__(self,vocab):
        self.str_to_int=vocab
        self.int_to_str={int:token for token,int in vocab.items()}
    def encode(self,text):
        processed=sorted(set(re.split(r'(/|\s)',text)))
        processed=[token for token in processed if token.strip()]
        processed=[ token if token in self.str_to_int
                          else '<|unk|>' for token in processed]
        id=[self.str_to_int[token] for token in processed]

        return id
    def decode(self,ids):
        text=' '.join([self.int_to_str[i] for i in ids])
        #replace space before the specified punctuations
        token=re.sub(r'([,.?]|\s)',r'\1',text)
        return token

In [None]:
tokenizer=TokenizerV2(vocab)
text1='hey feleke'
text2='hello'
text='<|endoftext|>'.join((text1,text2))



tokenizer.encode(text1)
id=tokenizer.encode(text1)
tokenizer.decode(id)

#gpt models only used <|endoftext|> only

'<|unk|> <|unk|>'