# Break the data into word tokens

In [1]:
with open('data/astu overview.txt','r') as f:
    raw_text=f.read()
print(len(raw_text))

4358


In [2]:
import re
text=raw_text
result=re.split(r'([./]|\s)',text)
#remove whitespace
result=[i.strip(' ') for i in result if i.strip()]
print(result[:4])
print(len(result))
result.append('<|unk|>')
print(len(result))

['University', 'Name:', 'Adama', 'Science']
598
599


# Tokenization

In [3]:
vocab={token:id for id,token in enumerate(sorted(set(result)))}

print(len(vocab))
print(vocab)

346
{'(ASTU)': 0, '(AU)': 1, '(BSc,': 2, '(Core': 3, '(ICT)': 4, '(MSc,': 5, '(NCTTE)': 6, '(NTC)': 7, '(Note:': 8, '(Schools': 9, '(Science,': 10, '(e': 11, '(for': 12, '(often': 13, ')': 14, ',': 15, '.': 16, '/': 17, '1993:': 18, '2003:': 19, '2005:': 20, '2011:': 21, '28°E': 22, '39': 23, '53°N,': 24, '8': 25, '<|unk|>': 26, 'Academic': 27, 'Accommodation:': 28, 'Adama': 29, 'Address:': 30, 'Administration,': 31, 'Affairs,': 32, 'Aligned': 33, 'Alternative': 34, 'Applied': 35, 'Approx': 36, 'Architecture,': 37, 'Areas': 38, 'Astu': 39, 'B': 40, 'Biology,': 41, 'Biosciences': 42, 'Biotechnology': 43, 'Board': 44, 'Buildings:': 45, 'Campus': 46, 'Campus:': 47, 'Centers:': 48, 'Central': 49, 'Chemical': 50, 'Chemistry,': 51, 'City:': 52, 'Civil': 53, 'Collaborations:': 54, 'Collaborative': 55, 'College': 56, 'Colleges': 57, 'Communication': 58, 'Computer': 59, 'Computing': 60, 'Construction': 61, 'Contributes': 62, 'Coordinates:': 63, 'Country:': 64, 'Designated': 65, 'Dining': 66, 'D

In [4]:
class TokenizerV1:
    def __init__(self,vocab):
        self.str_to_int=vocab
        self.int_to_str={int:token for token,int in vocab.items()}
    def encode(self,text):
        processed=sorted(set(re.split(r'([./]|\s)',text)))
        processed=[token for token in processed if token.strip()]
        processed=[item if item in self.str_to_int
                    else '<|unk|>' for item in processed]
        id=[self.str_to_int[token] for token in processed]

        return id
    def decode(self,ids):
        text=' '.join([self.int_to_str[i] for i in ids])
        #replace space before the specified punctuations
        token=re.sub(r'([,.?]|\s)',r'\1',text)
        return token


In [5]:
tokenizer=TokenizerV1(vocab)
text='Adam'
tokenizer.encode(text)
tokenizer.decode([26])

#disadvantage of this tokenizer is that it fails if the word not in vocab


'<|unk|>'

# Byte Pair tokenization

In [6]:
import tiktoken

In [7]:
tokenizer=tiktoken.get_encoding('gpt2')
#pre built open ai vocab

ConnectionError: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /gpt-2/encodings/main/vocab.bpe (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fae36866a70>: Failed to resolve 'openaipublic.blob.core.windows.net' ([Errno -3] Temporary failure in name resolution)"))

In [None]:
text=('ekkk'
      'a')
ids=tokenizer.encode(raw_text,allowed_special={'<|endoftext|>'})
print(len(ids))
token=tokenizer.decode([21009])
print(token)

908
University


# Creating Input Target Pair

In [None]:
#context size is how many token the llm sees to predict the next token
#we use pytorch Dataset and DataLoader

context_size=4
x=ids[:context_size]
y=ids[1:context_size+1]
print(x)
print(y)

[21009, 6530, 25, 1215]
[6530, 25, 1215, 1689]


In [None]:
for i in range(1,context_size+1):
    context=ids[:i]
    target=ids[i]
    print(context,'--->',target)


[21009] ---> 6530
[21009, 6530] ---> 25
[21009, 6530, 25] ---> 1215
[21009, 6530, 25, 1215] ---> 1689


In [None]:
for i in range(1,context_size+1):
    context=ids[:i]
    target=ids[i]
    print(tokenizer.decode(context),'--->',tokenizer.decode([target]))


University --->  Name
University Name ---> :
University Name: --->  Ad
University Name: Ad ---> ama


# Use DataLoader to create input target pair

In [None]:

from torch.utils.data import Dataset, DataLoader


In [None]:
class GPTDatasetV1(Dataset):
    def __init__(self,txt,tokenizer,max_length,stride):
        self.input_ids=[]
        self.target_ids=[]
        self.max_length=max_length
        self.stride=stride
    
    #tokenize the entire text
        token_ids=tokenizer.encode(text)

    #use sliding window to chunk the token into overlapping sequence
        for i in range(0,len(token_ids)-self.max_length,self.stride):
            input_token_id=token_ids[i:i+self.max_length]
            target_token_id=token_ids[i+1:i+self.max_length+1]
            #self.input_ids.append(torch.tensor(input_token_id))
            #self.target_ids.append(torch.tensor(target_token_id))
            self.input_ids.append([input_token_id])
            self.target_ids.append([target_token_id])
            

    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self,id):
        return self.input_ids[id],self.target_ids[id]




In [None]:
def create_dataloader(txt,batch_size=1,max_length=4,stride=1,shuffle=True,drop_last=True,num_workers=0):

    #create dataset aka input target pair
    dataset=GPTDatasetV1(txt,tokenizer,max_length,stride)

    #create dataloader
    dataloader=DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    print(dataloader)
    return dataloader

In [None]:
dataloader=create_dataloader(raw_text,batch_size=1,max_length=4,stride=1,shuffle=False)

data_iter=iter(dataloader)
first_batch=next(data_iter)

print(first_batch) #batch specifys how many input target pair taken at once eg 2 batch 2 from input id and 2 from target id



<torch.utils.data.dataloader.DataLoader object at 0x7fb776a42c20>


StopIteration: 

# Creating Token Embedding

In [None]:
vocab_size=len(ids)
output_dim=6

embedding_layer=torch.nn.Embedding(vocab_size,output_dim)
print(embedding_layer.weight)
#embeding layer updated during training

In [None]:


ataloader=create_dataloader(raw_text,batch_size=8,max_length=4,stride=1,shuffle=False)

data_iter=iter(dataloader)
input,target=next(data_iter)

print('Token',input) #batch specifys how many input target pair taken at once eg 2 batch 2 from input id and 2 from target id
print('Input Shape',input.shape) 

In [None]:
#pass the input to embedding layers
token_embeddings=embedding_layer(input)
print(token_embeddins.size)

# Positional Embedding

In [None]:
#positional embedding vector have same dimension as token embedding
#we used aabsolute positional encodding
context_length=vocab_size
pos_embedding_layer=torch.nn.Embedding(context_length,output_dim)

#input to positional embedding is a place holder sequence 0,1,2.. upto context length-1 in this case we use integer because position of token specified by index aka integer

pos_embeddings=pos_embedding_layer(torch.arange(context_length))

In [None]:
input_embedding=token_embeddings+pos_embeddings
print(input_embedding.size)

#input embedding is the final input we give to the llm training