In [16]:
from datasets import load_dataset
ds = load_dataset('roneneldan/TinyStories')
ds.save_to_disk('./tinystories_dataset')

Repo card metadata block was not found. Setting CardData to empty.
Saving the dataset (4/4 shards): 100%|██████████| 2119719/2119719 [00:21<00:00, 100411.87 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 21990/21990 [00:00<00:00, 580748.57 examples/s]


In [18]:
ar = [1,2,4]
ar[1:2]

[2]

In [5]:
## reduce the sizey of tiny stories so the map function does not take so long!
from datasets import DatasetDict
import numpy as np
num_text = 5000
ds = DatasetDict({'train' : ds['train'].select(np.arange(num_text)), 'validation': ds['validation'].select(np.arange(num_text))})

In [6]:
## define the model and tokenizer function
## Mymodel definition
from transformers import GPT2Model, GPT2LMHeadModel, GPT2Config, PreTrainedModel
import torch
from torch.nn import CrossEntropyLoss



class MyModel(PreTrainedModel):
    config_class = GPT2Config

    def __init__(self, config):
        super().__init__(config)
        self.encoder = GPT2Model(config)
        self.second_encoder = GPT2Model(config)
        self.decoder = GPT2LMHeadModel(config)

    def forward(self, input_ids, labels=None, attention_mask=None):
        encoder_outputs = self.encoder(input_ids)
        hidden_embedding = encoder_outputs.last_hidden_state[:,-1,:].unsqueeze(1)
        # just to obtain the hidden embeddings
        with torch.no_grad():
            decoder_hidden_inputs = self.second_encoder(input_ids, output_hidden_states=True).hidden_states[0]
        #hidden_embedding_dim = hidden_embedding.shape[2]
        updated_input = torch.cat((hidden_embedding, decoder_hidden_inputs), dim=1)
        logits = self.decoder(inputs_embeds=updated_input)['logits']
        logits = F.log_softmax(logits, dim=-1)
        shifted_prediction_scores = logits[:, 1:-1, :]
        
        labels[attention_mask == 0] = -100 
        labels = labels[:, 1:]
        loss_fct = CrossEntropyLoss()
        lm_loss = loss_fct(shifted_prediction_scores.contiguous().view(-1, self.config.vocab_size), labels.contiguous().view(-1))
        return {'loss': lm_loss, 'logits':logits[:,1:,:]}
    

## defining tokenizer
from transformers import AutoTokenizer
#tokenizer = AutoTokenizer.from_pretrained('google-t5/t5-small')
tokenizer = AutoTokenizer.from_pretrained('gpt2')

context_length = 512
def tokenize(element):
    #print('element is ', len(element['text']))
    #return {'input_ids': []}
    #print('len is ', ('</s>'.join(x) for x in element['text']).type)
    outputs = tokenizer(
        element['text'],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
        padding=True
    )

    input_batch = []
    length_batch = []
    for length, input_ids in zip(outputs['length'], outputs['input_ids']):
        num_tokens = sum(token != tokenizer.pad_token_id for token in input_ids) - 1
        #print('last id is ', input_ids[-1])
        if length <= context_length:
            input_batch.append(input_ids)
            length_batch.append(num_tokens)
    #print('batch length is ', len(input_batch))
    return {'input_ids': input_batch, 'length': length_batch}




In [7]:
## load the model
#checkpoint = './model3weights_2024-07-04--16:34:15'
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
#tokenizer
from mymodelnew2 import MyModel
checkpoint = 'posmodel3_weights_2024-07-29--17:41:44alaki'
model = MyModel.from_pretrained(checkpoint)


In [8]:
## don't run this unless you want t5 model instead!
## initializing T5 encoder model
### obtaining the average of the embeddings
from transformers import T5Tokenizer, T5EncoderModel
tokenizer = T5Tokenizer.from_pretrained('google-t5/t5-small')
model = T5EncoderModel.from_pretrained('google-t5/t5-small')


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
## build the raw dataset
tokenized_dataset_raw = ds.map(tokenize, batched=True, remove_columns=ds['train'].column_names)
tokenized_dataset_raw.save_to_disk('./tinystories-validation-ctxlen=512')

Saving the dataset (1/1 shards): 100%|██████████| 5000/5000 [00:00<00:00, 50705.08 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5000/5000 [00:00<00:00, 38646.42 examples/s]


In [None]:
## subsample the raw dataset
import numpy as np
#num_text = 5000
tokenized_dataset = tokenized_dataset_raw['validation'].select(np.arange(num_text))
## move model to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = 'cpu'
model.to(device)
#tokenized_dataset = {'input_ids': tokenized_dataset['input_ids'].to(device)} 

In [13]:
len(tokenized_dataset[0]['input_ids'])

512

In [None]:
## don't run this!!
## new dataset handling
from datasets import load_from_disk
tokenized_dataset_raw = load_from_disk('./model3-outputs-gpt2tokenizer')
tokenized_dataset = tokenized_dataset_raw['validation'].select(np.arange(num_text))
## changing model to cuda
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


In [None]:
torch.cuda.empty_cache()
batch_size = 120
num_batch = int(num_text / batch_size)
all_hidden_embeddings = torch.empty(0, model.config.hidden_size).to(device)
with torch.no_grad():
    for i in range(num_batch):
        #print(len(model.encoder(torch.tensor(tokenized_dataset['input_ids'])[i*batch_size:(i+1)*batch_size,:].to(device))))
        #print('salam')
        hidden_embeddings = model.encoder(torch.tensor(tokenized_dataset['input_ids'])[i*batch_size:(i+1)*batch_size,:].to(device)).last_hidden_state[np.arange(batch_size), tokenized_dataset['length'][i*batch_size:(i+1)*batch_size], :] 
        all_hidden_embeddings = torch.cat((all_hidden_embeddings, hidden_embeddings), dim=0)
all_hidden_embeddings.shape
    #torch.cuda.empty_cache()
#all_hidden_embeddings = torch.tensor(all_hidden_embeddings)

In [1]:
import torch
mytensor = torch.tensor([[1,2],[3,4]])
mytensor.shape[0]

torch.Size([2, 2])

In [13]:
## don't run this unless you want to find embeddings using T5 encoder!
torch.cuda.empty_cache()
batch_size = 100
num_batch = int(num_text / batch_size)
all_hidden_embeddings = torch.empty(0, context_length, model.config.hidden_size).to(device)
with torch.no_grad():
    for i in range(num_batch):
        #print(len(model.encoder(torch.tensor(tokenized_dataset['input_ids'])[i*batch_size:(i+1)*batch_size,:].to(device))))
        #print('salam')
        hidden_embeddings = model.encoder(torch.tensor(tokenized_dataset['input_ids'])[i*batch_size:(i+1)*batch_size,:].to(device)).last_hidden_state
        all_hidden_embeddings = torch.cat((all_hidden_embeddings, hidden_embeddings), dim=0)
all_hidden_embeddings.shape
    #torch.cuda.empty_cache()
#all_hidden_embeddings = torch.tensor(all_hidden_embeddings)

torch.Size([5000, 700, 512])

In [None]:
#result = model.encoder(torch.tensor([tokenized_dataset['input_ids'][0]]).to('cuda')).last_hidden_state
#result[0][-1]

In [14]:
## run this only if you are finding the embeddings with T5 encoder!

## run this only if you are using average token embedding!
#alaki = torch.rand((3,4))
#condition = torch.unsqueeze(torch.arange(3), dim=0).repeat(2,1).t()
#condition

valid_len = torch.unsqueeze(torch.tensor(tokenized_dataset['length']), dim=1).repeat(1, context_length)
#print('len tokenized dataset is ', len(tokenized_dataset['input_ids']))
row_index = torch.unsqueeze(torch.arange(context_length), dim=0).repeat(len(tokenized_dataset['input_ids']), 1)
condition = row_index <= valid_len
condition = torch.unsqueeze(condition, dim=2).repeat(1,1,model.config.hidden_size)
#all_hidden_embeddings.shape
#condition.shape
all_hidden_embeddings_processed = torch.where(condition.to(device), all_hidden_embeddings, 0.0)
all_hidden_embeddings = torch.mean(all_hidden_embeddings_processed, dim=1)
all_hidden_embeddings.shape
#all_hidden_embeddings[valid_len]

torch.Size([5000, 512])

In [32]:
np_hidden_embeddings = np.array(all_hidden_embeddings.to('cpu')).astype('float32')
np_hidden_embeddings.shape

(4920, 768)

In [33]:
## pick one of the stories as the base for searech
#tokenized_dataset
base_index = 9
input_text = tokenizer.decode(tokenized_dataset['input_ids'][base_index], skip_special_tokens=True)
input_text
import faiss
index_flat = faiss.IndexFlatL2(model.config.hidden_size)
res = faiss.StandardGpuResources()
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
gpu_index_flat.add(np_hidden_embeddings)
k = 2
D, I = gpu_index_flat.search(np.expand_dims(np_hidden_embeddings[base_index], axis=0), k)
close_text = tokenizer.decode(tokenized_dataset['input_ids'][I[0][1]], skip_special_tokens=True)
print('close text index is ' + str(I[0][1]))
close_text

close text index is 771


'Once upon a time, there was a little girl named Lily. She liked to play outside in the sunshine. One day, she saw a butterfly. The butterfly was very pretty and had many colors. \n\nLily said, "Hello, butterfly! Can I play with you?" \n\nThe butterfly said, "Yes, you can play with me. But I can\'t stay too long. I will melt if I stay in the sun too much." \n\nLily said, "Oh no! I don\'t want you to melt. Let\'s go find some shade." \n\nThey found a tree with lots of shade. The butterfly said, "Thank you for finding a shady spot for me. You are very kind." \n\nLily smiled and said, "You\'re welcome. I like to help insects when they need it. I have some water available if you\'re thirsty." \n\nThe butterfly drank the water and then flew away. Lily was happy that she could help the butterfly. She went back to playing outside, feeling proud of herself for being kind to the little insect.'

In [34]:
print(all_hidden_embeddings[9])
torch.norm(all_hidden_embeddings[6] - all_hidden_embeddings[20])

tensor([ 1.6828e+00, -1.3115e+00,  6.2090e-01,  1.3430e+00,  2.1449e+00,
         1.0749e-01,  2.9974e-01, -1.7864e+00,  9.9046e-01,  1.0543e+00,
         2.7673e-01,  1.3266e+00,  8.0879e-01,  1.0688e+00,  4.0232e-01,
        -4.5541e-01,  1.4120e+00, -2.3848e-01,  2.9317e+00, -1.2302e+00,
         3.4106e-01,  1.5886e+00, -1.9384e-01,  4.9927e-01, -1.7283e+00,
         1.1958e-01, -1.1766e+00, -6.0489e-01, -2.6862e-01,  1.3256e+00,
        -3.7113e-01,  2.7773e-01,  3.7017e-01,  6.9553e-01,  1.2615e+00,
        -2.7678e-01,  7.2701e-01, -7.8779e-01,  9.7299e-01, -1.5636e-01,
         6.5418e-01, -9.2435e-01,  1.1597e+00, -1.3186e+00,  8.8128e-01,
         2.0889e-01, -9.1116e-01, -1.7272e-01, -3.9783e-01,  2.1831e+00,
         1.9208e+00, -9.6646e-01,  1.7172e-01,  1.1205e+00, -4.9456e-01,
         3.3881e-01,  1.6079e+00,  7.1630e-01, -2.9640e-01,  6.2130e-01,
        -8.5389e-01,  6.8214e-01, -1.2316e-01,  7.9154e-01, -4.7565e-01,
        -6.5235e-01,  1.1552e+00, -5.0727e-01,  1.5

tensor(40.3134, device='cuda:0')

In [35]:
import textwrap
def wrap(line):
    broken = textwrap.wrap(line,70, break_long_words=False)
    #print('broken is ', broken)
    return '\n'.join(broken)

w_input_text = wrap(input_text)
w_close_text = wrap(close_text)
print('INPUT TEXT IS: ', w_input_text)
print('CLOSE TEXT IS: ', w_close_text)

INPUT TEXT IS:  Once upon a time, there was a girl named Mia. Mia loved her jewelry.
She had a big box full of pretty things. She liked to wear them all
day. But at night, she had to sleep.  One day, Mia met a talking cat
named Tom. Tom was a tough cat, but he was nice. Tom said, "Hi, Mia! I
like your jewelry. Can I wear some too?" Mia said, "Yes, Tom. You can
wear my jewelry, but we have to give it back before we sleep."  So,
Mia and Tom played together. They wore the jewelry and had fun. They
pretended to be kings and queens. They laughed and danced. But soon,
the sun went down, and it was time for bed.  Mia said, "Tom, we must
give back the jewelry now. It's time to sleep." Tom gave back the
jewelry and said, "Thank you, Mia. I had fun today." They put the
jewelry back in the box and went to sleep. Mia and Tom were happy, and
they had sweet dreams.
CLOSE TEXT IS:  Once upon a time, there was a little girl named Lily. She liked to
play outside in the sunshine. One day, she saw a butt

In [6]:
### obtaining the average of the embeddings
from transformers import T5Tokenizer, T5EncoderModel
t5tokenizer = T5Tokenizer.from_pretrained('google-t5/t5-small')
t5model = T5EncoderModel.from_pretrained('google-t5/t5-small')
input_ids = t5tokenizer(
    'Studies have been shown that owning a dog is good for you', return_tensors='pt'
).input_ids
outputs = t5model.encoder(input_ids)
last_hidden_states = outputs.last_hidden_state
last_hidden_states.shape


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


torch.Size([1, 15, 512])

tensor([0.5824, 0.1530, 0.1705])

In [4]:
torch.cuda.empty_cache()
batch_size = 20
num_batch = int(num_text / batch_size)
all_hidden_embeddings = torch.empty(0, model.config.hidden_size).to(device)
with torch.no_grad():
    for i in range(num_batch):
        #print(len(model.encoder(torch.tensor(tokenized_dataset['input_ids'])[i*batch_size:(i+1)*batch_size,:].to(device))))
        #print('salam')
        hidden_embeddings = t5model(torch.tensor(tokenized_dataset['input_ids'])[i*batch_size:(i+1)*batch_size,:].to(device)).last_hidden_state[np.arange(batch_size), tokenized_dataset['length'][i*batch_size:(i+1)*batch_size], :] 
        all_hidden_embeddings = torch.cat((all_hidden_embeddings, hidden_embeddings), dim=0)
all_hidden_embeddings.shape

NameError: name 'torch' is not defined

In [2]:
import nbformat as nbf
with open('knn-search.ipynb', 'r') as f:
    nb = nbf.read(f, as_version=4)
with open('./knn-search/two-similar-stories-found-within-500-stories.ipynb', 'w') as f:
    nbf.write(nb, f)