## Download Data

In [1]:
!git clone https://github.com/cryanwashere/Search-Engine.git

Cloning into 'Search-Engine'...
remote: Enumerating objects: 918, done.[K
remote: Counting objects: 100% (918/918), done.[K
remote: Compressing objects: 100% (902/902), done.[K
remote: Total 918 (delta 13), reused 910 (delta 9), pack-reused 0[K
Receiving objects: 100% (918/918), 8.13 MiB | 16.51 MiB/s, done.
Resolving deltas: 100% (13/13), done.


In [1]:
%cd Search-Engine

/content/Search-Engine


In [3]:
!python get_urls.py

In [4]:
!mkdir data
!mkdir data/images
!mkdir data/pages

In [None]:
!python download_data.py

## Setup Multi-Modal model

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
import torchvision.transforms as transforms
from torch.utils.data import Dataset
import re
import os
import matplotlib.pyplot as plt
import random

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cpu = torch.device("cpu")

device

device(type='cuda')

In [6]:
class TextEncoder(nn.Module):
  def __init__(self, dim, vocab_size, max_seq_len):
    super(TextEncoder, self).__init__()

    self.embedding = nn.Embedding(vocab_size, dim)

    self.positional_embedding = nn.Parameter(torch.zeros(1, max_seq_len, dim))
  
  def forward(self, x):

    # x should be batches of tokens

    x = self.embedding(x)

    x = x + self.positional_embedding[:,:x.shape[1],:]

    return x

sample_model = TextEncoder(64, 10, 5)

sample_tokens = torch.tensor([[1,2,3,4]])
sample_model(sample_tokens).shape

torch.Size([1, 4, 64])

In [7]:
class ImageEncoder(nn.Module):
  def __init__(self, dim, input_size, patch_size):
    super(ImageEncoder, self).__init__()

    self.dim = dim

    scale = dim ** -0.5
    self.conv1 = nn.Conv2d(in_channels=3, out_channels=dim, kernel_size=patch_size, stride=patch_size, bias=False)
    self.positional_embedding = nn.Parameter(scale * torch.randn((input_size // patch_size) ** 2, dim))

  def forward(self, x):

    # x.shape:
    # [ batch_size, num_images, rgb, width, height ]

    batch_size, num_images, rgb, width, height = x.shape
    
    x = x.reshape(batch_size * num_images, rgb, width, height) 

    # x.shape:
    # [ batch_size * num_images, rgb, width, height ]

    x = self.conv1(x)

    batch_image_prod, dim, grid_w, grid_h = x.shape

    x = x.reshape(batch_size, num_images, dim, grid_w, grid_h)

    # x.shape:
    # [ batch_size, num_images, dim, grid_size, grid_size ]
    
    x = x.reshape(batch_size, num_images, self.dim, -1)

    # x.shape:
    # [ batch_size, num_images, dim, grid_size ** 2 ]

    x = x.permute(0, 1, 3, 2)

    # x.shape:
    # [ batch_size, num_images, grid_size ** 2, dim ]

    # the position of each image in the web page is irrelevant, however
    # this allows the model to encode the position of each patch 
    # on each image 

    x = x + self.positional_embedding

    batch_size, num_images, image_grid, dim = x.shape

    x = x.reshape(batch_size, num_images * image_grid, dim)

    return x


sample_model = ImageEncoder(64, 224, 32)

sample_input = torch.zeros(8, 4, 3, 224, 224)
sample_model(sample_input).shape

torch.Size([8, 196, 64])

In [8]:
class WebPageFeatureExtractor(nn.Module):
  def __init__(self, dim, num_heads, num_layers, hidden_dim, dropout, max_text_seq_len, image_size, vocab_size):
    super(WebPageFeatureExtractor, self).__init__()

    self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(dim, num_heads, hidden_dim, dropout),
            num_layers
    )
        
    scale = dim ** -0.5
    self.class_embedding = nn.Parameter(scale * torch.randn(dim))

    self.img_encoder = ImageEncoder(dim, image_size, 32)
    self.txt_encoder = TextEncoder(dim, vocab_size, max_text_seq_len)

  def forward(self, img, txt):

    # img:
    # a tensor of any amount of images
    # img.shape:
    # [ batch_size, num_images, rgb, image_size, image_size]

    img_embeddings = self.img_encoder(img)

    # img_embeddings.shape:
    # [ batch_size, grid_size ** 2, dim ]


    # txt:
    # a tensor of tokens
    # txt.shape
    # [ batch_size, seq_len ]

    txt_embeddings = self.txt_encoder(txt)

    # txt_embeddings.shape
    # [ batch_size, seq_len, dim ]


    batch_size = img_embeddings.shape[0]
    class_embeddings = self.class_embedding + torch.zeros(batch_size, 1, self.class_embedding.shape[-1]).to(self.class_embedding.device)
    
    x = torch.cat(
        [ class_embeddings, img_embeddings, txt_embeddings ],
        dim = 1
    )

    x = self.transformer(x)

    x = x[:, 0, :]

    return x

model = WebPageFeatureExtractor(
    dim = 512,
    num_heads = 8,
    num_layers = 8,
    hidden_dim = 1024, 
    dropout = 0.2, 
    max_text_seq_len = 256,
    image_size = 224,
    vocab_size = 30522
).to(device)

sample_text = torch.tensor([[1,2,3,4,5,6,7,8]]).to(device)
sample_images = torch.zeros(1, 4, 3, 224, 224).to(device)

model(sample_images, sample_text).shape

print(f"model uses {sum(p.numel() for p in model.parameters()) / 1000000} million parameters")

torch.Size([1, 512])

## Data

In [9]:
image_transform = transforms.Compose([
    transforms.ToTensor(),                                                       # Convert the image to a tensor
    transforms.Resize((224, 224), antialias=True),                               # Resize the image to (224, 224)
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize the image
])

def shuffle_tensor(tensor, dim):
    # Generate a random permutation of indices
    indices = torch.randperm(tensor.size(dim))

    # Shuffle the tensor along the specified dimension using the indices
    shuffled_tensor = tensor.index_select(dim, indices)

    return shuffled_tensor

def validate_datapoint(path):
  with open(os.path.join("data/pages",path), "r") as f:
    text = f.read()

  split_data = text.split('IMAGES_DONE')

  image_paths, text = split_data[0], split_data[1]


  images_tensor = torch.ones(1, 3, 224, 224)

  image_paths = image_paths.split('\n')
  image_paths = [path for path in image_paths if path != '']

  if len(image_paths) < 6:
    return False

  if len(text.split(' ')) < 512:
    return False

  #tokens = tokenizer.encode(text, return_tensors='pt').squeeze()
  
  return True



# Define the dataset class.
class WebContentDataset(Dataset):

    def __init__(self, paths):
        self.paths = paths

        self.max_images = 2
        self.max_tokens = 256

    def __len__(self):
      return len(self.paths)

    def __getitem__(self, idx):

      with open(os.path.join("data/pages",self.paths[idx]), "r") as f:
        text = f.read()

      split_data = text.split('IMAGES_DONE')

      image_paths, text = split_data[0], split_data[1]


      images_tensor = torch.ones(1, 3, 224, 224)

      image_paths = image_paths.split('\n')
      image_paths = [path for path in image_paths if path != '']

      random.shuffle(image_paths)
      image_paths = image_paths[:self.max_images * 2]

      for path in image_paths:
        image_tensor = image_transform(Image.open(path)).unsqueeze(0)
        images_tensor = torch.cat([images_tensor, image_tensor], dim=0)
      
      images_tensor = images_tensor[1:, :, :]

      images_tensor = shuffle_tensor(images_tensor, 0)

      tokens = tokenizer.encode(text, return_tensors='pt').squeeze()


      num_tokens = tokens.shape[0]
      num_images = images_tensor.shape[0]

      assert num_tokens >= self.max_tokens * 2
      assert num_images >= self.max_images * 2

      
      img_seq_1 = images_tensor[:self.max_images, :, :, :]
      img_seq_2 = images_tensor[self.max_images : self.max_images * 2, :, :, :]

      tok_seq_1 = tokens[:self.max_tokens]
      tok_seq_2 = tokens[self.max_tokens : self.max_tokens * 2]
      

      return (img_seq_1, tok_seq_1), (img_seq_2, tok_seq_2)

file_paths = os.listdir("data/pages")
file_paths = [path for path in file_paths if validate_datapoint(path)]
ds = WebContentDataset(file_paths)
len(ds)#, ds[0]

574

In [10]:
def visualize_datapoint(images, tokens):


  # Convert the tensor to numpy array
  images = images.transpose(1,3)

   # Apply softmax to convert logits to probabilities
  #images_probs = torch.sigmoid(images)

  # Scale the probabilities to a range of [0, 1]
  #images_scaled = (images_probs - torch.min(images_probs)) / (torch.max(images_probs) - torch.min(images_probs))

  images_np = images.numpy()

  # Calculate the number of rows and columns for subplots
  num_images = images_np.shape[0]
  num_rows = int(num_images ** 0.5)
  num_cols = int((num_images + num_rows - 1) / num_rows)

  # Create a figure and subplots
  fig, axes = plt.subplots(num_rows, num_cols, figsize=(10, 10))

  # Plot each image
  for i, ax in enumerate(axes.flat):
      if i < num_images:
          ax.imshow(images_np[i])
          ax.axis('off')
      else:
          ax.axis('off')

  # Adjust the layout and display the plot
  plt.tight_layout()
  plt.show()

  print(tokenizer.decode(tokens))

#dp_1, dp_2 = ds[3]

#visualize_datapoint(*dp_1)

In [11]:
batch_size = 32

dl = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=2)

In [12]:
criterion = torch.nn.CrossEntropyLoss()

parameters = list(model.parameters()) 
optim = torch.optim.Adam(parameters, lr = 0.00001)

Inference the model, using it for search

In [13]:
def cosine_similarity(vector1, vector2):

    # Normalize the tensors to unit length
    tensor1 = F.normalize(vector1, dim=0)
    tensor2 = F.normalize(vector2, dim=0)

    # Calculate the cosine similarity between the two tensors
    similarity = F.cosine_similarity(tensor1, tensor2, dim=0)

    return similarity.item()

In [14]:
def open_data(path : str):

  with open(path, "r") as f:
    text = f.read()

  split_data = text.split('IMAGES_DONE')

  image_paths, text = split_data[0], split_data[1]
  

  images_tensor = torch.ones(1, 3, 224, 224)

  image_paths = image_paths.split('\n')
  
  
  random.shuffle(image_paths)
  image_paths = image_paths[:2]

  image_paths = filter(lambda x : x != '', image_paths)
  for path in image_paths:
    image_tensor = image_transform(Image.open(path)).unsqueeze(0)
    images_tensor = torch.cat([images_tensor, image_tensor], dim=0)
  
  images_tensor = images_tensor[1:, :, :]

  images_tensor = images_tensor.unsqueeze(0)
  #images_tensor = shuffle_tensor(images_tensor, 0).unsqueeze(0)[:,:2,:,:,:]

  tokens = tokenizer.encode(text, return_tensors='pt')[:,:256]

  return images_tensor, tokens

def encode_data(path : str) -> torch.tensor:

  images, tokens = open_data(path)

  images, tokens = images.to(device), tokens.to(device)

  #print(images, tokens)

  vec = model(images, tokens)

  return vec


def get_search_vecs():

  search_dict = dict()

  for path in search_paths:
    vec = encode_data(path)

    page_name = path.split('/')[-1]
    search_dict[page_name] = vec
  
  return search_dict

def get_search_scores():
  vec_dict = get_search_vecs()

  query_vec = vec_dict["query.txt"].squeeze()

  for key in vec_dict.keys():
    key_vec = vec_dict[key].squeeze()

    similarity = cosine_similarity(query_vec, key_vec)

    vec_dict[key] = similarity
  
  return vec_dict



search_paths = os.listdir('sample_data/pages')
search_paths = [f"sample_data/pages/{page}" for page in search_paths][:5]
search_paths.append("sample_data/pages/query.txt")
print(search_paths)
get_search_scores()

['sample_data/pages/flower.txt', 'sample_data/pages/alexander_gerald.txt', 'sample_data/pages/learning_how_to_learn.txt', 'sample_data/pages/piano.txt', 'sample_data/pages/nature.txt', 'sample_data/pages/query.txt']


Token indices sequence length is longer than the specified maximum sequence length for this model (10489 > 512). Running this sequence through the model will result in indexing errors


{'flower.txt': 0.7702924013137817,
 'alexander_gerald.txt': 0.7663944363594055,
 'learning_how_to_learn.txt': 0.7842341065406799,
 'piano.txt': 0.7998403906822205,
 'nature.txt': 0.7305622696876526,
 'query.txt': 1.0000001192092896}

In [15]:
!pip install neptune

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
import neptune

run = neptune.init_run(
    project="cjryanwashere/Search-Engine",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI2ODc4MDgwNS0wYzEwLTQ4MWEtOTJlOS0wODAxY2JmNmIzYmEifQ==",
)  # your credentials


  run = neptune.init_run(


https://app.neptune.ai/cjryanwashere/Search-Engine/e/SEARCH-6


In [17]:
def write_search_scores():
  scores = get_search_scores()

  for key in scores.keys():
    run[f"cosine_similarity/{key}"].append(scores[key])

write_search_scores()

In [None]:
for epoch in range(100):
  print(f"epoch {epoch}")
  for i, batch in enumerate(dl):

    seq_1, seq_2 = batch
    (img_1, tok_1), (img_2, tok_2) = seq_1, seq_2
    img_1, tok_1, img_2, tok_2 = img_1.to(device), tok_1.to(device), img_2.to(device), tok_2.to(device)

    out_1, out_2 = model(img_1, tok_1), model(img_2, tok_2)
    #out_1, out_2 = model(*seq_1), model(*seq_2)

    similarity_matrix = out_1 @ out_2.T
    label = torch.eye(similarity_matrix.shape[0]).to(device)

    loss = criterion(similarity_matrix, label)
    run["train/loss"].append(loss.item())

    loss.backward()

    optim.step()

    optim.zero_grad()
    
    print(".",end='')

    if i % 20 == 0 and i != 0:
      write_search_scores()
      print("\n writing search scores...")

............................................................................................................................................................................................

In [42]:
run.stop()

Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/cjryanwashere/Search-Engine/e/SEARCH-4/metadata
