# Search Engine

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import time
import os
def time_elapsed(s):
    """Print how many seconds have passed since ``s``.

    Used to establish baseline timings for the steps needed to set up a
    search engine.

    Args:
        s: Start timestamp in seconds, as returned by ``time.time()``.

    Returns:
        None. The elapsed time, rounded to 4 decimal places, is printed.
    """
    elapsed = round(time.time() - s, 4)
    message = "Time Elapsed: {} seconds".format(elapsed)
    print(message)

In [3]:
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torch

import PIL
import pickle
import os
import traceback
import json
import logging
import numpy as np

from search import SearchEngine
from networks import FeatureExtractor


## Initialize SearchEngine

In [4]:
# Directory passed to the engine as `save_directory`
# (presumably where computed embeddings are stored -- confirm in search.py).
save_directory = './embeddings'

# Engine covering the "text" and "image" modalities, with verbose output on.
search_engine = SearchEngine(
    ["text", "image"],
    save_directory=save_directory,
    verbose=True,
)

## Add models

In [5]:
# Load the pickled ResNet152 model trained for 15 epochs. The context manager
# closes the file handle as soon as unpickling finishes instead of leaking it.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("pickles/models/entire_nuswide_model_15.p", "rb") as model_file:
    resnet152_15 = pickle.load(model_file)

# Register the model's two embedding networks with the engine.
# `embedding_nets` and `input_dimensions` appear to be parallel to
# `modalities` (image first, then text) -- confirm against SearchEngine.add_model.
search_engine.add_model(
    name="ResNet152_15",
    modalities=["image", "text"],
    embedding_nets=[resnet152_15.modalityOneNet, resnet152_15.modalityTwoNet],
    input_dimensions=[(2048,), (300,)],
    output_dimension=64,
    desc="ResNet152 trained with 15 epochs")

# Attach the ResNet152 feature extractor as preprocessing for the "image" modality.
search_engine.models["ResNet152_15"].add_preprocessing(
    "image", FeatureExtractor("resnet152").get_embedding)

In [6]:
# Load the pickled ResNet152 model trained for 5 epochs. The context manager
# closes the file handle as soon as unpickling finishes instead of leaking it.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("pickles/models/entire_nuswide_model_5.p", "rb") as model_file:
    resnet152_5 = pickle.load(model_file)

# Register the model's two embedding networks with the engine.
# `embedding_nets` and `input_dimensions` appear to be parallel to
# `modalities` (image first, then text) -- confirm against SearchEngine.add_model.
search_engine.add_model(
    name="ResNet152_5",
    modalities=["image", "text"],
    embedding_nets=[resnet152_5.modalityOneNet, resnet152_5.modalityTwoNet],
    input_dimensions=[(2048,), (300,)],
    output_dimension=64,
    desc="ResNet152 trained with 5 epochs")

# Attach the ResNet152 feature extractor as preprocessing for the "image" modality.
search_engine.models["ResNet152_5"].add_preprocessing(
    "image", FeatureExtractor("resnet152").get_embedding)

In [7]:
# Load the pickled ResNet18 model trained for 5 epochs. The context manager
# closes the file handle as soon as unpickling finishes instead of leaking it.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("pickles/models/entire_nuswide_model_5-18.p", "rb") as model_file:
    resnet18_5 = pickle.load(model_file)

# Register the model's two embedding networks with the engine. The image input
# is 512-dimensional here, versus 2048 for the ResNet152 models.
# `embedding_nets` and `input_dimensions` appear to be parallel to
# `modalities` (image first, then text) -- confirm against SearchEngine.add_model.
search_engine.add_model(
    name="ResNet18_5",
    modalities=["image", "text"],
    embedding_nets=[resnet18_5.modalityOneNet, resnet18_5.modalityTwoNet],
    input_dimensions=[(512,), (300,)],
    output_dimension=64,
    desc="ResNet18 trained with 5 epochs")

# Attach the ResNet18 feature extractor as preprocessing for the "image" modality.
search_engine.models["ResNet18_5"].add_preprocessing(
    "image", FeatureExtractor("resnet18").get_embedding)

## Create Datasets

In [8]:
# Root folder scanned by ImageFolder.
image_directory = 'data/Flickr'

# ImageFolder.samples holds (path, class_index) pairs; keep only the path so
# that position i in this list gives the file path of image i.
image_from_idx = [path for path, _ in ImageFolder(image_directory).samples]

In [9]:
s = time.time()

# Collect the precomputed ResNet18 feature files in sorted (deterministic) order.
directory_18 = "data/nuswide_features/resnet18/"
filenames = sorted(os.path.join(directory_18, filename)
                   for filename in os.listdir(directory_18)
                   if filename.endswith(".npy"))

# Load every file once and concatenate at the end. Calling np.append inside
# the loop would re-copy the whole accumulated array for each file.
# Values are cast to float32 up front because the final tensor is float32.
feature_chunks = []
for filename in filenames:
    feature_chunks.append(np.load(filename).astype('float32', copy=False).ravel())

if feature_chunks:
    flat_features = np.concatenate(feature_chunks)
else:
    # No feature files found: produce an empty (0, 512) dataset.
    flat_features = np.empty(0, dtype='float32')

# Arrange the flat values into rows of 512 features; any trailing values that
# do not fill a complete row are dropped.
n_rows = flat_features.size // 512
image_data18 = flat_features[:n_rows * 512].reshape(n_rows, 512)

# Ensure float32 on the CPU before moving the data to the GPU.
image_data18 = torch.from_numpy(image_data18).float().cuda()

# Release the CPU-side buffers; only the GPU tensor is needed from here on.
del feature_chunks, flat_features
time_elapsed(s)

Time Elapsed: 23.2125 seconds


In [None]:
s = time.time()

# Collect the precomputed ResNet152 feature files in sorted (deterministic) order.
directory_152 = "data/nuswide_features/resnet152/"
filenames = sorted(os.path.join(directory_152, filename)
                   for filename in os.listdir(directory_152)
                   if filename.endswith(".npy"))

# Load every file once and concatenate at the end. Calling np.append inside
# the loop would re-copy the whole accumulated array for each file, and
# appending onto a float64 array would also undo the float32 cast.
feature_chunks = []
for filename in filenames:
    feature_chunks.append(np.load(filename).astype('float32', copy=False).ravel())

if feature_chunks:
    flat_features = np.concatenate(feature_chunks)
else:
    # No feature files found: produce an empty (0, 2048) dataset.
    flat_features = np.empty(0, dtype='float32')

# Arrange the flat values into rows of 2048 features; any trailing values that
# do not fill a complete row are dropped.
n_rows = flat_features.size // 2048
image_data152 = flat_features[:n_rows * 2048].reshape(n_rows, 2048)

# Ensure float32 on the CPU before moving the data to the GPU.
image_data152 = torch.from_numpy(image_data152).float().cuda()

# Release the CPU-side buffers; only the GPU tensor is needed from here on.
del feature_chunks, flat_features

time_elapsed(s)

In [None]:
s = time.time()

# Load the pickled embedding mapping. The context manager closes the file
# handle as soon as unpickling finishes instead of leaking it.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("pickles/word_embeddings/word_embeddings_tensors.p", "rb") as embeddings_file:
    FAST_TEXT = pickle.load(embeddings_file)
fast_text = FAST_TEXT

# Build two index-aligned lists from the mapping:
#   text_from_idx[i] -> key of entry i (presumably the word -- confirm)
#   text_data[i]     -> (value of entry i, i), i.e. each value paired with its index
text_from_idx = list(fast_text.keys())
text_data = [(value, idx) for idx, value in enumerate(fast_text.values())]

time_elapsed(s)

## Build DataLoaders

In [None]:
# Batch size shared by all three loaders.
batch_size = 128

# One loader per dataset built above. Only `batch_size` is set; every other
# DataLoader option is left at its default.
image18_dataloader = DataLoader(dataset=image_data18, batch_size=batch_size)
image152_dataloader = DataLoader(dataset=image_data152, batch_size=batch_size)
text_dataloader = DataLoader(dataset=text_data, batch_size=batch_size)

In [None]:
# Register the 512-dimensional ResNet18 image features as dataset "nus-wide_18",
# with the image file paths as targets.
search_engine.add_dataset(
    name="nus-wide_18",
    data=image18_dataloader,
    targets=image_from_idx,
    modality="image",
    dimension=(512,),
)

In [None]:
# Register the 2048-dimensional ResNet152 image features as dataset "nus-wide_152",
# with the image file paths as targets.
search_engine.add_dataset(
    name="nus-wide_152",
    data=image152_dataloader,
    targets=image_from_idx,
    modality="image",
    dimension=(2048,),
)

In [None]:
# Register the 300-dimensional text embeddings as dataset "fast_text",
# with the embedding keys as targets.
search_engine.add_dataset(
    name="fast_text",
    data=text_dataloader,
    targets=text_from_idx,
    modality="text",
    dimension=(300,),
)

## Build Indexes

In [None]:
# Index the "nus-wide_18" image dataset with the "ResNet18_5" model.
search_engine.build_index(
    dataset_name="nus-wide_18",
    model_name="ResNet18_5",
    binarized=False,
    load_embeddings=False,
    step_size=1000,
)

In [None]:
# Index the "nus-wide_152" image dataset with the "ResNet152_5" model.
search_engine.build_index(
    dataset_name="nus-wide_152",
    model_name="ResNet152_5",
    binarized=False,
    load_embeddings=False,
    step_size=1000,
)

In [None]:
# Index the "nus-wide_152" image dataset with the "ResNet152_15" model.
search_engine.build_index(
    dataset_name="nus-wide_152",
    model_name="ResNet152_15",
    binarized=False,
    load_embeddings=False,
    step_size=1000,
)

In [None]:
# Index the "fast_text" text dataset with the "ResNet18_5" model.
search_engine.build_index(
    dataset_name="fast_text",
    model_name="ResNet18_5",
    binarized=False,
    load_embeddings=False,
    step_size=1000,
)

In [None]:
# Index the "fast_text" text dataset with the "ResNet152_5" model.
search_engine.build_index(
    dataset_name="fast_text",
    model_name="ResNet152_5",
    binarized=False,
    load_embeddings=False,
    step_size=1000,
)

In [None]:
# Index the "fast_text" text dataset with the "ResNet152_15" model.
search_engine.build_index(
    dataset_name="fast_text",
    model_name="ResNet152_15",
    binarized=False,
    load_embeddings=False,
    step_size=1000,
)