# A Demo with DocumentArray

Import DocumentArray:

In [None]:
from docarray import Document, DocumentArray

Some configs:

In [None]:
# Directory that is checked for the image dataset (the toy data is downloaded when it is missing).
DATA_DIR = "./data"
# Glob pattern handed to DocumentArray.from_files to pick up the JPEGs inside DATA_DIR.
DATA_PATH = f"{DATA_DIR}/*.jpg"
# Passed as `size=` to DocumentArray.from_files -- presumably an upper bound on how many files are loaded.
MAX_DOCS = 1000
QUERY_IMAGE = "./query.jpg" # image we'll use to search with
PLOT_EMBEDDINGS = False # Really useful but have to manually stop it to progress to next cell

# Toy data - If data dir doesn't exist, we'll get data of ~800 fashion images from here
TOY_DATA_URL = "https://github.com/alexcg1/neural-search-notebooks/raw/main/fashion-search/data.zip?raw=true"

In [None]:
import os

if not os.path.isdir(DATA_DIR) and not os.path.islink(DATA_DIR):
    print(f"Can't find {DATA_DIR}. Downloading toy dataset")
    !wget "$TOY_DATA_URL" -O data.zip
    !unzip -q data.zip # Don't print out every darn filename
    !rm -f data.zip
else:
    print(f"Nothing to download. Using {DATA_DIR} for data")

Use `DocumentArray.from_files` to quickly load the images into a `DocumentArray`:

In [None]:
# Build the dataset DocumentArray from the files matching DATA_PATH;
# MAX_DOCS is passed as `size` (presumably caps how many files are loaded).
docs = DocumentArray.from_files(DATA_PATH, size=MAX_DOCS)
# Report how many Documents were actually loaded.
print(f"{len(docs)} Documents in DocumentArray")

Preview the images


In [None]:
# Quick visual sanity check of the loaded dataset images.
docs.plot_image_sprites()

Load each image into a tensor, then resize and normalize it so that all images have the same shape and value range:

In [None]:
def preproc(d: Document):
    """Turn an image Document (identified by its URI) into a model-ready tensor.

    Steps, in order: load the image behind the Document's URI into a tensor,
    force the tensor to shape (80, 60), normalize the colour values, and move
    the channel axis from the last position to the first for the PyTorch
    model used later.

    Returns the result of the final docarray call (expected to be the
    Document itself, since the calls are chained -- confirm against docarray).
    """
    loaded = d.load_uri_to_image_tensor()
    # Dataset images _should_ already be (80, 60); this makes sure of it.
    resized = loaded.set_image_tensor_shape((80, 60))
    normalized = resized.set_image_tensor_normalization()
    # Channel axis: last (-1) -> first (0).
    return normalized.set_image_tensor_channel_axis(-1, 0)
    
# Run preproc over every Document in the dataset. The result is not
# reassigned, so later cells rely on `docs` itself being updated -- confirm
# against docarray's `apply` semantics.
docs.apply(preproc)
# Trailing bare expression: lets the notebook display a summary of `docs`.
docs

Build the model: a plain pretrained ResNet50, used as-is.

In [None]:
import torch
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Trailing bare expression: echoes the chosen device in the notebook output.
device

In [None]:
# Install a pinned torchvision into the notebook environment.
# NOTE(review): torch was already imported in an earlier cell -- confirm this
# pin is compatible with the installed torch version.
%pip install torchvision==0.11.2
import torchvision
# Pretrained weights, full network (no layers removed).
model = torchvision.models.resnet50(pretrained=True)  # load ResNet50

In [None]:
# Run every dataset Document through the model on the selected device;
# the later match/plot cells depend on the embeddings this produces.
docs.embed(model, device=device)

In [None]:
# Optional: visualize the dataset embeddings. Off by default because the
# plot has to be stopped manually before the notebook can continue.
if PLOT_EMBEDDINGS:
    docs.plot_embeddings(image_sprites=True, image_source="uri")

Get the query document and apply the same processing to it as we did to the dataset.

In [None]:
# Download query doc
!wget https://github.com/alexcg1/neural-search-notebooks/raw/main/fashion-search/1_build_basic_search/query.jpg -O query.jpg

query_doc = Document(uri=QUERY_IMAGE)
query_doc.display()

In [None]:
# Wrap the single query Document in a DocumentArray so it can go through the
# same apply / embed / match calls that are used on the dataset.
query_docs = DocumentArray([query_doc])

In [None]:
# Apply the same preprocessing (preproc) that was used on the dataset, so the
# query tensor has the same shape, normalization and channel layout.
query_docs.apply(preproc)

In [None]:
# ...and create embedding just like we did with the dataset
query_docs.embed(model, device=device) # `device` was auto-selected earlier via torch.cuda.is_available(); no manual cuda/cpu edit needed

Do the MATCH.

In [None]:
# Match the query against the dataset, asking for at most 9 results
# (`limit=9`); the results are read back from `query_doc.matches` below.
query_docs.match(docs, limit=9)

In [None]:
def _to_displayable(d):
    """Reverse the display-relevant preprocessing on one matched Document.

    Moves the channel axis back from first (0) to last (-1) and then applies
    the inverse of the normalization, so the tensor can be drawn as an image.
    """
    restored_axes = d.set_image_tensor_channel_axis(0, -1)
    return restored_axes.set_image_tensor_inv_normalization()


# Build a separate DocumentArray from the matches (copy=True is requested,
# presumably so query_doc.matches itself is left untouched), restore each
# image, and show the results as sprites.
matches_copy = DocumentArray(query_doc.matches, copy=True)
matches_copy.apply(_to_displayable).plot_image_sprites()

In [None]:
# Optional: visualize the embeddings of the matched Documents. Only runs when
# PLOT_EMBEDDINGS is enabled, because the plot has to be stopped manually
# before the notebook can continue. (A second, unguarded copy of this call
# used to follow the `if`, which made the flag ineffective.)
if PLOT_EMBEDDINGS:
    query_doc.matches.plot_embeddings(image_sprites=True, image_source="uri")