# Kata 1: needle in a haystack

Another common use of STAC (some would say the primary use) is searching for items.
Let's compare how a few different searches perform against the **stac-fastapi-geoparquet** and **pgstac** backends.

In [5]:
from pystac_client import Client

from labs_375 import STAC_FASTAPI_GEOPARQUET_URI, STAC_FASTAPI_PGSTAC_URI, Timer

# GeoJSON point used as the geometry for the `intersects` searches below.
LONGMONT = {"type": "Point", "coordinates": [-105.1019, 40.1672]}

geoparquet_client = Client.open(STAC_FASTAPI_GEOPARQUET_URI)
pgstac_client = Client.open(STAC_FASTAPI_PGSTAC_URI)

# Run the same point-intersects search against each backend, geoparquet first,
# and report one timing line per backend.
for client in (geoparquet_client, pgstac_client):
    with Timer() as timer:
        search = client.search(collections=["naip"], intersects=LONGMONT)
        items = list(search.items_as_dicts())
        timer.report(items)


Retrieved 1 in 0.19s (5.32 items/s)
Retrieved 1 in 0.09s (10.80 items/s)


The `intersects` search is pretty slow; let's see how we do when searching by `id`.
We'll first do a full scan (capped at 10,000 items) to get the first and last IDs.

In [6]:
# Scan the collection once (capped at 10,000 items) to find ids to look up.
# The scan result keeps its own name so the timed lookups below do not
# overwrite it.
naip_items = list(
    geoparquet_client.search(
        collections=["naip"], max_items=10000, limit=10000
    ).items_as_dicts()
)
# Fail with a clear message instead of an opaque IndexError on an empty scan.
assert naip_items, "naip scan returned no items; cannot pick first/last ids"
first_id = naip_items[0]["id"]
last_id = naip_items[-1]["id"]

# Time a lookup by id on each backend, in the order:
# geoparquet (first, last), then pgstac (first, last).
for client in (geoparquet_client, pgstac_client):
    for item_id in (first_id, last_id):
        with Timer() as timer:
            items = list(
                client.search(collections=["naip"], ids=[item_id]).items_as_dicts()
            )
            timer.report(items)

Retrieved 1 in 0.25s (4.06 items/s)
Retrieved 1 in 0.26s (3.90 items/s)
Retrieved 1 in 0.11s (8.99 items/s)
Retrieved 1 in 0.07s (13.42 items/s)


OK, not as good as **pgstac**, but not terrible.
Let's try a CQL2 filter (only **pgstac** is timed here; the geoparquet variant is commented out in the cell below).

In [8]:
from cql2 import Expr

# NOTE(review): the geoparquet variant of this filter search is disabled; the
# reason is not recorded here — TODO confirm and either re-enable or remove.
# with Timer() as timer:
#     items = list(
#         geoparquet_client.search(
#             collections=["naip"], filter="naip:year = '2022'"
#         ).items_as_dicts()
#     )
#     timer.report(items)

# Build the filter before entering the timed region so that only the search
# itself is measured.
expr = Expr("naip:year = '2022'").to_json()
with Timer() as timer:
    year_search = pgstac_client.search(collections=["naip"], filter=expr)
    items = list(year_search.items_as_dicts())
    timer.report(items)


Retrieved 279 in 5.12s (54.44 items/s)
