In [1]:
import sys
sys.path.append("..")
from src import metrics
from src import constant
from src.utils import get_device, set_seed, haversine
from src.datasets.mp16 import MP16Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import polars as pl
from transformers import AutoProcessor, AutoModel
from src.eval_s4 import merge_responses

  from .autonotebook import tqdm as notebook_tqdm
  from pkg_resources import packaging


In [None]:
# Project helper called with a fixed value of 42 — presumably seeds the RNGs for reproducibility; confirm in src.utils.
set_seed(42)
# Device returned by the project helper; the model and every batch are moved onto it further down.
device = get_device()
# Hugging Face checkpoint id shared by the processor here and the model loaded later.
clip_model_name = "openai/clip-vit-large-patch14"
# Preprocessor for the checkpoint above; collate_fn reads this module-level name.
processor = AutoProcessor.from_pretrained(clip_model_name)

In [None]:
def collate_fn(batch):
    """Turn a list of dataset samples into one batch of processor outputs.

    Each sample is expected to expose its image under the ``"image"`` key.
    The images are handed to the module-level ``processor`` and the result is
    returned as PyTorch tensors (``return_tensors="pt"``).
    """
    return processor(
        images=[sample["image"] for sample in batch],
        return_tensors="pt",
    )


In [20]:
# Train split; loaded here but not referenced again in the cells visible in this notebook chunk.
df_ref = pl.read_csv("../datasets/mp16-reason-train.csv")
# Test split: its images are the queries, and its LAT/LON columns are the ground truth later on.
df_test = pl.read_csv("../datasets/mp16-reason-test.csv")
dataset = MP16Dataset(
    df_test,
    img_col="IMG_ID",
    img_base_path="../datasets/mp16-reason",
)
# No shuffle argument is passed, so the loader keeps the dataset's order (DataLoader default);
# the evaluation below pairs embeddings with df_test rows by position.
loader = DataLoader(
    dataset,
    batch_size=constant.BATCH_SIZE//4,
    collate_fn=collate_fn,
)

In [21]:
# Load the checkpoint, move it onto the selected device and put it in eval mode.
# Both .to() and .eval() return the module itself, so the calls can be chained.
clip_model = AutoModel.from_pretrained(clip_model_name).to(device).eval()

Loading weights: 100%|██████████| 590/590 [00:00<00:00, 2060.29it/s, Materializing param=visual_projection.weight]                                
[1mCLIPModel LOAD REPORT[0m from: openai/clip-vit-large-patch14
Key                                  | Status     |  | 
-------------------------------------+------------+--+-
text_model.embeddings.position_ids   | UNEXPECTED |  | 
vision_model.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [22]:
from qdrant_client import QdrantClient, models


# Client for a Qdrant server expected at 127.0.0.1:6333; the similarity-search cell
# below queries the "mp16-reason-clip" collection through it.
client = QdrantClient(host="127.0.0.1", port=6333)

In [23]:
import torch
import numpy as np

# Left over from a disabled per-batch evaluation (Qdrant batch query followed by
# metrics.precision_k); nothing in this cell appends to them.
prec_10 = []
prec_100 = []

# One CPU tensor of image features per batch, in loader order.
embeddings = []
with torch.no_grad():
    for batch in tqdm(loader, desc="encode"):
        image_features = clip_model.get_image_features(
            **{name: tensor.to(device) for name, tensor in batch.items()}
        )
        # Rebinding the name drops the device-side output before the cache is emptied.
        image_features = image_features.pooler_output.cpu()
        embeddings.append(image_features)

        torch.cuda.empty_cache()
        del image_features

encode: 100%|██████████| 375/375 [01:48<00:00,  3.45it/s]


In [24]:
# Stack the per-batch feature tensors into one tensor with one row per test image.
query_embeddings = torch.vstack(embeddings)

In [42]:
# Nearest-neighbour lookup: one Qdrant query per test embedding.
# ret_responses[i] holds up to 100 payload dicts for query i, each extended with
# the hit's similarity under "sim_score".
ret_responses = []

for query in tqdm(query_embeddings.tolist(), desc="Similarity search"):
    res = [
        {**point.payload, "sim_score": point.score}
        for point in client.query_points(
            "mp16-reason-clip",
            query=query,
            using="dense",
            limit=100,
        ).points
    ]
    ret_responses.append(res)

Similarity search: 100%|██████████| 12000/12000 [01:30<00:00, 133.15it/s]


In [43]:
# Coordinates for the distance evaluation.
# Ground truth: one [LAT, LON] pair per test row, in df_test row order.
gt_gps = df_test.select("LAT", "LON").to_numpy().tolist()

# Retrieved: for every query, the (LAT, LON) of each returned neighbour, kept in
# the order the search returned them.
ret_gps = [
    [(hit["LAT"], hit["LON"]) for hit in hits]
    for hits in ret_responses
]

# Optional (disabled): truncate ret_responses to the top-5 hits per query.

In [44]:
def haversine_np(gps1: list | tuple | np.ndarray, gps2: list | tuple | np.ndarray):
    if not isinstance(gps1, np.ndarray):
        gps1 = np.array(gps1)
    if not isinstance(gps2, np.ndarray):
        gps2 = np.array(gps2)

    gps1 = np.atleast_2d(gps1)
    gps2 = np.atleast_2d(gps2)

    lat1, lon1 = np.radians(gps1).T
    lat2, lon2 = np.radians(gps2).T

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arcsin(np.sqrt(a))

    dist = 6371 * c
    if dist.size == 1:
        return dist.item()
    return dist

In [45]:
def precision_k(
    gt_gps: np.ndarray, ret_gps: np.ndarray, k: int = 10, min_dist: int = 50
):
    """Share of the first ``k`` retrieved points lying at least ``min_dist`` away.

    Distances come from ``haversine_np(gt_gps, ret_gps)`` and are transposed so
    that each row corresponds to one query and each column to one retrieved
    rank (as used in this notebook: ``gt_gps`` of shape (N, 2) and ``ret_gps``
    of shape (N, K, 2)).

    Args:
        gt_gps: Ground-truth (lat, lon) per query.
        ret_gps: Retrieved (lat, lon) per query and rank.
        k: Number of leading ranks to consider.
        min_dist: Distance threshold, in the unit returned by ``haversine_np``.

    Returns:
        Mean over all queries and the first ``k`` ranks of the boolean
        ``distance >= min_dist``.
    """
    per_query = haversine_np(gt_gps, ret_gps).T
    top_k = per_query[:, :k]
    far_enough = top_k >= min_dist
    return np.mean(far_enough)

In [54]:
# Sanity check: one ground-truth pair per query, and a (queries, hits, 2) grid of retrieved coordinates.
np.array(gt_gps).shape, np.array(ret_gps).shape

((12000, 2), (12000, 100, 2))

In [55]:
# NOTE(review): this rebinds `metrics`, hiding the `src.metrics` module imported in
# the first cell; the name is kept because later cells may read this dict.
# Precision at each cut-off, using a 250 distance-unit threshold.
metrics = {
    f"precision@{k}": precision_k(gt_gps, ret_gps, k=k, min_dist=250).item()
    for k in (10, 100)
}

metrics

{'precision@10': 0.5738083333333334, 'precision@100': 0.6649058333333333}

In [51]:
# Show every retrieved payload (plus "sim_score") for the query at index 10; the output is long.
ret_responses[10]

[{'IMG_ID': '72_81_1347073991.jpg',
  'AUTHOR': '51585498@N00',
  'LAT': 49.759734,
  'LON': 6.644647,
  'S3_Label': 2.0,
  'S16_Label': 11.0,
  'S365_Label': 91.0,
  'Prob_indoor': 0.009086004939407,
  'Prob_natural': 0.1641019709768159,
  'Prob_urban': 0.8268120365000726,
  'neighbourhood': 'Mitte-Gartenfeld',
  'city': 'Trier',
  'county': None,
  'state': 'Rhineland-Palatinate',
  'region': None,
  'country': 'Germany',
  'country_code': 'de',
  'continent': None,
  'reason': 'The architecture in the image features a large stone structure with Roman-style columns and arches, which suggests it might be an ancient Roman monument.\nThe design and construction style are characteristic of Roman triumphal arches, often found in Europe where the Roman Empire once had significant influence.\nThe surrounding greenery and the layout of the area hint at a well-maintained historical site, possibly in a region that values preserving such structures.\nThe specific design of the arch, with its mu