
Sim search b2f #72

Closed
wants to merge 38 commits into master from sim_search_b2f
Changes from 23 commits

38 commits
baedf7e
wip sim search integration
moilerat Dec 15, 2023
8ce0f33
move feature extraction to import
emmaamblard Dec 19, 2023
add930f
- JsonDict -> List[Dict]
bea777515 Dec 20, 2023
5fac3bd
samples are not correct, we need to have a similar endpoint for simil…
moilerat Dec 22, 2023
43ff498
typo in endpoint that doesn't explain the fact it is not reachable fr…
moilerat Dec 22, 2023
5eaf4b1
add seed_object_id in filter to get them in api on back
moilerat Dec 22, 2023
53ff797
I would rather put out of github the .idea
moilerat Dec 23, 2023
6ded9bf
fix typo in endpoint
moilerat Dec 23, 2023
0649bea
first working version of unsupervised search in ecotaxa by lovnower, wip
moilerat Dec 23, 2023
00180b8
cleaning
moilerat Dec 23, 2023
732fa26
replace obj_cnn_features table with obj_cnn_features_vector
emmaamblard Jan 11, 2024
5547438
use postgres docker with pgvector installed for automated tests
emmaamblard Jan 11, 2024
eeb5ecb
add basic similarity search as a job + test
emmaamblard Jan 23, 2024
57bc453
attempt to fix import bug in simsearch test
emmaamblard Jan 23, 2024
4899815
add filters to similarity search
emmaamblard Jan 24, 2024
ab3ef53
remove print and broken test
emmaamblard Jan 24, 2024
4cf7731
launch feature extraction job before similarity search
emmaamblard Jan 25, 2024
ff780aa
add similarity scores + test similarity search with filters on taxo
emmaamblard Jan 26, 2024
fa2f418
fix filters bug
emmaamblard Jan 30, 2024
5a49108
replace function used in similarity search endpoint
emmaamblard Jan 30, 2024
5ebce10
remove limit to similarity search results set by the object_set query
emmaamblard Jan 31, 2024
511a7fd
remove prints
emmaamblard Jan 31, 2024
64579a4
add blank new line in README to test right to commit and config of lo…
Feb 15, 2024
a9cbb3f
first version script upgrade db with alchemy
Feb 16, 2024
e0475c3
mention emma and lovnower for ss and cnnvector in license (only comme…
moilerat Feb 17, 2024
94bc185
fix check that cnn_features are selected https://github.com/ecotaxa/e…
moilerat Feb 17, 2024
a1a9abc
trying to take into account @grololo remark https://github.com/ecotax…
moilerat Feb 17, 2024
e673612
update test value for missing descriptors in test ss https://github.c…
Feb 17, 2024
a98114f
using pgvector/pgvector:pg16 docker in NRT github actions tests after…
moilerat Feb 18, 2024
0b8d146
remove blank line to force github actions that weren't trigge by chan…
moilerat Feb 19, 2024
e34102b
change version migration after following different advice
Feb 19, 2024
7068f40
Merge branch 'sim_search_b2f' of github.com:ecotaxa/ecotaxa_back into…
Feb 19, 2024
f48d3a2
remove useless migration create and drop linked to 'scories' with R i…
moilerat Feb 19, 2024
3707c44
remove 'scories' due to different management of password in prod and …
moilerat Feb 19, 2024
f1c90fe
script migrate data cnnfeature vector
moilerat Feb 19, 2024
fb7b430
# Conflicts:
grololo06 Feb 19, 2024
b60f723
fix: upgrade script for new pgvector table
grololo06 Feb 19, 2024
190852f
Merge branch 'master' into sim_search_b2f
moilerat Feb 19, 2024
4 changes: 2 additions & 2 deletions .github/workflows/auto_tests.yml
@@ -23,8 +23,8 @@ jobs:
services:
# Label used to access the service container
postgres:
# Docker Hub image
image: postgres:14.9
# Docker Hub image with pgvector installed
image: ankane/pgvector:v0.2.7
# Provide the password for postgres
env:
POSTGRES_PASSWORD: postgres12
2 changes: 2 additions & 0 deletions QA/py/requirements.txt
@@ -9,6 +9,8 @@ sqlalchemy_views==0.3.1
# When SQLAlchemy moves to 2.0 we'll be able to use psycopg3
#psycopg[binary]==3.0.8
psycopg2-binary==2.9.3
# pgvector for similarity search
pgvector==0.2.4
# Pillow needs manual testing
Pillow==8.1.0
# astral needs pytz
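For context, the pgvector Python package added above provides a SQLAlchemy Vector column type. A minimal sketch of how the new obj_cnn_features_vector table (named in the commit list) could be declared; the column names and the query shown are illustrative assumptions, not necessarily EcoTaxa's actual schema:

from pgvector.sqlalchemy import Vector
from sqlalchemy import Column, Integer
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class ObjectCNNFeaturesVector(Base):
    __tablename__ = "obj_cnn_features_vector"  # table name from the commit list
    objcnnid = Column(Integer, primary_key=True)  # hypothetical column name
    features = Column(Vector(50))  # 50 dimensions, matching the dummy features in the test below

# Nearest-neighbor queries can then use pgvector's distance operators, e.g.:
# session.query(ObjectCNNFeaturesVector).order_by(
#     ObjectCNNFeaturesVector.features.l2_distance(target_vector)
# ).limit(10)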
100 changes: 100 additions & 0 deletions QA/py/tests/test_similarity_search.py
@@ -0,0 +1,100 @@
import logging
import pytest
import pandas as pd
import numpy as np

from starlette import status

from tests.credentials import ADMIN_AUTH
from tests.jobs import get_job_and_wait_until_ok, api_check_job_ok
from tests.test_objectset_query import _prj_query
from tests.test_classification import classify_all

from BO.Prediction import DeepFeatures

from API_operations.CRUD.ObjectParents import SamplesService

OBJECT_SET_SIMILARITY_SEARCH_URL = "/object_set/similarity_search"

copepod_id = 25828
entomobryomorpha_id = 25835
crustacea = 12846


def similarity_scores(target_id, distances_to_target):
return [round(1 - d / max(distances_to_target), 4) for d in distances_to_target]

def test_similarity_search(database, fastapi, caplog):
caplog.set_level(logging.ERROR)
from tests.test_import import test_import

prj_id = test_import(database, caplog, "Test Similarity Search")

obj_ids = _prj_query(fastapi, ADMIN_AUTH, prj_id)
assert len(obj_ids) == 8

# Prepare dummy features
features = list()
for i, oi in enumerate(obj_ids):
features.append([(i + 1) * 0.1] * 50)
features_df = pd.DataFrame(features, index=obj_ids)

target_id = obj_ids[0]
distances_to_target = [np.linalg.norm(features_df.loc[target_id] - features_df.loc[oi]) for oi in obj_ids]

# Test similarity search without features
url = OBJECT_SET_SIMILARITY_SEARCH_URL
req = {
"project_id": prj_id,
"target_id": target_id,
}
filters = {}
req_and_filters = {"filters": filters, "request": req}
rsp = fastapi.post(url, headers=ADMIN_AUTH, json=req_and_filters)

assert rsp.status_code == status.HTTP_200_OK
assert rsp.json()["message"] == "Missing CNN features, feature extraction job launched"
assert rsp.json()["neighbor_ids"] == []
assert rsp.json()["sim_scores"] == []

# Insert dummy features
with SamplesService() as sce:
n_inserts = DeepFeatures.save(sce.session, features_df)
assert n_inserts == 8
sce.session.commit()

# Test similarity search with features
url = OBJECT_SET_SIMILARITY_SEARCH_URL
req = {
"project_id": prj_id,
"target_id": target_id,
}
filters = {}
req_and_filters = {"filters": filters, "request": req}
rsp = fastapi.post(url, headers=ADMIN_AUTH, json=req_and_filters)

assert rsp.status_code == status.HTTP_200_OK
assert rsp.json()["message"] == "Success"
assert rsp.json()["neighbor_ids"] == obj_ids
assert rsp.json()["sim_scores"] == similarity_scores(target_id, distances_to_target)

# Set different taxo ids
classify_all(fastapi, obj_ids[0:3], copepod_id)
classify_all(fastapi, obj_ids[3:5], entomobryomorpha_id)
classify_all(fastapi, obj_ids[5:8], crustacea)
taxo_ids_to_filter = [copepod_id, entomobryomorpha_id]

# Test similarity search with filters on taxo
url = OBJECT_SET_SIMILARITY_SEARCH_URL
req = {
"project_id": prj_id,
"target_id": target_id,
}
filters = {"taxo": ",".join([str(taxo_id) for taxo_id in taxo_ids_to_filter])}
req_and_filters = {"filters": filters, "request": req}
rsp = fastapi.post(url, headers=ADMIN_AUTH, json=req_and_filters)

assert rsp.status_code == status.HTTP_200_OK
assert rsp.json()["message"] == "Success"
assert rsp.json()["neighbor_ids"] == obj_ids[0:5]
assert rsp.json()["sim_scores"] == similarity_scores(target_id, distances_to_target[0:5])
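The sim_scores assertions above rely on the scaling in similarity_scores: each L2 distance to the target is rescaled so that the farthest neighbor scores 0.0 and the target itself (distance 0) scores 1.0. A quick worked example:

distances = [0.0, 1.0, 2.0, 4.0]
scores = [round(1 - d / max(distances), 4) for d in distances]
# -> [1.0, 0.75, 0.5, 0.0]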
1 change: 1 addition & 0 deletions README.md
@@ -10,6 +10,7 @@ In this directory:
- `py` is for Python back-end
- `QA` contains all tests & measurements on the code.


In `docker` one can find build scripts, as well as a simple docker-compose configuration for setting up a DB server quickly, without impacting your whole
system. It also embeds a PgAdmin4 docker image.

2 changes: 1 addition & 1 deletion py/.idea/misc.xml

2 changes: 1 addition & 1 deletion py/.idea/py.iml

1 change: 1 addition & 0 deletions py/.idea/runConfigurations/uvicorn.xml

9 changes: 9 additions & 0 deletions py/API_models/filters.py
@@ -73,6 +73,10 @@ class ProjectFiltersDict(TypedDict, total=False):
filt_last_annot: Optional[str]
""" Coma-separated list of annotator, i.e. person who validated the classification
in last. """
seed_object_id: Optional[str]
"""
Target objid for similarity search
"""


class _ProjectFilters2Model(DescriptiveModel):
@@ -225,6 +229,11 @@ class _ProjectFilters2Model(DescriptiveModel):
        description="Comma-separated list of annotators, i.e. persons who validated the classification last.",
example="34,67",
)
seed_object_id = Field(
title="Seed object id",
description="Target objid for similarity search",
example="1234",
)

class Config:
schema_extra = {
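As an illustration, a ProjectFiltersDict combining the new seed_object_id field with the existing taxonomy filter could look like this (values are hypothetical, taxa ids borrowed from the test above):

filters: ProjectFiltersDict = {
    "taxo": "25828,25835",     # comma-separated category ids
    "seed_object_id": "1040",  # target objid for similarity search
}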
64 changes: 64 additions & 0 deletions py/API_models/simsearch.py
@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
# This file is part of Ecotaxa, see license.md in the application root directory for license information.
# Copyright (C) 2015-2020 Picheral, Colin, Irisson (UPMC-CNRS)
#
from typing import List, Optional, Dict

from helpers.pydantic import BaseModel, Field

class SimilaritySearchReq(BaseModel):
"""
Similarity search request.
"""

project_id: int = Field(
title="Project Id",
description="The destination project, in which we want to find similar objects.",
)

target_id: int = Field(
title="Target Id",
description="The object we want to find similar objects for.",
)

class Config:
schema_extra = {
"title": "Similarity Search Request",
"description": "How to find similar objects, in details.",
"example": {
"project_id": 3426,
"target_id": 1040,
},
}


class SimilaritySearchRsp(BaseModel):
"""
Similarity search response.
"""

neighbor_ids: List[int] = Field(
title="Neighbor IDs",
description="The list of similar objects.",
)

sim_scores: List[float] = Field(
title="Similarity Scores",
description="The list of similarity scores.",
)

message: Optional[str] = Field(
title="Message",
description="A message to the user.",
)

class Config:
schema_extra = {
"title": "Similarity Search Response",
"description": "The list of similar objects.",
"example": {
"neighbor_ids": [1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047],
"sim_scores": [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3],
"message": "Success",
},
}
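A hypothetical client-side call matching these two models and the /object_set/similarity_search route exercised in the test above; the host and the authentication header are assumptions:

import requests

payload = {
    "filters": {},
    "request": {"project_id": 3426, "target_id": 1040},
}
rsp = requests.post(
    "https://ecotaxa.example.org/api/object_set/similarity_search",  # host is hypothetical
    headers={"Authorization": "Bearer <token>"},  # auth scheme assumed
    json=payload,
)
body = rsp.json()  # SimilaritySearchRsp shape: neighbor_ids, sim_scores, message
print(body["message"], list(zip(body["neighbor_ids"], body["sim_scores"]))[:3])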
52 changes: 52 additions & 0 deletions py/API_operations/FeatureExtraction.py
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# This file is part of Ecotaxa, see license.md in the application root directory for license information.
# Copyright (C) 2015-2021 Picheral, Colin, Irisson (UPMC-CNRS)
#
# Extract deep features on a project.
#
from typing import Dict, List, Tuple

import numpy as np

from BO.Rights import RightsBO, Action
from BO.User import UserIDT
from DB.Project import ProjectIDT, Project
from FS.MachineLearningModels import SavedModels
from FS.Vault import Vault
from helpers.DynamicLogs import get_logger, LogsSwitcher
from .helpers.JobService import JobServiceOnProjectBase, ArgsDict

logger = get_logger(__name__)


class FeatureExtractionForProject(JobServiceOnProjectBase):
""" """

JOB_TYPE = "FeatureExtraction"

def __init__(self, prj_id: ProjectIDT) -> None:
super().__init__(prj_id)
self.vault = Vault(self.config.vault_dir())
self.models_dir = SavedModels(self.config)

def run(self, current_user_id: UserIDT) -> None:
"""
Initial creation, do security and consistency checks, then create the job.
"""
_user, _project = RightsBO.user_wants(
self.session, current_user_id, Action.ANNOTATE, self.prj_id
)
# TODO: more checks, e.g. deep features models consistency
# Security OK, create pending job
self.create_job(self.JOB_TYPE, current_user_id)

def do_background(self) -> None:
"""
Background part of the job.
"""
with LogsSwitcher(self):
self.ensure_deep_features_job()

    def ensure_deep_features_job(self) -> None:
        # Placeholder: overridden in GPUFeatureExtractionForProject, which runs
        # where the ML libraries and hardware are available.
        ...
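The call site that drives this service is not part of the diff; a hedged sketch of the expected shape, based on the context-manager usage of the GPU subclass visible in GPU_Prediction.py below (prj_id and current_user_id are placeholders):

# Sketch: create the pending feature-extraction job for a project.
with FeatureExtractionForProject(prj_id) as sce:
    sce.run(current_user_id)  # checks rights, then creates the job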

76 changes: 76 additions & 0 deletions py/API_operations/GPU_FeatureExtraction.py
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
# This file is part of Ecotaxa, see license.md in the application root directory for license information.
# Copyright (C) 2015-2021 Picheral, Colin, Irisson (UPMC-CNRS)
#
# Extract DeepFeatures on a project.
#
from typing import Dict, List, Tuple

import numpy as np

from BO.Prediction import DeepFeatures
from BO.Rights import RightsBO, Action
from DB.Project import ProjectIDT, Project
from ML.Deep_features_extractor import DeepFeaturesExtractor
from helpers.DynamicLogs import get_logger
from .helpers.JobService import JobServiceBase
from .FeatureExtraction import FeatureExtractionForProject
from helpers.Timer import CodeTimer

logger = get_logger(__name__)

# Blank out the job type on the base class so that, at run time, "FeatureExtraction" jobs resolve to the subclass below
FeatureExtractionForProject.JOB_TYPE = ""

class GPUFeatureExtractionForProject(FeatureExtractionForProject):
"""
Part of the feature extraction which needs special HW and libs.
"""

JOB_TYPE = "FeatureExtraction"

DEEP_EXTRACT_CHUNK = 10000

def ensure_deep_features_job(self) -> None:
"""
Ensure that deep features are present for given project.
"""
proj_id = self.prj_id
model_name = self.prj.cnn_network_id

msg = self._ensure_deep_features_for(proj_id, model_name)
done_infos = {"message": msg}

self.set_job_result(errors=[], infos=done_infos)
return

def _ensure_deep_features_for(self, proj_id: ProjectIDT, model_name: str) -> str:
"""
Ensure that deep features are present for given project.
"""
# Get data i.e objects ID and images from the project
ids_and_images = DeepFeatures.find_missing(self.ro_session, proj_id)
if len(ids_and_images) == 0:
return "All CNN present for %d" % proj_id

# Do reasonable chunks so we can see logs...
nb_rows = 0
extractor = DeepFeaturesExtractor(self.vault, self.models_dir)
while len(ids_and_images) > 0:
chunk = {}
for objid, img in ids_and_images.items():
chunk[objid] = img
if len(chunk) >= self.DEEP_EXTRACT_CHUNK:
break
for objid in chunk.keys():
del ids_and_images[objid]

# Call feature extractor
features = extractor.run(chunk, model_name)

# Save CNN
            with CodeTimer("Saving %d new CNN " % len(chunk), logger):
nb_rows += DeepFeatures.save(self.session, features)
self.session.commit()

return "OK, %d CNN features computed and written for %d" % (nb_rows, proj_id)
6 changes: 4 additions & 2 deletions py/API_operations/GPU_Prediction.py
@@ -12,6 +12,7 @@
import numpy as np

from API_models.prediction import PredictionReq
from API_operations.GPU_FeatureExtraction import GPUFeatureExtractionForProject
from BO.Classification import ClassifIDListT
from BO.ObjectSet import DescribedObjectSet, EnumeratedObjectSet, ObjectIDListT
from BO.Prediction import DeepFeatures
@@ -272,8 +273,9 @@ def ensure_deep_features(self, tgt_project: Project) -> None:
model_name = tgt_project.cnn_network_id
assert model_name, "Target project has no cnn_network_id"
for a_projid in [tgt_project.projid] + self.req.source_project_ids:
diag = self._ensure_deep_features_for(a_projid, model_name)
logger.info(diag)
with GPUFeatureExtractionForProject(a_projid) as gpu_feature_extraction:
diag = gpu_feature_extraction._ensure_deep_features_for(a_projid, model_name)
logger.info(diag)

DEEP_EXTRACT_CHUNK = 10000
