
Sim search b2f #72

Closed
wants to merge 38 commits into master from sim_search_b2f
Changes from 23 commits

38 commits
baedf7e
wip sim search integration
moilerat Dec 15, 2023
8ce0f33
move feature extraction to import
emmaamblard Dec 19, 2023
add930f
- JsonDict -> List[Dict]
bea777515 Dec 20, 2023
5fac3bd
samples are not correct, we need to have a similar endpoint for simil…
moilerat Dec 22, 2023
43ff498
typo in endpoint that doesn't explain the fact it is not reachable fr…
moilerat Dec 22, 2023
5eaf4b1
add seed_object_id in filter to get them in api on back
moilerat Dec 22, 2023
53ff797
I would rather put out of github the .idea
moilerat Dec 23, 2023
6ded9bf
fix typo in endpoint
moilerat Dec 23, 2023
0649bea
first working version of unsupervised search in ecotaxa by lovnower, wip
moilerat Dec 23, 2023
00180b8
cleaning
moilerat Dec 23, 2023
732fa26
replace obj_cnn_features table with obj_cnn_features_vector
emmaamblard Jan 11, 2024
5547438
use postgres docker with pgvector installed for automated tests
emmaamblard Jan 11, 2024
eeb5ecb
add basic similarity search as a job + test
emmaamblard Jan 23, 2024
57bc453
attempt to fix import bug in simsearch test
emmaamblard Jan 23, 2024
4899815
add filters to similarity search
emmaamblard Jan 24, 2024
ab3ef53
remove print and broken test
emmaamblard Jan 24, 2024
4cf7731
launch feature extraction job before similarity search
emmaamblard Jan 25, 2024
ff780aa
add similarity scores + test similarity search with filters on taxo
emmaamblard Jan 26, 2024
fa2f418
fix filters bug
emmaamblard Jan 30, 2024
5a49108
replace function used in similarity search endpoint
emmaamblard Jan 30, 2024
5ebce10
remove limit to similarity search results set by the object_set query
emmaamblard Jan 31, 2024
511a7fd
remove prints
emmaamblard Jan 31, 2024
64579a4
add blank new line in README to test right to commit and config of lo…
Feb 15, 2024
a9cbb3f
first version script upgrade db with alchemy
Feb 16, 2024
e0475c3
mention emma and lovnower for ss and cnnvector in license (only comme…
moilerat Feb 17, 2024
94bc185
fix check that cnn_features are selected https://github.com/ecotaxa/e…
moilerat Feb 17, 2024
a1a9abc
trying to take into account @grololo remark https://github.com/ecotax…
moilerat Feb 17, 2024
e673612
update test value for missing descriptors in test ss https://github.c…
Feb 17, 2024
a98114f
using pgvector/pgvector:pg16 docker in NRT github actions tests after…
moilerat Feb 18, 2024
0b8d146
remove blank line to force github actions that weren't trigge by chan…
moilerat Feb 19, 2024
e34102b
change version migration after following different advice
Feb 19, 2024
7068f40
Merge branch 'sim_search_b2f' of github.com:ecotaxa/ecotaxa_back into…
Feb 19, 2024
f48d3a2
remove useless migration create and drop linked to 'scories' with R i…
moilerat Feb 19, 2024
3707c44
remove 'scories' due to different management of password in prod and …
moilerat Feb 19, 2024
f1c90fe
script migrate data cnnfeature vector
moilerat Feb 19, 2024
fb7b430
# Conflicts:
grololo06 Feb 19, 2024
b60f723
fix: upgrade script for new pgvector table
grololo06 Feb 19, 2024
190852f
Merge branch 'master' into sim_search_b2f
moilerat Feb 19, 2024
4 changes: 2 additions & 2 deletions .github/workflows/auto_tests.yml
@@ -23,8 +23,8 @@ jobs:
services:
# Label used to access the service container
postgres:
# Docker Hub image
image: postgres:14.9
# Docker Hub image with pgvector installed
image: ankane/pgvector:v0.2.7
# Provide the password for postgres
env:
POSTGRES_PASSWORD: postgres12
2 changes: 2 additions & 0 deletions QA/py/requirements.txt
@@ -9,6 +9,8 @@ sqlalchemy_views==0.3.1
# When SQLAlchemy moves to 2.0 we'll be able to use psycopg3
#psycopg[binary]==3.0.8
psycopg2-binary==2.9.3
# pgvector for similarity search
pgvector==0.2.4
# Pillow needs manual testing
Pillow==8.1.0
# astral needs pytz
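For context, the pgvector Python package added above provides a SQLAlchemy Vector column type. A minimal sketch of how the new obj_cnn_features_vector table (named in the commit list) could be declared; the column names and the query shown are illustrative assumptions, not necessarily EcoTaxa's actual schema:

from pgvector.sqlalchemy import Vector
from sqlalchemy import Column, Integer
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class ObjectCNNFeaturesVector(Base):
    __tablename__ = "obj_cnn_features_vector"  # table name from the commit list
    objcnnid = Column(Integer, primary_key=True)  # hypothetical column name
    features = Column(Vector(50))  # 50 dimensions, matching the dummy features in the test below

# Nearest-neighbor queries can then use pgvector's distance operators, e.g.:
# session.query(ObjectCNNFeaturesVector).order_by(
#     ObjectCNNFeaturesVector.features.l2_distance(target_vector)
# ).limit(10)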
100 changes: 100 additions & 0 deletions QA/py/tests/test_similarity_search.py
@@ -0,0 +1,100 @@
import logging
import pytest
import pandas as pd
import numpy as np

from starlette import status

from tests.credentials import ADMIN_AUTH
from tests.jobs import get_job_and_wait_until_ok, api_check_job_ok
from tests.test_objectset_query import _prj_query
from tests.test_classification import classify_all

from BO.Prediction import DeepFeatures

from API_operations.CRUD.ObjectParents import SamplesService

OBJECT_SET_SIMILARITY_SEARCH_URL = "/object_set/similarity_search"

copepod_id = 25828
entomobryomorpha_id = 25835
crustacea = 12846


def similarity_scores(target_id, distances_to_target):
return [round(1 - d / max(distances_to_target), 4) for d in distances_to_target]

def test_similarity_search(database, fastapi, caplog):
caplog.set_level(logging.ERROR)
from tests.test_import import test_import

prj_id = test_import(database, caplog, "Test Similarity Search")

obj_ids = _prj_query(fastapi, ADMIN_AUTH, prj_id)
assert len(obj_ids) == 8

# Prepare dummy features
features = list()
for i, oi in enumerate(obj_ids):
features.append([(i + 1) * 0.1] * 50)
features_df = pd.DataFrame(features, index=obj_ids)

target_id = obj_ids[0]
distances_to_target = [np.linalg.norm(features_df.loc[target_id] - features_df.loc[oi]) for oi in obj_ids]

# Test similarity search without features
url = OBJECT_SET_SIMILARITY_SEARCH_URL
req = {
"project_id": prj_id,
"target_id": target_id,
}
filters = {}
req_and_filters = {"filters": filters, "request": req}
rsp = fastapi.post(url, headers=ADMIN_AUTH, json=req_and_filters)

assert rsp.status_code == status.HTTP_200_OK
assert rsp.json()["message"] == "Missing CNN features, feature extraction job launched"
assert rsp.json()["neighbor_ids"] == []
assert rsp.json()["sim_scores"] == []

# Insert dummy features
with SamplesService() as sce:
n_inserts = DeepFeatures.save(sce.session, features_df)
assert n_inserts == 8
sce.session.commit()

# Test similarity search with features
url = OBJECT_SET_SIMILARITY_SEARCH_URL
req = {
"project_id": prj_id,
"target_id": target_id,
}
filters = {}
req_and_filters = {"filters": filters, "request": req}
rsp = fastapi.post(url, headers=ADMIN_AUTH, json=req_and_filters)

assert rsp.status_code == status.HTTP_200_OK
assert rsp.json()["message"] == "Success"
assert rsp.json()["neighbor_ids"] == obj_ids
assert rsp.json()["sim_scores"] == similarity_scores(target_id, distances_to_target)

# Set different taxo ids
classify_all(fastapi, obj_ids[0:3], copepod_id)
classify_all(fastapi, obj_ids[3:5], entomobryomorpha_id)
classify_all(fastapi, obj_ids[5:8], crustacea)
taxo_ids_to_filter = [copepod_id, entomobryomorpha_id]

# Test similarity search with filters on taxo
url = OBJECT_SET_SIMILARITY_SEARCH_URL
req = {
"project_id": prj_id,
"target_id": target_id,
}
filters = {"taxo": ",".join([str(taxo_id) for taxo_id in taxo_ids_to_filter])}
req_and_filters = {"filters": filters, "request": req}
rsp = fastapi.post(url, headers=ADMIN_AUTH, json=req_and_filters)

assert rsp.status_code == status.HTTP_200_OK
assert rsp.json()["message"] == "Success"
assert rsp.json()["neighbor_ids"] == obj_ids[0:5]
assert rsp.json()["sim_scores"] == similarity_scores(target_id, distances_to_target[0:5])
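The sim_scores assertions above rely on the scaling in similarity_scores: each L2 distance to the target is rescaled so that the farthest neighbor scores 0.0 and the target itself (distance 0) scores 1.0. A quick worked example:

distances = [0.0, 1.0, 2.0, 4.0]
scores = [round(1 - d / max(distances), 4) for d in distances]
# -> [1.0, 0.75, 0.5, 0.0]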
1 change: 1 addition & 0 deletions README.md
@@ -10,6 +10,7 @@ In this directory:
- `py` is for Python back-end
- `QA` contains all tests & measurements on the code.


In `docker` one can find build scripts, as well as a simple docker-compose configuration for setting up a DB server quickly, without impacting your whole
system. It also embeds a PgAdmin4 docker image.

2 changes: 1 addition & 1 deletion py/.idea/misc.xml

2 changes: 1 addition & 1 deletion py/.idea/py.iml

1 change: 1 addition & 0 deletions py/.idea/runConfigurations/uvicorn.xml

9 changes: 9 additions & 0 deletions py/API_models/filters.py
@@ -73,6 +73,10 @@ class ProjectFiltersDict(TypedDict, total=False):
filt_last_annot: Optional[str]
""" Coma-separated list of annotator, i.e. person who validated the classification
in last. """
seed_object_id: Optional[str]
"""
Target objid for similarity search
"""


class _ProjectFilters2Model(DescriptiveModel):
@@ -225,6 +229,11 @@ class _ProjectFilters2Model(DescriptiveModel):
        description="Comma-separated list of annotators, i.e. persons who validated the classification last.",
example="34,67",
)
seed_object_id = Field(
title="Seed object id",
description="Target objid for similarity search",
example="1234",
)

class Config:
schema_extra = {
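As an illustration, a ProjectFiltersDict combining the new seed_object_id field with the existing taxonomy filter could look like this (values are hypothetical, taxa ids borrowed from the test above):

filters: ProjectFiltersDict = {
    "taxo": "25828,25835",     # comma-separated category ids
    "seed_object_id": "1040",  # target objid for similarity search
}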
64 changes: 64 additions & 0 deletions py/API_models/simsearch.py
@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
# This file is part of Ecotaxa, see license.md in the application root directory for license information.
# Copyright (C) 2015-2020 Picheral, Colin, Irisson (UPMC-CNRS)
#
from typing import List, Optional, Dict

from helpers.pydantic import BaseModel, Field

class SimilaritySearchReq(BaseModel):
"""
Similarity search request.
"""

project_id: int = Field(
title="Project Id",
description="The destination project, in which we want to find similar objects.",
)

target_id: int = Field(
title="Target Id",
description="The object we want to find similar objects for.",
)

class Config:
schema_extra = {
"title": "Similarity Search Request",
"description": "How to find similar objects, in details.",
"example": {
"project_id": 3426,
"target_id": 1040,
},
}


class SimilaritySearchRsp(BaseModel):
"""
Similarity search response.
"""

neighbor_ids: List[int] = Field(
title="Neighbor IDs",
description="The list of similar objects.",
)

sim_scores: List[float] = Field(
title="Similarity Scores",
description="The list of similarity scores.",
)

message: Optional[str] = Field(
title="Message",
description="A message to the user.",
)

class Config:
schema_extra = {
"title": "Similarity Search Response",
"description": "The list of similar objects.",
"example": {
"neighbor_ids": [1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047],
"sim_scores": [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3],
"message": "Success",
},
}
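A hypothetical client-side call matching these two models and the /object_set/similarity_search route exercised in the test above; the host and the authentication header are assumptions:

import requests

payload = {
    "filters": {},
    "request": {"project_id": 3426, "target_id": 1040},
}
rsp = requests.post(
    "https://ecotaxa.example.org/api/object_set/similarity_search",  # host is hypothetical
    headers={"Authorization": "Bearer <token>"},  # auth scheme assumed
    json=payload,
)
body = rsp.json()  # SimilaritySearchRsp shape: neighbor_ids, sim_scores, message
print(body["message"], list(zip(body["neighbor_ids"], body["sim_scores"]))[:3])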
52 changes: 52 additions & 0 deletions py/API_operations/FeatureExtraction.py
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# This file is part of Ecotaxa, see license.md in the application root directory for license information.
# Copyright (C) 2015-2021 Picheral, Colin, Irisson (UPMC-CNRS)
#
# Extract deep features on a project.
#
from typing import Dict, List, Tuple

import numpy as np

from BO.Rights import RightsBO, Action
from BO.User import UserIDT
from DB.Project import ProjectIDT, Project
from FS.MachineLearningModels import SavedModels
from FS.Vault import Vault
from helpers.DynamicLogs import get_logger, LogsSwitcher
from .helpers.JobService import JobServiceOnProjectBase, ArgsDict

logger = get_logger(__name__)


class FeatureExtractionForProject(JobServiceOnProjectBase):
""" """

JOB_TYPE = "FeatureExtraction"

def __init__(self, prj_id: ProjectIDT) -> None:
super().__init__(prj_id)
self.vault = Vault(self.config.vault_dir())
self.models_dir = SavedModels(self.config)

def run(self, current_user_id: UserIDT) -> None:
"""
Initial creation, do security and consistency checks, then create the job.
"""
_user, _project = RightsBO.user_wants(
self.session, current_user_id, Action.ANNOTATE, self.prj_id
)
# TODO: more checks, e.g. deep features models consistency
# Security OK, create pending job
self.create_job(self.JOB_TYPE, current_user_id)

def do_background(self) -> None:
"""
Background part of the job.
"""
with LogsSwitcher(self):
self.ensure_deep_features_job()

    def ensure_deep_features_job(self) -> None:
        # Placeholder: overridden in GPUFeatureExtractionForProject, which runs
        # where the ML libraries and hardware are available.
        ...
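The call site that drives this service is not part of the diff; a hedged sketch of the expected shape, based on the context-manager usage of the GPU subclass visible in GPU_Prediction.py below (prj_id and current_user_id are placeholders):

# Sketch: create the pending feature-extraction job for a project.
with FeatureExtractionForProject(prj_id) as sce:
    sce.run(current_user_id)  # checks rights, then creates the job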

76 changes: 76 additions & 0 deletions py/API_operations/GPU_FeatureExtraction.py
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
# This file is part of Ecotaxa, see license.md in the application root directory for license information.
# Copyright (C) 2015-2021 Picheral, Colin, Irisson (UPMC-CNRS)
#
# Extract DeepFeatures on a project.
#
from typing import Dict, List, Tuple

import numpy as np

from BO.Prediction import DeepFeatures
from BO.Rights import RightsBO, Action
from DB.Project import ProjectIDT, Project
from ML.Deep_features_extractor import DeepFeaturesExtractor
from helpers.DynamicLogs import get_logger
from .helpers.JobService import JobServiceBase
from .FeatureExtraction import FeatureExtractionForProject
from helpers.Timer import CodeTimer

logger = get_logger(__name__)

# Blank out the job type on the base class so that, at run time, "FeatureExtraction" jobs resolve to the subclass below
FeatureExtractionForProject.JOB_TYPE = ""

class GPUFeatureExtractionForProject(FeatureExtractionForProject):
"""
Part of the feature extraction which needs special HW and libs.
"""

JOB_TYPE = "FeatureExtraction"

DEEP_EXTRACT_CHUNK = 10000

def ensure_deep_features_job(self) -> None:
"""
Ensure that deep features are present for given project.
"""
proj_id = self.prj_id
model_name = self.prj.cnn_network_id

msg = self._ensure_deep_features_for(proj_id, model_name)
done_infos = {"message": msg}

self.set_job_result(errors=[], infos=done_infos)
return

def _ensure_deep_features_for(self, proj_id: ProjectIDT, model_name: str) -> str:
"""
Ensure that deep features are present for given project.
"""
# Get data i.e objects ID and images from the project
ids_and_images = DeepFeatures.find_missing(self.ro_session, proj_id)
if len(ids_and_images) == 0:
return "All CNN present for %d" % proj_id

# Do reasonable chunks so we can see logs...
nb_rows = 0
extractor = DeepFeaturesExtractor(self.vault, self.models_dir)
while len(ids_and_images) > 0:
chunk = {}
for objid, img in ids_and_images.items():
chunk[objid] = img
if len(chunk) >= self.DEEP_EXTRACT_CHUNK:
break
for objid in chunk.keys():
del ids_and_images[objid]

# Call feature extractor
features = extractor.run(chunk, model_name)

# Save CNN
            with CodeTimer("Saving %d new CNN " % len(chunk), logger):
nb_rows += DeepFeatures.save(self.session, features)
self.session.commit()

return "OK, %d CNN features computed and written for %d" % (nb_rows, proj_id)
6 changes: 4 additions & 2 deletions py/API_operations/GPU_Prediction.py
@@ -12,6 +12,7 @@
import numpy as np

from API_models.prediction import PredictionReq
from API_operations.GPU_FeatureExtraction import GPUFeatureExtractionForProject
from BO.Classification import ClassifIDListT
from BO.ObjectSet import DescribedObjectSet, EnumeratedObjectSet, ObjectIDListT
from BO.Prediction import DeepFeatures
@@ -272,8 +273,9 @@ def ensure_deep_features(self, tgt_project: Project) -> None:
model_name = tgt_project.cnn_network_id
assert model_name, "Target project has no cnn_network_id"
for a_projid in [tgt_project.projid] + self.req.source_project_ids:
diag = self._ensure_deep_features_for(a_projid, model_name)
logger.info(diag)
with GPUFeatureExtractionForProject(a_projid) as gpu_feature_extraction:
diag = gpu_feature_extraction._ensure_deep_features_for(a_projid, model_name)
logger.info(diag)

DEEP_EXTRACT_CHUNK = 10000
