# Pretrained DinoV2 model

In [2]:
%load_ext autoreload
%autoreload 2

Create a directory to store the models

In [None]:
# ! mkdir -p /mnt/data/models
# %cd /mnt/data/models

Copy the archive from GCS to the `/mnt/data/models` directory

In [None]:
# ! gsutil cp gs://dsgt-clef-plantclef-2024/data/models/PlantNet_PlantCLEF2024_pretrained_models_on_the_flora_of_south-western_europe.tar /mnt/data/models/

Extract the `.tar` archive

In [None]:
# ! tar -xvf /mnt/data/models/PlantNet_PlantCLEF2024_pretrained_models_on_the_flora_of_south-western_europe.tar -C /mnt/data/models

Use the pretrained model for inference

In [None]:
import pandas as pd
from urllib.request import urlopen
from PIL import Image
import timm
import torch


def load_class_mapping(class_list_file):
    """Read a class list file into a ``{line_index: stripped_line}`` dict.

    Line indices are zero-based and follow the order of the lines in the file;
    surrounding whitespace (including the trailing newline) is removed from
    every line.
    """
    index_to_name = {}
    with open(class_list_file) as handle:
        for position, raw_line in enumerate(handle):
            index_to_name[position] = raw_line.strip()
    return index_to_name


def load_species_mapping(species_map_file):
    """Read the species table and return a ``{species_id: species}`` dict.

    The file is parsed as ``;``-separated text with ``quoting=1`` and must
    contain the columns ``species_id`` and ``species``. ``species_id`` is read
    as a string so the ids are not converted to numbers.
    """
    table = pd.read_csv(
        species_map_file,
        sep=";",
        quoting=1,
        dtype={"species_id": str},
    )
    ids = table["species_id"]
    names = table["species"]
    return dict(zip(ids, names))


def main(image_url, class_mapping, species_mapping, pretrained_path):
    """Classify one image with the pretrained DINOv2 checkpoint and print the top 5.

    Args:
        image_url: ``http(s)://`` URL or local path of the image. ``None`` or
            an empty value skips inference and returns immediately.
        class_mapping: Path to the class list file read by
            ``load_class_mapping`` (line index -> species id).
        species_mapping: Path to the ``;``-separated file read by
            ``load_species_mapping`` (species id -> species name).
        pretrained_path: Checkpoint path handed to ``timm.create_model`` as
            ``checkpoint_path``.

    Returns:
        None. One line is printed per prediction: species id, species name and
        the softmax score scaled by 100.
    """
    # Guard first: the scheme check below needs a real string, and there is no
    # point loading the model when there is nothing to classify.
    if not image_url:
        return

    cid_to_spid = load_class_mapping(class_mapping)
    spid_to_sp = load_species_mapping(species_mapping)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = timm.create_model(
        "vit_base_patch14_reg4_dinov2.lvd142m",
        pretrained=False,
        num_classes=len(cid_to_spid),
        checkpoint_path=pretrained_path,
    )
    model = model.to(device)
    model = model.eval()

    # get model specific transforms (normalization, resize)
    data_config = timm.data.resolve_model_data_config(model)
    transforms = timm.data.create_transform(**data_config, is_training=False)

    # Decode inside a context manager so the HTTP response / file handle is
    # released; convert("RGB") forces the pixel data to load before the handle
    # closes and normalizes grayscale, RGBA and palette images to three channels.
    if str(image_url).startswith(("https://", "http://")):
        with urlopen(image_url) as response:
            img = Image.open(response).convert("RGB")
    else:
        with Image.open(image_url) as raw_img:
            img = raw_img.convert("RGB")

    # unsqueeze single image into batch of 1
    batch = transforms(img).unsqueeze(0).to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(batch)
        top5_probabilities, top5_class_indices = torch.topk(
            output.softmax(dim=1) * 100, k=5
        )
    top5_probabilities = top5_probabilities.cpu().numpy()
    top5_class_indices = top5_class_indices.cpu().numpy()

    for proba, cid in zip(top5_probabilities[0], top5_class_indices[0]):
        species_id = cid_to_spid[int(cid)]
        species = spid_to_sp[species_id]
        print(species_id, species, proba)

In [None]:
# Define your paths and image URL directly.
# `path` is the local directory holding the extracted pretrained-model files.
path = "/mnt/data/models/pretrained_models"
# Sample image used for the single-image smoke test below.
image_url = "https://lab.plantnet.org/LifeCLEF/PlantCLEF2024/single_plant_training_data/PlantCLEF2024singleplanttrainingdata/test/1361687/bd2d3830ac3270218ba82fd24e2290becd01317c.jpg"
# Mapping files consumed by load_class_mapping / load_species_mapping.
class_mapping_file = f"{path}/class_mapping.txt"
species_mapping_file = f"{path}/species_id_to_name.txt"
# Sub-directory of `path` that contains the checkpoint to load.
model_path = "vit_base_patch14_reg4_dinov2_lvd142m_pc24_onlyclassifier_then_all"
pretrained_path = f"{path}/{model_path}/model_best.pth.tar"

# Prints the top-5 predictions for `image_url`.
main(image_url, class_mapping_file, species_mapping_file, pretrained_path)

In [3]:
from plantclef.utils import get_spark
from pyspark.sql import functions as F

# Obtain the Spark session from the project helper and render it as the cell
# output; `spark` is reused by the later cells that read parquet data.
spark = get_spark()
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/11 20:26:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/11 20:26:12 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
# Location of the PlantCLEF 2024 test images stored as parquet in GCS
gcs_path = "gs://dsgt-clef-plantclef-2024"
test_data_path = "data/parquet_files/PlantCLEF2024_test"

# Full path to the parquet dataset
test_path = f"{gcs_path}/{test_data_path}"
# Read the dataset into a Spark DataFrame
test_df = spark.read.parquet(test_path)
# Preview the first 5 rows, truncating each cell to 50 characters
test_df.show(n=5, truncate=50)

                                                                                

+-------------------------------------------+------------------------+--------------------------------------------------+
|                                       path|              image_name|                                              data|
+-------------------------------------------+------------------------+--------------------------------------------------+
| /PlantCLEF2024test/CBN-Pla-B4-20160728.jpg| CBN-Pla-B4-20160728.jpg|[FF D8 FF E0 00 10 4A 46 49 46 00 01 01 01 00 4...|
| /PlantCLEF2024test/CBN-Pla-D3-20130808.jpg| CBN-Pla-D3-20130808.jpg|[FF D8 FF E0 00 10 4A 46 49 46 00 01 01 01 00 4...|
|/PlantCLEF2024test/CBN-PdlC-E4-20150701.jpg|CBN-PdlC-E4-20150701.jpg|[FF D8 FF E0 00 10 4A 46 49 46 00 01 01 01 00 4...|
| /PlantCLEF2024test/CBN-Pla-F5-20150901.jpg| CBN-Pla-F5-20150901.jpg|[FF D8 FF E0 00 10 4A 46 49 46 00 01 01 01 00 4...|
| /PlantCLEF2024test/CBN-Pla-D1-20180724.jpg| CBN-Pla-D1-20180724.jpg|[FF D8 FF E0 00 10 4A 46 49 46 00 01 01 01 00 4...|
+-----------------------

In [4]:
# Inspect column names and types of the test dataset
test_df.printSchema()

root
 |-- path: string (nullable = true)
 |-- image_name: string (nullable = true)
 |-- data: binary (nullable = true)



In [5]:
# Work on a cached 10-row sample while developing the transformer;
# count() is the action that materializes the sample.
limit_df = test_df.limit(10).cache()
limit_df.count()

                                                                                

10

In [6]:
import io

import pandas as pd
import timm
import torch
from PIL import Image
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, FloatType, MapType, StringType
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from plantclef.model_setup import setup_pretrained_model
from pyspark.sql import DataFrame
from pyspark.ml.functions import vector_to_array
from pyspark.ml import Pipeline
from pyspark.ml.feature import SQLTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class PretrainedDinoV2(
    Transformer,
    HasInputCol,
    HasOutputCol,
    DefaultParamsReadable,
    DefaultParamsWritable,
):
    """Spark ML transformer that scores encoded images with a timm classifier.

    The model checkpoint and the class-index -> species-id mapping are loaded
    when the transformer is constructed. For every row, the bytes in the input
    column are decoded as an image and the 20 highest softmax scores (scaled
    by 100) are written to the output column as an array of single-entry
    ``{species_id: score}`` maps.
    """

    def __init__(
        self,
        pretrained_path: str,
        input_col: str = "input",
        output_col: str = "output",
        model_name: str = "vit_base_patch14_reg4_dinov2.lvd142m",
        batch_size: int = 8,
    ):
        """
        Args:
            pretrained_path: Checkpoint path passed to ``timm.create_model``
                as ``checkpoint_path``.
            input_col: Name of the column holding the encoded image bytes.
            output_col: Name of the column that receives the predictions.
            model_name: timm architecture name.
            batch_size: Stored on the instance but not used by the per-row
                UDF in this class.
        """
        super().__init__()
        self._setDefault(inputCol=input_col, outputCol=output_col)
        self.model_name = model_name
        self.batch_size = batch_size
        self.pretrained_path = pretrained_path
        self.num_classes = 7806  # total number of plant species
        self.local_directory = "/mnt/data/models/pretrained_models"
        self.class_mapping_file = f"{self.local_directory}/class_mapping.txt"
        # Model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = timm.create_model(
            self.model_name,
            pretrained=False,
            num_classes=self.num_classes,
            checkpoint_path=self.pretrained_path,
        )
        self.model.to(self.device)
        self.model.eval()
        # Data transform
        self.data_config = timm.data.resolve_model_data_config(self.model)
        self.transforms = timm.data.create_transform(
            **self.data_config, is_training=False
        )
        # Select the configured output column rather than a hard-coded name so
        # the SQL stage keeps working for any `output_col`.
        self.sql_statement = f"SELECT image_name, {output_col} FROM __THIS__"
        self.cid_to_spid = self._load_class_mapping()

    def _load_class_mapping(self):
        """Read ``class_mapping_file`` into a ``{line_index: stripped_line}`` dict."""
        with open(self.class_mapping_file) as f:
            class_index_to_class_name = {i: line.strip() for i, line in enumerate(f)}
        return class_index_to_class_name

    def _make_predict_fn(self):
        """Return a per-row prediction function that closes over this transformer."""

        def predict(input_data: bytes) -> list:
            """Score one encoded image and return its top-20 predictions."""
            # Decode a single image; convert("RGB") makes grayscale, RGBA and
            # palette images reach the transform with three channels.
            img = Image.open(io.BytesIO(input_data)).convert("RGB")
            # Apply the model transform and add a leading batch dimension of 1
            processed_image = self.transforms(img).unsqueeze(0).to(self.device)

            with torch.no_grad():
                outputs = self.model(processed_image)
                probabilities = torch.softmax(outputs, dim=1) * 100
                top_probs, top_indices = torch.topk(probabilities, k=20)

            top_probs = top_probs.cpu().numpy()[0]
            top_indices = top_indices.cpu().numpy()[0]

            # One single-entry dict per prediction, ordered by descending score.
            # int()/float() turn numpy scalars into plain Python values for the
            # dict lookup and for Spark's FloatType.
            result = [
                {self.cid_to_spid.get(int(index), "Unknown"): float(prob)}
                for index, prob in zip(top_indices, top_probs)
            ]
            return result

        return predict

    def _transform(self, df: DataFrame):
        """Add the output column by applying the prediction UDF to the input column."""
        # Create a UDF from the predict function
        predict_fn = self._make_predict_fn()
        predict_udf = F.udf(predict_fn, ArrayType(MapType(StringType(), FloatType())))

        return df.withColumn(
            self.getOutputCol(),
            predict_udf(self.getInputCol()),
        )

    def transform(self, df) -> DataFrame:
        """Run ``_transform`` and make sure every feature column is an array column."""
        transformed = self._transform(df)

        for c in self.feature_columns:
            # check if the feature is a vector and convert it to an array
            if "array" in transformed.schema[c].simpleString():
                continue
            transformed = transformed.withColumn(c, vector_to_array(F.col(c)))
        return transformed

    @property
    def feature_columns(self) -> list:
        """Columns produced by this transformer (the configured output column)."""
        return [self.getOutputCol()]

    def pipeline(self):
        """Build a pipeline of this transformer followed by the column-selecting SQL stage."""
        return Pipeline(stages=[self, SQLTransformer(statement=self.sql_statement)])

    def run(self, df: DataFrame) -> DataFrame:
        """Fit the pipeline on ``df`` and return the transformed DataFrame."""
        model = self.pipeline().fit(df)
        transformed = model.transform(df)

        return transformed

In [8]:
# Resolve the checkpoint path via the project helper
# (presumably downloads/extracts the model when missing — see the cell output).
pretrained_path = setup_pretrained_model()
# Read image bytes from `data` and write predictions to `dino_logits`.
pretrained_dino = PretrainedDinoV2(
    pretrained_path=pretrained_path,
    input_col="data",
    output_col="dino_logits",
)

Model already exists. Skipping download and extraction.


In [21]:
# Score the cached sample and keep the result cached for the cells below
transformed_df = pretrained_dino.run(df=limit_df).cache()
# Preview the first 5 rows, truncating each cell to 50 characters
transformed_df.show(n=5, truncate=50)

[Stage 18:>                                                         (0 + 1) / 1]

+------------------------+--------------------------------------------------+
|              image_name|                                       dino_logits|
+------------------------+--------------------------------------------------+
| CBN-Pla-B4-20160728.jpg|[{1361281 -> 7.84536}, {1741880 -> 4.1319184}, ...|
| CBN-Pla-D3-20130808.jpg|[{1741706 -> 6.7095423}, {1741903 -> 5.509813},...|
|CBN-PdlC-E4-20150701.jpg|[{1392323 -> 6.9817915}, {1361281 -> 5.2335153}...|
| CBN-Pla-F5-20150901.jpg|[{1361281 -> 14.984538}, {1390764 -> 10.862804}...|
| CBN-Pla-D1-20180724.jpg|[{1361281 -> 35.631886}, {1651363 -> 21.19073},...|
+------------------------+--------------------------------------------------+
only showing top 5 rows



                                                                                

In [None]:
# Inspect the schema of the prediction output
transformed_df.printSchema()

root
 |-- image_name: string (nullable = true)
 |-- dino_logits: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: float (valueContainsNull = true)



In [29]:
import csv


class PretrainedInferenceTask:
    """Turns per-image predictions into a ``plot_id;species_ids`` submission CSV.

    Input rows must provide ``image_name`` (string) and ``dino_logits`` (a list
    of single-entry ``{species_id: score}`` mappings); these are the only
    fields the methods below read.
    """

    def __init__(
        self,
        default_root_dir: str,
        k: int = 5,
    ):
        """
        Args:
            default_root_dir: Directory or URL prefix under which ``run``
                writes ``dsgt_run_top_{k}_species.csv``.
            k: Number of leading predictions per image that are considered.
        """
        self.default_root_dir = default_root_dir
        self.k = k

    def _format_species_ids(self, species_ids: list) -> str:
        """Formats the species IDs in single square brackets, separated by commas."""
        formatted_ids = ", ".join(str(species_id) for species_id in species_ids)
        return f"[{formatted_ids}]"

    def _extract_top_k_species(self, logits: list) -> list:
        """Returns the distinct species IDs among the first ``k`` predictions.

        First-occurrence order is preserved. A missing prediction list
        (``None``/empty) yields ``[]``, and ``None``/empty entries inside the
        list are skipped instead of raising.
        """
        if not logits:
            return []
        top_species = [next(iter(item)) for item in logits[: self.k] if item]
        # dict.fromkeys de-duplicates while keeping insertion order.
        return list(dict.fromkeys(top_species))

    def _remove_extension(self, filename: str) -> str:
        """Removes the file extension from the filename."""
        return filename.rsplit(".", 1)[0]

    def _submission_rows(self, spark_df):
        """Collects ``spark_df`` and yields ``(plot_id, formatted_species_ids)`` pairs."""
        for row in spark_df.collect():
            plot_id = self._remove_extension(row["image_name"])
            top_k_species = self._extract_top_k_species(row["dino_logits"])
            yield plot_id, self._format_species_ids(top_k_species)

    def _prepare_and_write_submission(self, spark_df: "DataFrame") -> "pd.DataFrame":
        """Collects the Spark DataFrame into a pandas submission table.

        Despite its name, this method does not write anything; it returns a
        pandas DataFrame with the columns ``plot_id`` and ``species_ids``
        (present even when ``spark_df`` has no rows).
        """
        records = [
            {"plot_id": plot_id, "species_ids": formatted_species}
            for plot_id, formatted_species in self._submission_rows(spark_df)
        ]
        return pd.DataFrame(records, columns=["plot_id", "species_ids"])

    def write_submission_csv(self, df, output_path):
        """Writes the predictions, ordered by ``image_name``, to a local CSV file.

        Args:
            df: Spark DataFrame with ``image_name`` and ``dino_logits`` columns.
            output_path: Local path of the ``;``-separated file to create.
        """
        with open(output_path, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(
                file, delimiter=";", quoting=csv.QUOTE_NONE, escapechar="\\"
            )
            writer.writerow(["plot_id", "species_ids"])
            writer.writerows(self._submission_rows(df.orderBy("image_name")))

    def _write_csv_to_gcs(self, df):
        """Writes the pandas submission table to ``default_root_dir``.

        The target is ``{default_root_dir}/dsgt_run_top_{k}_species.csv``.
        """
        output_path = f"{self.default_root_dir}/dsgt_run_top_{self.k}_species.csv"
        columns = ["plot_id", "species_ids"]

        if "://" in output_path:
            # The built-in open() only understands local paths, so a gs:// URL
            # can never be written with it. pandas hands URL-style paths to
            # fsspec, which needs the matching backend (gcsfs for gs://).
            # NOTE(review): rows end with pandas' default line terminator here,
            # while the local branch below uses the csv module's "\r\n".
            df.reindex(columns=columns).to_csv(
                output_path,
                sep=";",
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL,
                index=False,
            )
            return

        # Use csv module directly to control quoting more granularly
        with open(output_path, "w", newline="", encoding="utf-8") as file:
            writer = csv.writer(
                file, delimiter=";", quotechar='"', quoting=csv.QUOTE_MINIMAL
            )
            writer.writerow(columns)
            for _, row in df.iterrows():
                writer.writerow([row["plot_id"], row["species_ids"]])

    def run(self, transformed_df: "DataFrame"):
        """Builds the submission table from ``transformed_df`` and writes it out."""
        pandas_df = self._prepare_and_write_submission(transformed_df)
        self._write_csv_to_gcs(pandas_df)

In [10]:
# Root location (GCS URL) under which run() writes its submission file
gcs_root_path = "gs://dsgt-clef-plantclef-2024"
model_dir_path = "models/pretrained-dino"
default_root_dir = f"{gcs_root_path}/{model_dir_path}"

# Consider the 5 leading predictions per image
inference = PretrainedInferenceTask(default_root_dir, k=5)
# run() would write dsgt_run_top_5_species.csv under default_root_dir; left disabled here.
# inference.run(transformed_df)

In [18]:
# Write the submission for the 10-row sample to a local file in the working directory
inference = PretrainedInferenceTask(default_root_dir, k=5)
inference.write_submission_csv(transformed_df, "dsgt_run.csv")

                                                                                

In [4]:
# Location of previously stored predictions (image_name, dino_logits) in GCS
gcs_path = "gs://dsgt-clef-plantclef-2024"
data_path = "data/process/pretrained_dino/dino_pretrained/data"

# Full path to the parquet dataset
df_path = f"{gcs_path}/{data_path}"
# Read the stored predictions into a Spark DataFrame
pretrained_df = spark.read.parquet(df_path)
# Preview the first 5 rows, truncating each cell to 50 characters
pretrained_df.show(n=5, truncate=50)

                                                                                

+------------------------+--------------------------------------------------+
|              image_name|                                       dino_logits|
+------------------------+--------------------------------------------------+
|CBN-PdlC-D4-20130903.jpg|[{1395807 -> 13.998325}, {1741880 -> 10.6929455...|
|CBN-PdlC-A3-20200722.jpg|[{1412857 -> 28.893694}, {1742052 -> 2.8326354}...|
| CBN-Pla-B4-20150723.jpg|[{1361281 -> 22.123766}, {1741880 -> 4.236441},...|
|CBN-PdlC-D5-20190722.jpg|[{1392732 -> 17.866493}, {1392323 -> 11.805281}...|
| CBN-Pla-E6-20190723.jpg|[{1397070 -> 10.023462}, {1361043 -> 2.383624},...|
+------------------------+--------------------------------------------------+
only showing top 5 rows



In [30]:
# Write the submission from the stored predictions.
# NOTE: this targets the same local file name ("dsgt_run.csv") as the sample run above.
inference = PretrainedInferenceTask(default_root_dir, k=5)
inference.write_submission_csv(pretrained_df, "dsgt_run.csv")

                                                                                