# Validate embeddings

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from plantclef.utils import get_spark

# Obtain the Spark handle from the project helper; the cells below use it
# for `spark.read.parquet(...)`.
spark = get_spark()
# Render the handle in the cell output.
display(spark)

### embeddings

In [None]:
# Get the list of files stored in the cloud bucket
root = "gs://dsgt-clef-plantclef-2024"
# Print the current date, then list the contents of the processed-data folder
! date
! gcloud storage ls {root}/data/process/

In [None]:
# Parquet root folder in the bucket and the name of the training dataset
gcs_path = "gs://dsgt-clef-plantclef-2024/data/parquet_files/"
train = "PlantCLEF2024_training_cropped_resized_v2"

# Full GCS location of the training Parquet dataset
train_gcs_path = f"{gcs_path}{train}"

# Load the dataset into a Spark DataFrame and preview its first five rows
train_df = spark.read.parquet(train_gcs_path)
train_df.show(5)

### subset data

In [None]:
# Processed-data folder in the bucket and the two *subset* embedding datasets
gcs_path = "gs://dsgt-clef-plantclef-2024/data/process"
dino_emb_train = "subset_training_cropped_resized_v2/dino/data"
dct_emb_train = "subset_training_cropped_resized_v2/dino_dct/data"

# Full GCS locations of the DINO and DCT embedding datasets
dino_gcs_path = f"{gcs_path}/{dino_emb_train}"
dct_gcs_path = f"{gcs_path}/{dct_emb_train}"

# Load each embedding dataset into its own Spark DataFrame
dino_df = spark.read.parquet(dino_gcs_path)
dct_df = spark.read.parquet(dct_gcs_path)

# Preview five rows of each, truncating long values to 50 characters
dino_df.show(5, truncate=50)
dct_df.show(5, truncate=50)

In [None]:
# Attach species names to the DINO embeddings by inner-joining with train_df
# on the image name, keeping only the columns needed downstream.
dino_joined_df = dino_df.join(train_df, on="image_name", how="inner").select(
    dino_df["image_name"],
    train_df["species"],
    dino_df["species_id"],
    dino_df["dino_embedding"],
)
dino_joined_df.show(5)

# Same join for the DCT embeddings (no preview here).
dct_joined_df = dct_df.join(train_df, on="image_name", how="inner").select(
    dct_df["image_name"],
    train_df["species"],
    dct_df["species_id"],
    dct_df["dct_embedding"],
)

In [None]:
from plantclef.plotting import plot_images_from_embeddings

# Draw a 3x3 grid from the DINO embeddings, using the species column as the
# image column passed to the plotting helper.
plot_images_from_embeddings(
    dino_joined_df,
    data_col="dino_embedding",
    image_col="species",
    grid_size=(3, 3),
)

In [None]:
# Draw a 3x3 grid from the DCT embeddings, using the species column as the
# image column passed to the plotting helper.
plot_images_from_embeddings(
    dct_joined_df,
    data_col="dct_embedding",
    image_col="species",
    grid_size=(3, 3),
)

### full-size train data

In [None]:
# Processed-data folder in the bucket and the two *full-size* embedding datasets
gcs_path = "gs://dsgt-clef-plantclef-2024/data/process"
dino_emb_train = "training_cropped_resized_v2/dino/data"
dct_emb_train = "training_cropped_resized_v2/dino_dct/data"

# Full GCS locations of the DINO and DCT embedding datasets
dino_gcs_path = f"{gcs_path}/{dino_emb_train}"
dct_gcs_path = f"{gcs_path}/{dct_emb_train}"

# Load each embedding dataset into its own Spark DataFrame
# (this rebinds dino_df / dct_df, which previously held the subset data)
dino_df = spark.read.parquet(dino_gcs_path)
dct_df = spark.read.parquet(dct_gcs_path)

# Preview five rows of each, truncating long values to 50 characters
dino_df.show(5, truncate=50)
dct_df.show(5, truncate=50)

In [None]:
# Row count of the full-size DINO embeddings loaded in the previous cell.
# `dino_joined_df` was built from the *subset* embeddings before `dino_df`
# was rebound to the full-size data, so counting it here would report the
# subset join size rather than the full-size dataset.
dino_df.count()