## Create parquet files in GCS

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import subprocess
from pathlib import Path

from snakeclef.utils import get_spark

# Obtain the Spark handle from the project's get_spark helper; later cells read
# parquet through it. Render it in the cell output.
spark = get_spark()
display(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/07 14:20:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/07 14:20:17 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
24/02/07 14:20:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Get list of stored files in cloud bucket
! gcloud storage ls gs://dsgt-clef-snakeclef-2024/data/parquet_files/

gs://dsgt-clef-snakeclef-2024/data/parquet_files/

gs://dsgt-clef-snakeclef-2024/data/parquet_files/:
gs://dsgt-clef-snakeclef-2024/data/parquet_files/
gs://dsgt-clef-snakeclef-2024/data/parquet_files/SnakeCLEF2023-train-small_size/
gs://dsgt-clef-snakeclef-2024/data/parquet_files/acm_image_data_test_repartition/


In [4]:
# Get current directory -- captured only the first time this cell runs. Re-running
# the cell would otherwise read the already-changed working directory and climb
# two more levels on every execution.
if "curr_dir" not in globals():
    curr_dir = Path(os.getcwd())

# Change to the project directory (two levels above the starting directory) so the
# relative script paths used later resolve. Safe to run repeatedly.
os.chdir(curr_dir.parents[1])

In [5]:
import os


def process_dataset(
    dataset_name: str, meta_dataset_name: str, delete_dataset: bool = True
) -> None:
    """Download a raw dataset archive, convert it to parquet, and optionally clean up.

    Steps, in order:

    1. Run ``./scripts/download_extract_dataset.sh`` with the archive's GCS URL
       (``gs://dsgt-clef-snakeclef-2024/raw/<dataset_name>.tar.gz``) and the local
       base directory ``/mnt/data``.
    2. Run ``./snakeclef/images_to_parquet.py`` with
       ``gs://dsgt-clef-snakeclef-2024/data/parquet_files/<dataset_name>`` as
       ``--output-path``.
    3. If ``delete_dataset`` is true, remove the local extracted folder and archive.

    Both script paths are relative, so the working directory must be the project
    root when this is called.

    Args:
        dataset_name: Archive name without the ``.tar.gz`` suffix, e.g.
            ``"SnakeCLEF2023-train-small_size"``.
        meta_dataset_name: Value forwarded as ``--meta-dataset-name``.
        delete_dataset: Remove the local copies after the conversion step.

    Raises:
        ValueError: If a name is empty, or if ``dataset_name`` is empty once the
            ``train-`` / ``val-`` markers are stripped (the cleanup step would
            otherwise target the whole ``./data/`` directory).
        subprocess.CalledProcessError: If the download or the conversion script
            exits with a non-zero status. Later steps are skipped in that case.
    """
    if not dataset_name or not meta_dataset_name:
        raise ValueError("dataset_name and meta_dataset_name must be non-empty")

    # Define the base directory
    base_dir = "/mnt/data"

    # Define the path for the script and Python script
    script_path = "./scripts/download_extract_dataset.sh"
    python_script_path = "./snakeclef/images_to_parquet.py"

    # Define the GCS paths
    gcs_path = f"gs://dsgt-clef-snakeclef-2024/raw/{dataset_name}.tar.gz"
    output_path = f"gs://dsgt-clef-snakeclef-2024/data/parquet_files/{dataset_name}"

    # The --dataset-name parameter is the archive name without the split marker
    # (e.g. "SnakeCLEF2023-train-small_size" -> "SnakeCLEF2023-small_size").
    adjusted_dataset_name = dataset_name.replace("train-", "").replace("val-", "")
    if not adjusted_dataset_name:
        raise ValueError(
            f"dataset_name {dataset_name!r} is empty after removing the split marker"
        )

    # Download and extract the dataset. Arguments are passed as a list (no shell),
    # and check=True stops here if the script fails instead of converting and
    # deleting data that was never fetched.
    subprocess.run([script_path, gcs_path, base_dir], check=True)

    # Create parquet file
    subprocess.run(
        [
            "python",
            python_script_path,
            "--output-path",
            output_path,
            "--dataset-name",
            adjusted_dataset_name,
            "--meta-dataset-name",
            meta_dataset_name,
        ],
        check=True,
    )

    if delete_dataset:
        # Delete dataset locally before loading the next one.
        # NOTE(review): the download targets base_dir (/mnt/data) but cleanup targets
        # ./data -- presumably ./data points at /mnt/data; confirm, otherwise nothing
        # that was downloaded is actually removed.
        local_data_path = f"./data/{adjusted_dataset_name}"
        local_tar_path = f"./data/{dataset_name}.tar.gz"
        # Cleanup stays best-effort (no check): a missing path is not an error.
        subprocess.run(["rm", "-rf", local_data_path], check=False)
        subprocess.run(["rm", "-f", local_tar_path], check=False)

### train small size

In [None]:
# Metadata name (shared with the other training subset) and the image archive to convert
meta_name = "SnakeCLEF2023-TrainMetadata-iNat"
data_name = "SnakeCLEF2023-train-small_size"

# Download the archive, run the parquet conversion, then remove the local copy
process_dataset(
    meta_dataset_name=meta_name,
    dataset_name=data_name,
)

### train medium size

In [7]:
# Metadata name (shared with the other training subset) and the image archive to convert
meta_name = "SnakeCLEF2023-TrainMetadata-iNat"
data_name = "SnakeCLEF2023-train-medium_size"

# Download the archive, run the parquet conversion, then remove the local copy
process_dataset(
    meta_dataset_name=meta_name,
    dataset_name=data_name,
)

Using dataset URL: gs://dsgt-clef-snakeclef-2024/raw/SnakeCLEF2023-train-medium_size.tar.gz
Downloading dataset to: /mnt/data
Permissions set for /mnt/data.
Downloading dataset...


mount: /mnt/data: /dev/nvme0n1 already mounted on /mnt/data.
Copying gs://dsgt-clef-snakeclef-2024/raw/SnakeCLEF2023-train-medium_size.tar.gz to file:///mnt/data/SnakeCLEF2023-train-medium_size.tar.gz
  
.........................................................................................................................................................................................................................

Average throughput: 410.1MiB/s


Extracting dataset...
Dataset extracted to /mnt/data.
Final contents of /mnt/data:
SnakeCLEF2023-medium_size
SnakeCLEF2023-train-medium_size.tar.gz
lost+found
tmp
Disk usage and free space:
Filesystem      Size  Used Avail Use% Mounted on
/dev/root        49G   43G  6.1G  88% /
tmpfs           7.4G     0  7.4G   0% /dev/shm
tmpfs           3.0G  1.1M  3.0G   1% /run
tmpfs           5.0M     0  5.0M   0% /run/lock
/dev/sda15      105M  6.1M   99M   6% /boot/efi
/dev/nvme0n1    369G   35G  315G  11% /mnt/data
tmpfs           1.5G  8.0K  1.5G   1% /run/user/1005
Script completed successfully.


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/07 14:24:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/07 14:24:15 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
24/02/07 14:24:16 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/02/07 14:24:16 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/02/07 14:24:32 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
                                                                                

### verifying the datasets

In [16]:
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    BinaryType,
    IntegerType,
    BooleanType,
)

# Expected layout of the image parquet files, one entry per column:
# (column name, Spark type, nullable)
schema = StructType(
    [
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in [
            ("image_path", StringType(), True),
            ("path", StringType(), True),
            ("folder_name", StringType(), False),
            ("year", StringType(), True),
            ("binomial_name", StringType(), True),
            ("file_name", StringType(), True),
            ("data", BinaryType(), True),
            ("observation_id", IntegerType(), True),
            ("endemic", BooleanType(), True),
            ("code", StringType(), True),
            ("class_id", IntegerType(), True),
            ("subset", StringType(), True),
        ]
    ]
)

# Small-size training set: GCS location and DataFrame read with the explicit schema
small_gcs_path = (
    "gs://dsgt-clef-snakeclef-2024/data/parquet_files/SnakeCLEF2023-train-small_size"
)
sml_df = spark.read.schema(schema).parquet(small_gcs_path)

# Medium-size training set: same layout, different GCS location
medium_gcs_path = (
    "gs://dsgt-clef-snakeclef-2024/data/parquet_files/SnakeCLEF2023-train-medium_size"
)
med_df = spark.read.schema(schema).parquet(medium_gcs_path)

# Preview the first three rows of the small dataset
sml_df.show(3)

[Stage 16:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+----+--------------------+-------------+--------------------+--------------+-------+----+--------+------+
|          image_path|                path|         folder_name|year|       binomial_name|    file_name|                data|observation_id|endemic|code|class_id|subset|
+--------------------+--------------------+--------------------+----+--------------------+-------------+--------------------+--------------+-------+----+--------+------+
|1993/Phrynonax_po...|/SnakeCLEF2023-sm...|SnakeCLEF2023-sma...|1993| Phrynonax_polylepis|102870166.jpg|[FF D8 FF E0 00 1...|      64030606|  false|  EC|    1287| train|
|1995/Acrantophis_...|/SnakeCLEF2023-sm...|SnakeCLEF2023-sma...|1995|Acrantophis_dumerili| 99694826.jpg|[FF D8 FF E0 00 1...|      62240606|   true|  MG|      11| train|
|1996/Ficimia_stre...|/SnakeCLEF2023-sm...|SnakeCLEF2023-sma...|1996|   Ficimia_streckeri| 29265846.jpg|[FF D8 FF E0 00 1...|       2442697|  false|  

                                                                                

In [20]:
# `path` and `folder_name` embed the dataset name (small_size vs medium_size), and
# `data` holds the raw image bytes (presumably different per size variant -- confirm).
# Comparing full rows would therefore report every row as different, so compare
# only the dataset-independent columns. This also keeps the image bytes out of the
# comparison.
dataset_specific_cols = {"path", "folder_name", "data"}
compare_cols = [c for c in sml_df.columns if c not in dataset_specific_cols]

# Rows in small not in medium
diff_sml_med = sml_df.select(compare_cols).exceptAll(med_df.select(compare_cols))

# Rows in medium not in small
diff_med_sml = med_df.select(compare_cols).exceptAll(sml_df.select(compare_cols))

In [None]:
# Number of rows found in the medium dataset but not in the small one
diff_med_sml.count()

In [None]:
# Number of rows found in the small dataset but not in the medium one
diff_sml_med.count()

In [23]:
# Print the first three rows of each dataset, one table after the other, with cell
# values cut at 100 characters so paths are readable in full
sml_df.show(n=3, truncate=100)
med_df.show(n=3, truncate=100)

                                                                                

+--------------------------------------+----------------------------------------------------------------+------------------------+----+--------------------+-------------+----------------------------------------------------------------------------------------------------+--------------+-------+----+--------+------+
|                            image_path|                                                            path|             folder_name|year|       binomial_name|    file_name|                                                                                                data|observation_id|endemic|code|class_id|subset|
+--------------------------------------+----------------------------------------------------------------+------------------------+----+--------------------+-------------+----------------------------------------------------------------------------------------------------+--------------+-------+----+--------+------+
|1993/Phrynonax_polylepis/102870166.jpg|/SnakeCLEF20

[Stage 36:>                                                         (0 + 1) / 1]

+--------------------------------------+-----------------------------------------------------------------+-------------------------+----+---------------------+------------+----------------------------------------------------------------------------------------------------+--------------+-------+-------+--------+------+
|                            image_path|                                                             path|              folder_name|year|        binomial_name|   file_name|                                                                                                data|observation_id|endemic|   code|class_id|subset|
+--------------------------------------+-----------------------------------------------------------------+-------------------------+----+---------------------+------------+----------------------------------------------------------------------------------------------------+--------------+-------+-------+--------+------+
|1991/Elaphe_quatuorlineata/3000817.j

                                                                                

In [29]:
# Left anti-join on image_path: keep only the med_df rows whose image_path has no
# counterpart in sml_df
differences = med_df.join(sml_df, on="image_path", how="left_anti")

# Preview five of the unmatched rows, cutting cell values at 100 characters
differences.show(5, truncate=100)

                                                                                

+------------------------------------------+---------------------------------------------------------------------+-------------------------+----+------------------------+-------------+----------------------------------------------------------------------------------------------------+--------------+-------+----+--------+------+
|                                image_path|                                                                 path|              folder_name|year|           binomial_name|    file_name|                                                                                                data|observation_id|endemic|code|class_id|subset|
+------------------------------------------+---------------------------------------------------------------------+-------------------------+----+------------------------+-------------+----------------------------------------------------------------------------------------------------+--------------+-------+----+--------+------+
|2019/Hyps

In [28]:
# Number of med_df rows whose image_path is missing from sml_df
differences.count()

                                                                                

276

In [31]:
from pyspark.sql.functions import col

# Spot-check a single image_path against the small dataset; an empty table means
# that image is absent from sml_df
sml_df.where(col("image_path") == "2019/Hypsiglena_ochrorhynchus/68486239.jpg").show()



+----------+----+-----------+----+-------------+---------+----+--------------+-------+----+--------+------+
|image_path|path|folder_name|year|binomial_name|file_name|data|observation_id|endemic|code|class_id|subset|
+----------+----+-----------+----+-------------+---------+----+--------------+-------+----+--------+------+
+----------+----+-----------+----+-------------+---------+----+--------------+-------+----+--------+------+



                                                                                