In [1]:
# from scipy.fftpack import dctn, dct
import pandas as pd


In [2]:
from google.cloud import storage

def list_blobs_with_prefix(bucket_name, prefix, delimiter=None):
    """Return the ``gs://`` URIs of the Parquet blobs stored under a prefix.

    Lists the blobs in ``bucket_name`` whose names begin with ``prefix`` and
    keeps only those whose name ends with ``.parquet``. Other objects under
    the prefix (marker files, checksums, ...) are skipped.

    The delimiter argument can be used to restrict the results to only the
    "files" in the given "folder". Without the delimiter, the entire tree under
    the prefix is considered. For example, given these blobs:

        a/1.parquet
        a/b/2.parquet

    with prefix='a/' and no delimiter, both blobs are returned; with
    prefix='a/' and delimiter='/', only the blob directly under 'a/'
    (a/1.parquet) is returned.

    Args:
        bucket_name: Name of the bucket (without the ``gs://`` scheme).
        prefix: Blob-name prefix to list under, e.g. ``"public/"``.
        delimiter: Optional delimiter forwarded to ``Client.list_blobs``.

    Returns:
        list[str]: ``gs://<bucket_name>/<blob name>`` for every matching blob,
        in the order the listing yields them.
    """

    storage_client = storage.Client()

    # Note: Client.list_blobs requires at least package version 1.17.0.
    # The request is only issued once the returned iterator is consumed.
    blobs = storage_client.list_blobs(bucket_name, prefix=prefix, delimiter=delimiter)

    # Match on the suffix rather than a substring so that names such as
    # "x.parquet.crc" or "x.parquet/_SUCCESS" are not mistaken for data files.
    return [
        f"gs://{bucket_name}/{blob.name}"
        for blob in blobs
        if blob.name.endswith(".parquet")
    ]

# URIs of every DINO embedding Parquet shard found under the data prefix.
dino_outputs = list_blobs_with_prefix(
    bucket_name="dsgt-clef-fungiclef-2024",
    prefix="data/parquet/DF20_300px_and_DF21_300px_corrected_FULL_SET_embedding/dino/data/",
)

In [5]:
import pandas as pd
# Load a single shard (list index 1, i.e. the second listed file) for inspection.
df = pd.read_parquet(path=dino_outputs[1])

In [4]:
import numpy as np

# Shape each flattened `dino_embedding` is reshaped to: 257 rows of 768 values.
DINO_SHAPE = (257, 768)


def process_hidden_states(df):
    """Extract row 0 of every hidden state (treated here as the CLS token).

    Each ``dino_embedding`` value is reshaped to ``DINO_SHAPE`` and only the
    first row is kept; the remaining rows are discarded.

    Args:
        df: DataFrame with ``image_path`` and ``dino_embedding`` columns. Each
            embedding must hold exactly 257 * 768 values, either as a NumPy
            array or as any sequence ``np.asarray`` accepts (e.g. a list).

    Returns:
        list[dict]: one ``{'image_path': ..., 'dino_output': [...]}`` record
        per input row, in row order. An empty frame yields ``[]``.

    Raises:
        ValueError: if an embedding cannot be reshaped to ``DINO_SHAPE``.
    """
    # Nothing to do for an empty frame (also covers a frame with no columns).
    if len(df) == 0:
        return []

    rows = []
    # Walk the two columns directly: iterrows() would build a full Series for
    # every row, which is needless overhead when only two values are read.
    for image_path, embedding in zip(df["image_path"], df["dino_embedding"]):
        # np.asarray is a no-op for arrays and lets list-valued cells work too.
        hidden_state = np.asarray(embedding).reshape(DINO_SHAPE)
        cls_token = hidden_state[0]
        rows.append({'image_path': image_path, 'dino_output': cls_token.tolist()})
    return rows

In [7]:
# NOTE(review): abandoned threaded variant, kept commented out. The recorded
# output of this cell stops at 48/1000 files, so the run appears not to have
# completed; presumably superseded by the sequential loop in the following
# cell -- confirm before deleting. `total=1000` is a hard-coded file count.

# import pandas as pd
# from concurrent.futures import ThreadPoolExecutor
# from tqdm import tqdm

# def process_file(file_path):
#     df = pd.read_parquet(file_path)
#     _df = pd.DataFrame([df.image_path, df.dino_embedding.apply(lambda x: x[:768])]).T
#     return _df

# with ThreadPoolExecutor(max_workers=2) as executor:
#     all_rows = list(tqdm(executor.map(process_file, dino_outputs), total=1000))


  0%|          | 0/1000 [00:00<?, ?it/s]

  5%|▍         | 48/1000 [01:02<22:39,  1.43s/it]

: 

In [21]:
import os

import pandas as pd
from tqdm import tqdm

# to_csv does not create missing directories, so make sure the local scratch
# directory for the per-shard CSV files exists before the long loop starts.
os.makedirs("./tmp", exist_ok=True)

for ix, file_path in enumerate(tqdm(dino_outputs)):
    # Only these two columns are used, so skip loading anything else.
    shard_df = pd.read_parquet(file_path, columns=["image_path", "dino_embedding"])
    cls_df = pd.DataFrame(
        {
            "image_path": shard_df.image_path,
            # First 768 values of each flattened embedding, i.e. row 0 after
            # the (257, 768) reshape used by process_hidden_states.
            "dino_embedding": shard_df.dino_embedding.apply(lambda x: x[:768].tolist()),
        }
    )
    # NOTE: CSV stores each list as its string repr; whoever reads these files
    # back gets strings in `dino_embedding` unless they parse them.
    cls_df.to_csv(f"./tmp/dino_cls_{ix}.csv", index=False)


100%|██████████| 1000/1000 [51:04<00:00,  3.06s/it] 


In [23]:
import json

import numpy as np


def _parse_embedding(value):
    """Turn a CSV-serialised embedding such as "[0.1, 0.2]" back into a float array.

    Non-string values (e.g. a missing cell read as NaN) are returned unchanged.
    """
    if isinstance(value, str):
        return np.asarray(json.loads(value), dtype=float)
    return value


all_dfs = []
# 1000 is the hard-coded number of ./tmp/dino_cls_<i>.csv shards expected on disk.
for i in tqdm(range(1000)):
    shard_df = pd.read_csv(f'./tmp/dino_cls_{i}.csv')
    # The CSV round-trip stored every embedding list as text, so read_csv
    # returns plain strings here. Parse them back so the frames (and the
    # Parquet files written from them) hold numeric vectors, not strings.
    shard_df["dino_embedding"] = shard_df["dino_embedding"].map(_parse_embedding)
    all_dfs.append(shard_df)



100%|██████████| 1000/1000 [00:54<00:00, 18.50it/s]


In [24]:
# Stack all shards into one frame. Every shard was read with its own 0-based
# index, so ignore_index=True is needed to get unique row labels instead of
# the same labels repeated once per shard.
df = pd.concat(all_dfs, ignore_index=True)

In [27]:
# Persist the combined embeddings next to the notebook; the index is not written.
df.to_parquet(path="dino_embeddings_all.parquet", index=False)

In [28]:
from fungiclef.utils import get_spark, spark_resource, read_config
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [30]:
# Project configuration; this cell only reads the "gs_paths" metadata entries from it.
config = read_config(path='../fungiclef/config.json')

# Metadata locations for each split, taken from the config. The combined metadata
# built from them is meant to be the single source of truth for everything downstream.
# Train corresponds to the DF20 dataset.
TRAIN_METADATA = config["gs_paths"]["train"]["metadata"]

# Validation and test both correspond to the DF21 dataset.
VALID_METADATA = config["gs_paths"]["val"]["metadata"]
TEST_METADATA = config["gs_paths"]["test"]["metadata"]

# Root of the production area in the project bucket (note the trailing slash:
# paths are built from it by plain string concatenation).
PRODUCTION_BUCKET = 'gs://dsgt-clef-fungiclef-2024/production/'

# Only columns relevant for training or inference are kept.
# This list covers all the columns that were present in the public test metadata dataset.
TEST_DF_COLUMNS = ['observationID', 'month', 'day', 'countryCode', 'locality', 'level0Gid',
       'level0Name', 'level1Gid', 'level1Name', 'level2Gid', 'level2Name',
       'Substrate', 'Latitude', 'Longitude', 'CoorUncert', 'Habitat',
       'image_path', 'filename', 'MetaSubstrate']

# Test columns plus the taxonomic classification of the fungi and the label columns
# (the taxonomy could potentially be useful as additional training targets).
COLUMNS_TO_KEEP = TEST_DF_COLUMNS + ['scientificName', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'poisonous', 'class_id', 'dataset']

# Categorical columns that need to be factorized into labels. The metadata cell
# below drops a "<name>_text" column for each of these.
CATEGORICAL_COLUMNS = ['locality', 'level0Gid', 'level1Gid', 'level2Gid', 'Substrate', 'Habitat', 'MetaSubstrate', 'kingdom', 'phylum', 'class',
       'order', 'family', 'genus', 'species']

In [31]:
# Combined metadata for all splits, read from the production bucket.
selected_metadata_df = pd.read_csv(PRODUCTION_BUCKET + "metadata/DF_combined_metadata_mapped_columns.csv")

# For pairing up with embeddings only the numerical columns are kept, so there
# is less data to load: drop the "<name>_text" companion of every categorical
# column, drop the remaining descriptive text columns, and normalise the image
# extension from ".JPG" to ".jpg".
numerical_metadata_df = (
    selected_metadata_df
    .drop(columns=[f"{name}_text" for name in CATEGORICAL_COLUMNS])
    .drop(columns=['filename', 'scientificName', 'countryCode', 'level0Name', 'level1Name', 'level2Name'])
    .assign(
        image_path=lambda frame: frame.image_path.apply(
            lambda path: path.replace(".JPG", ".jpg")
        )
    )
)

In [32]:
# Pair every metadata row with its embedding by image_path. This is a left
# join, so metadata rows without a matching embedding are kept (with nulls).
dino_full_df = (
    numerical_metadata_df
    .set_index('image_path')
    .join(df.set_index('image_path'), how='left')
    .reset_index()
)


In [38]:
DATASET_PATH = "gs://dsgt-clef-fungiclef-2024/production/dino_cls/"

# Export one Parquet file per value of the `dataset` column.
# NOTE(review): unlike the local dump, these writes do not pass index=False,
# so pandas' default index handling applies -- confirm that is intended.
for split_name, file_name in (
    ("train", "DF_300_train.parquet"),
    ("valid", "DF_300_valid.parquet"),
    ("test", "DF_300_test.parquet"),
):
    split_rows = dino_full_df[dino_full_df.dataset == split_name]
    split_rows.to_parquet(DATASET_PATH + file_name)