## Description
The objective of this notebook is to:
1. Combine the train and val datasets from the dataset page into one large dataset
2. Label classes that are NOT in the train dataset as unknown
3. Convert the images to binary data and add them to the dataframe
4. Keep only the metadata variables that are also present in the test set
5. Convert all categorical variables (for both input and output) into numerical variables
6. Save everything to parquet

This builds on, and should replace, the work done in images-to-parquet.py

In [1]:

import pandas as pd


In [3]:
# Metadata CSVs from the dataset page.
# Dev training set.
DF20_train_path = "../data/FungiCLEF2023_train_metadata_PRODUCTION.csv"
# "Validation" set on the page. We want to use this with additional data for unknown classes.
DF21_val_path = "../data/FungiCLEF2023_val_metadata_PRODUCTION.csv"
# Public test set.
public_test_path = "../data/FungiCLEF2023_public_test_metadata_PRODUCTION.csv"
IMG_PATH = "../data/DF"

# Load the three metadata tables in the same order as the paths above.
DF20_df, DF21_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (DF20_train_path, DF21_val_path, public_test_path)
)

In [4]:
# Stack the train and validation metadata into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each input
# keeps the default row labels it got from read_csv, so labels repeat in the
# combined frame and label-based lookups become ambiguous.
train_val_df = pd.concat((DF20_df, DF21_df), ignore_index=True)

In [5]:
# These are the metadata columns we want to keep to train on and potentially for prediction
test_df.columns

Index(['observationID', 'month', 'day', 'countryCode', 'locality', 'level0Gid',
       'level0Name', 'level1Gid', 'level1Name', 'level2Gid', 'level2Name',
       'Substrate', 'Latitude', 'Longitude', 'CoorUncert', 'Habitat',
       'image_path', 'filename', 'MetaSubstrate'],
      dtype='object')

In [6]:
# On top of the test-set columns we probably also want the taxonomy columns
# (phylum, genus, etc.). They might be useful as additional training data.
train_val_df.columns

Index(['observationID', 'year', 'month', 'day', 'countryCode', 'locality',
       'taxonID', 'scientificName', 'kingdom', 'phylum', 'class', 'order',
       'family', 'genus', 'specificEpithet', 'taxonRank', 'species',
       'level0Gid', 'level0Name', 'level1Gid', 'level1Name', 'level2Gid',
       'level2Name', 'ImageUniqueID', 'Substrate', 'rightsHolder', 'Latitude',
       'Longitude', 'CoorUncert', 'Habitat', 'image_path', 'class_id',
       'MetaSubstrate', 'poisonous', 'filename'],
      dtype='object')

In [7]:
# Every column of the test metadata, followed by the taxonomy and label columns
# that are taken from the train/val metadata.
cols_to_keep = [
    *test_df.keys(),
    'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species',
    'poisonous', 'class_id',
]

In [8]:
# Keep only the selected columns. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells modify it directly
# instead of raising SettingWithCopyWarning on a possible view of the wider frame.
train_val_df = train_val_df[cols_to_keep].copy()

In [9]:
import numpy as np

# Value stored in normalized_day when month and/or day is missing.
# NOTE(review): (12 - 1) * 30 + 31 is also 361, so this sentinel coincides with
# the value computed for 31 December - confirm that is intended.
DUMMY_DATE = 361

# Approximate day of the year, treating every month as 30 days long:
# (month - 1) * 30 + day. Rows where the result is NaN get DUMMY_DATE.
train_val_df.loc[:, 'normalized_day'] = (
    train_val_df['month']
    .sub(1)
    .mul(30)
    .add(train_val_df['day'])
    .fillna(DUMMY_DATE)
    .astype(np.int16, copy=True)
)


In [10]:
# Categorical columns that get an integer-encoded "<name>_numerical" companion column.
categoricals = [
    # location / environment metadata
    'locality',
    'level0Gid',
    'level1Gid',
    'level2Gid',
    'Substrate',
    'Habitat',
    'MetaSubstrate',
    # taxonomy
    'kingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'species',
]

In [13]:
# This is important to save: for every categorical column it holds the
# {category value -> integer code} table used to build the "<col>_numerical" column.
mapping = {}

for col in categoricals:
    # sort=True numbers the categories in sorted order, which is what sorting the
    # frame by the column beforehand achieved, but without re-sorting the whole
    # frame in place once per column.
    # NOTE: train_val_df therefore keeps its incoming row order instead of ending
    # up sorted by the last categorical column.
    # Missing values receive the sentinel code -1 and have no entry in the mapping.
    col_numerical, col_mapping = pd.factorize(train_val_df[col], sort=True, use_na_sentinel=True)
    train_val_df.loc[:, f"{col}_numerical"] = col_numerical
    mapping[col] = {value: code for code, value in enumerate(col_mapping)}


In [14]:
import pickle

CATEGORICAL_MAPPING_LOCATION = "../data/categorical_mapping.pkl"

# Persist the category -> code tables so other data can be encoded with the same codes.
# The context manager guarantees the file is flushed and closed even if dump() raises;
# a bare open() left the handle to be closed whenever it was garbage collected.
with open(CATEGORICAL_MAPPING_LOCATION, 'wb') as mapping_file:
    pickle.dump(mapping, mapping_file)

In [15]:
# This should be converted to a script for submission - categorical mapping for test_df

# Categorical columns of the test metadata that get encoded.
test_categoricals = ['locality', 'level0Gid', 'level1Gid', 'level2Gid', 'Substrate', 'Habitat', 'MetaSubstrate']

# Reload the saved tables through a context manager so the file handle is closed.
# pickle can execute arbitrary code while loading: only read mapping files that
# were produced by this notebook.
with open(CATEGORICAL_MAPPING_LOCATION, 'rb') as mapping_file:
    mapping = pickle.load(mapping_file)

for col in test_categoricals:
    col_mapping = mapping[col]  # look the column's table up once, not once per row
    # Values that have no entry in the saved table are encoded as -1.
    test_df.loc[:, col+"_numerical"] = test_df[col].apply(lambda x: col_mapping.get(x, -1))

In [10]:
# Checkpoint: write the processed metadata to CSV (without the index) and load it
# straight back, so train_val_df is from here on exactly what the file contains.
TRAIN_VAL_CSV = "train_val_df.csv"
train_val_df.to_csv(TRAIN_VAL_CSV, index=False)
train_val_df = pd.read_csv(TRAIN_VAL_CSV)

In [17]:
# Write the train/val metadata together with the raw image pixels to CHUNKS parquet files.
#
# This used to keep on crashing: the image bytes were attached to the dicts stored
# in df_records via r.update(...), so the pixel data of every chunk processed so far
# stayed referenced and memory grew for the whole run. Each row now gets a fresh
# dict, so df_records only ever holds metadata and a chunk's pixel data can be
# released once the following chunk replaces it.
#
# Spark doesn't work on my local machine either
# Need to adapt it with images_to_parquet.py stuff

import pyarrow as pa
import pyarrow.parquet as pq
from PIL import Image
from tqdm import tqdm

IMG_DIR = "../data/DF/"

df_records = train_val_df.to_dict("records")
CHUNKS = 50
CHUNK_SIZE = len(df_records) // CHUNKS


def _resolve_image_path(img_name):
    """Return the path under IMG_DIR for a metadata ``image_path`` value.

    Names whose part before the first "-" is exactly 10 characters long have
    "JPG" replaced by "jpg"; every other name is used unchanged.
    """
    # NOTE(review): presumably those files are stored with a lowercase extension - confirm.
    if len(img_name.split("-")[0]) == 10:
        img_name = img_name.replace("JPG", "jpg")
    return IMG_DIR + img_name


for i in range(CHUNKS):
    start = i * CHUNK_SIZE
    # The last chunk also takes the rows left over by the integer division above.
    stop = None if i == CHUNKS - 1 else (i + 1) * CHUNK_SIZE

    records = []
    for r in tqdm(df_records[start:stop]):
        with Image.open(_resolve_image_path(r['image_path'])) as im:
            # New dict instead of r.update(...): the shared metadata record must
            # never hold on to the pixel data.
            records.append({
                **r,
                "img_height": im.height,
                # NOTE: this key was previously misspelled "img_widgth"; files written
                # before the fix still carry the old column name.
                "img_width": im.width,
                "data": im.tobytes(),
            })

    full_df = pd.DataFrame(records)

    _dataset_chunk = pa.Table.from_pandas(full_df, preserve_index=False)
    pq.write_table(_dataset_chunk, f"../data/DF_300_{i}.parquet") # TODO: To change endpoints where these parquets are stored. But we're using spark anywayz lmao

100%|██████████| 7135/7135 [00:07<00:00, 924.61it/s] 
100%|██████████| 7135/7135 [00:11<00:00, 599.90it/s]
100%|██████████| 7135/7135 [00:11<00:00, 646.25it/s]
100%|██████████| 7135/7135 [00:12<00:00, 594.26it/s]
100%|██████████| 7135/7135 [00:10<00:00, 664.30it/s]
100%|██████████| 7135/7135 [00:11<00:00, 625.52it/s]
100%|██████████| 7135/7135 [00:11<00:00, 625.90it/s]
100%|██████████| 7135/7135 [00:11<00:00, 616.47it/s]
100%|██████████| 7135/7135 [00:11<00:00, 636.19it/s]
100%|██████████| 7135/7135 [00:12<00:00, 563.38it/s]
100%|██████████| 7135/7135 [00:10<00:00, 655.24it/s]
100%|██████████| 7135/7135 [00:11<00:00, 617.87it/s]
100%|██████████| 7135/7135 [00:11<00:00, 609.77it/s]
100%|██████████| 7135/7135 [00:11<00:00, 613.79it/s]
100%|██████████| 7135/7135 [00:11<00:00, 603.92it/s]
100%|██████████| 7135/7135 [00:11<00:00, 605.97it/s]
100%|██████████| 7135/7135 [00:11<00:00, 622.08it/s]
100%|██████████| 7135/7135 [00:11<00:00, 606.60it/s]
100%|██████████| 7135/7135 [00:14<00:00, 479.

: 