## Description
The objective of this notebook is to:
1. Combine the train and val datasets from the dataset page into one large dataset
2. Label classes that are NOT in the train dataset as unknown
3. Read each image as raw bytes and add them to the dataframe
4. Keep only the variables that are present in the test set
5. Convert all categorical variables (for both input and output) into numerical variables
6. Save everything to parquet

This notebook builds on, and should eventually replace, the work done in images-to-parquet.py

In [2]:

import pandas as pd


In [3]:
DF20_train_path = "../data/FungiCLEF2023_train_metadata_PRODUCTION.csv" # This is the dev training set page
DF21_val_path = "../data/FungiCLEF2023_val_metadata_PRODUCTION.csv" # This is the "validation" set on the page. We want to use this with additional data for unknown classes
public_test_path = "../data/FungiCLEF2023_public_test_metadata_PRODUCTION.csv" # Public test set
IMG_PATH = "../data/DF"

DF20_df = pd.read_csv(DF20_train_path)
DF21_df = pd.read_csv(DF21_val_path)
test_df = pd.read_csv(public_test_path)

In [25]:
# Stack train and val metadata into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it both inputs keep their own labels,
# so the combined frame would carry duplicated index values and label-based
# selection/alignment on it becomes ambiguous.
train_val_df = pd.concat((DF20_df, DF21_df), ignore_index=True)

In [20]:
# Columns of the test metadata: these are what we keep to train on and can
# potentially use at prediction time.
test_df.columns

Index(['observationID', 'month', 'day', 'countryCode', 'locality', 'level0Gid',
       'level0Name', 'level1Gid', 'level1Name', 'level2Gid', 'level2Name',
       'Substrate', 'Latitude', 'Longitude', 'CoorUncert', 'Habitat',
       'image_path', 'filename', 'MetaSubstrate'],
      dtype='object')

In [21]:
# The train/val metadata also carries the taxonomy (phylum, genus, ...), which
# we probably want to keep as it might be useful as additional training data.
train_val_df.columns

Index(['observationID', 'year', 'month', 'day', 'countryCode', 'locality',
       'taxonID', 'scientificName', 'kingdom', 'phylum', 'class', 'order',
       'family', 'genus', 'specificEpithet', 'taxonRank', 'species',
       'level0Gid', 'level0Name', 'level1Gid', 'level1Name', 'level2Gid',
       'level2Name', 'ImageUniqueID', 'Substrate', 'rightsHolder', 'Latitude',
       'Longitude', 'CoorUncert', 'Habitat', 'image_path', 'class_id',
       'MetaSubstrate', 'poisonous', 'filename'],
      dtype='object')

In [41]:
# Every test-set column, followed by the taxonomy / label columns taken from train+val.
extra_label_cols = ['scientificName', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'poisonous', 'class_id']
cols_to_keep = [*test_df.keys(), *extra_label_cols]

In [42]:
# Keep only the selected columns. `.copy()` makes the result an independent
# frame, so the column assignments below (and in later cells) write to a real
# DataFrame rather than to a selection of the concatenated one, which is what
# triggers pandas' SettingWithCopyWarning.
train_val_df = train_val_df[cols_to_keep].copy()
# Largest class id; only displayed when this is the last expression of its cell.
train_val_df.class_id.max()

import numpy as np
# Placeholder day-of-year used when month or day is missing.
# NOTE: (12 - 1) * 30 + 31 == 361, so this value coincides with the one
# computed for 31 December.
DUMMY_DATE = 361
# Approximate day-of-year that treats every month as 30 days long.
train_val_df['normalized_day'] = (
    ((train_val_df['month'] - 1) * 30 + train_val_df['day'])
    .fillna(DUMMY_DATE)
    .astype(np.int16, copy=True)
)


In [43]:
# Move the unknown class (negative ids) to the end: it becomes one past the
# largest class id present in the frame.
max_id = train_val_df.class_id.max()
is_unknown = train_val_df.class_id < 0
train_val_df.loc[is_unknown, "class_id"] = max_id + 1

In [44]:
# Names of the categorical columns, listed as site/substrate columns followed
# by taxonomy columns.
_site_categoricals = ['locality', 'level0Gid', 'level1Gid', 'level2Gid', 'Substrate', 'Habitat', 'MetaSubstrate']
_taxonomy_categoricals = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
categoricals = _site_categoricals + _taxonomy_categoricals

In [13]:
# Keep this dict: it records which integer code every category value received.
mapping = {}

for col in categoricals:
    # Rows are ordered by the column first, so the codes factorize hands out in
    # order of first appearance follow the sorted category order; missing
    # values receive the sentinel code -1.
    train_val_df = train_val_df.sort_values(by=col, ascending=True)
    codes, uniques = pd.factorize(train_val_df[col], use_na_sentinel=True)
    train_val_df[f"{col}_numerical"] = codes
    # category value -> integer code
    mapping[col] = dict(zip(uniques, range(len(uniques))))



In [14]:
import pickle

# Where the category -> code mapping is persisted for reuse at prediction time.
CATEGORICAL_MAPPING_LOCATION = "../data/categorical_mapping.pkl"

# Use a context manager so the file is flushed and closed as soon as the dump
# finishes; a bare open() leaves the handle to be closed whenever it happens
# to be garbage-collected.
with open(CATEGORICAL_MAPPING_LOCATION, 'wb') as mapping_file:
    pickle.dump(mapping, mapping_file)

In [15]:
# This should be converted to a script for submission - categorical mapping for test_df

test_categoricals = ['locality', 'level0Gid', 'level1Gid', 'level2Gid', 'Substrate', 'Habitat', 'MetaSubstrate']
# NOTE: pickle.load can execute arbitrary code; only load a mapping file that
# this project produced itself.
with open(CATEGORICAL_MAPPING_LOCATION, 'rb') as mapping_file:
    mapping = pickle.load(mapping_file)

for col in test_categoricals:
    col_mapping = mapping[col]
    # Values that are not keys of the stored mapping are encoded as -1.
    test_df[col + "_numerical"] = test_df[col].apply(lambda x: col_mapping.get(x, -1))

In [10]:
# Round-trip the frame through a CSV file: it is written without its index and
# read straight back into the same name.
_roundtrip_csv = "train_val_df.csv"
train_val_df.to_csv(_roundtrip_csv, index=False)
train_val_df = pd.read_csv(_roundtrip_csv)

In [17]:
# Write the full train/val frame to parquet in CHUNKS pieces, attaching each
# image's height, width and raw pixel bytes to its metadata row.
# Earlier runs of this cell crashed part-way through. A likely cause: the image
# bytes were stored back into the dicts held by `df_records`, so the memory of
# every processed image stayed alive across chunks. Each output row is now a
# fresh dict, so a chunk's bytes can be released once the next chunk replaces it.
# Spark doesn't work on my local machine either
# Need to adapt it with images_to_parquet.py stuff

import pyarrow as pa
import pyarrow.parquet as pq
from PIL import Image
from tqdm import tqdm

IMG_DIR = "../data/DF/"

df_records = train_val_df.to_dict("records")
CHUNKS = 50
CHUNK_SIZE = len(df_records) // CHUNKS

for i in range(CHUNKS):
    start = i * CHUNK_SIZE
    # The final chunk runs to the end so it also takes the remainder rows.
    stop = None if i == CHUNKS - 1 else start + CHUNK_SIZE
    chunk = df_records[start:stop]

    records = []
    for r in tqdm(chunk):
        img_name = r['image_path']
        # Paths whose part before the first "-" is 10 characters long get
        # "JPG" replaced by "jpg" (presumably those files are stored with a
        # lowercase extension - TODO confirm).
        if len(img_name.split("-")[0]) == 10:
            image_path = IMG_DIR + img_name.replace("JPG", "jpg")
        else:
            image_path = IMG_DIR + img_name
        with Image.open(image_path) as im:
            # Build a NEW dict instead of r.update(...): `r` is owned by
            # `df_records`, and writing the bytes into it would pin them in
            # memory for the rest of the run.
            # NOTE: the key "img_widgth" is misspelled but kept unchanged so
            # the column name stays the same for existing readers; rename it
            # in a coordinated change.
            records.append({
                **r,
                "img_height": im.height,
                "img_widgth": im.width,
                "data": im.tobytes(),
            })

    full_df = pd.DataFrame(records)

    _dataset_chunk = pa.Table.from_pandas(full_df, preserve_index=False)
    pq.write_table(_dataset_chunk, f"../data/DF_300_{i}.parquet") # TODO: change the location where these parquet files are stored

100%|██████████| 7135/7135 [00:07<00:00, 924.61it/s] 
100%|██████████| 7135/7135 [00:11<00:00, 599.90it/s]
100%|██████████| 7135/7135 [00:11<00:00, 646.25it/s]
100%|██████████| 7135/7135 [00:12<00:00, 594.26it/s]
100%|██████████| 7135/7135 [00:10<00:00, 664.30it/s]
100%|██████████| 7135/7135 [00:11<00:00, 625.52it/s]
100%|██████████| 7135/7135 [00:11<00:00, 625.90it/s]
100%|██████████| 7135/7135 [00:11<00:00, 616.47it/s]
100%|██████████| 7135/7135 [00:11<00:00, 636.19it/s]
100%|██████████| 7135/7135 [00:12<00:00, 563.38it/s]
100%|██████████| 7135/7135 [00:10<00:00, 655.24it/s]
100%|██████████| 7135/7135 [00:11<00:00, 617.87it/s]
100%|██████████| 7135/7135 [00:11<00:00, 609.77it/s]
100%|██████████| 7135/7135 [00:11<00:00, 613.79it/s]
100%|██████████| 7135/7135 [00:11<00:00, 603.92it/s]
100%|██████████| 7135/7135 [00:11<00:00, 605.97it/s]
100%|██████████| 7135/7135 [00:11<00:00, 622.08it/s]
100%|██████████| 7135/7135 [00:11<00:00, 606.60it/s]
100%|██████████| 7135/7135 [00:14<00:00, 479.

: 

In [80]:
### This is for the dev set only because the full thing keeps on crashing for me :(
selected_mushrooms = ['Neoboletus luridiformis (Rostk.) Gelardi, Simonini & Vizzini, 2014',
                      'Imleria badia (Fr.) Vizzini, 2014',
                      'Amanita muscaria (L.) Lam., 1783',
                      'Russula ochroleuca (Pers.) Fr.',
                      'Russula nigricans (Bull.) Fr.',
                      'Lactarius blennius (Fr.) Fr.'
                      ]

# Presumably the id given to the "unknown" class earlier (max known id + 1) - TODO confirm.
UNKNOWN_CLASS_ID = 1604
# Number of rows of that class added to the dev set.
N_UNKNOWN_SAMPLES = 1000
# Fixed seed so the sampled rows, and therefore the dev set, are the same on every run.
DEV_SAMPLE_SEED = 42

dev_set = train_val_df.scientificName.isin(selected_mushrooms)

unknown_sample = train_val_df[train_val_df.class_id == UNKNOWN_CLASS_ID].sample(
    N_UNKNOWN_SAMPLES, random_state=DEV_SAMPLE_SEED
)
dev_df = pd.concat((train_val_df[dev_set], unknown_sample))

In [63]:
# Re-encode every categorical column using only the values present in the dev
# set. Rows are ordered by the column first so the codes follow sorted order;
# missing values get the sentinel -1.
for col in categoricals:
    dev_df = dev_df.sort_values(by=col, ascending=True)
    col_numerical, col_uniques = pd.factorize(dev_df[col], use_na_sentinel=True)
    dev_df[f"{col}_numerical"] = col_numerical

# Need to do it for class_id too for dev set
dev_df = dev_df.sort_values(by="class_id", ascending=True)
# Use dedicated names here: binding the uniques to `mapping` would overwrite
# the categorical mapping dict built earlier with an unrelated Index.
# class_id_uniques[k] is the original class_id behind dev label k.
class_id_codes, class_id_uniques = pd.factorize(dev_df.class_id)
dev_df["class_id"] = class_id_codes

In [64]:
# Number of dev rows per class label.
dev_df["class_id"].value_counts()

class_id
6    1000
0     959
3     938
1     925
5     709
2     648
4     567
Name: count, dtype: int64

In [65]:
# Attach image height, width and raw pixel bytes to every dev-set row and
# collect the result in `full_df`.
# Known to be inefficient; should be adapted to the images_to_parquet.py
# approach (Spark does not run on the author's local machine).

import pyarrow as pa
import pyarrow.parquet as pq
from PIL import Image
from tqdm import tqdm

IMG_DIR = "../data/DF/"

df_records = dev_df.to_dict("records")

records = []

for r in tqdm(df_records):
    img_name = r['image_path']
    prefix = img_name.split("-")[0]
    # A 10-character prefix before the first "-" selects the paths whose
    # "JPG" is rewritten to "jpg"; all other paths are used unchanged.
    relative_path = img_name.replace("JPG", "jpg") if len(prefix) == 10 else img_name
    image_path = IMG_DIR + relative_path
    with Image.open(image_path) as im:
        r["img_height"] = im.height
        r["img_widgth"] = im.width
        r["data"] = im.tobytes()
    records.append(r)

full_df = pd.DataFrame(records)

100%|██████████| 5746/5746 [00:09<00:00, 600.62it/s]


In [68]:
# Display the assembled frame (pandas truncates the rendered rows and columns).
full_df

Unnamed: 0,observationID,month,day,countryCode,locality,level0Gid,level0Name,level1Gid,level1Name,level2Gid,...,kingdom_numerical,phylum_numerical,class_numerical,order_numerical,family_numerical,genus_numerical,species_numerical,img_height,img_widgth,data
0,2856922310,10.0,6.0,DK,Slagslunde Skov,DNK,Denmark,DNK.1_1,Hovedstaden,DNK.1.8_1,...,1,2,0,1,4,3,6,225,300,"b',G>+I?-KA.LB/MC.LB.LB,LA/OD/OD0PE1RG1RG2SH2S..."
1,2238476158,9.0,26.0,DK,Bøtø Plantage,DNK,Denmark,DNK.4_1,Sjælland,DNK.4.3_1,...,1,2,0,1,4,3,6,200,300,b'\x96\x86J\x9a\x8cO\xab\x9ca\x83w;aU\x19_U\x1...
2,2238480549,11.0,1.0,DK,"Gribskov, Kagerup Station",DNK,Denmark,DNK.1_1,Hovedstaden,DNK.1.16_1,...,1,2,0,1,4,3,6,533,300,b'\x1dUdBy~C{p:qR(^-\x17I\x0e%S\x15\x19B\n D\x...
3,2238359812,11.0,30.0,DK,Gjorslev Bøgeskov,DNK,Denmark,DNK.4_1,Sjælland,DNK.4.16_1,...,1,2,0,1,4,3,6,225,300,"b'C""\x19:\x1a\x0fF&\x1bJ,!S5*L0$1\x15\t9\x1e\x..."
4,2238034375,7.0,25.0,DK,Gatten Plantage,DNK,Denmark,DNK.3_1,Nordjylland,DNK.3.11_1,...,1,2,0,1,4,3,6,238,300,b'^UNmd]XQK\x1d\x18\x12$!\x1a\x01\x01\x00\x19\...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5741,3126942337,6.0,4.0,DK,Engene,DNK,Denmark,DNK.5_1,Syddanmark,DNK.5.18_1,...,1,2,0,46,105,144,240,237,300,b'\x16\x1a\x0b\x15\x19\n\x13\x17\t\x12\x16\x08...
5742,3131207321,6.0,10.0,DK,Bagholt Mose (Munkeskov),DNK,Denmark,DNK.4_1,Sjælland,DNK.4.1_1,...,1,2,0,46,105,144,240,400,300,b'\x9c\xa4N\x8f\x97D\x8e\x96I}\x87B~\x88L\xa2\...
5743,3113023333,5.0,14.0,DK,"Pinseskoven, Vestamager",DNK,Denmark,DNK.1_1,Hovedstaden,DNK.1.29_1,...,1,2,0,46,105,144,240,238,300,b'\xa3\x93q\xa1\x91o\xbe\xab\x8a\xa0\x8ck\x97\...
5744,3122828340,5.0,27.0,DK,Klosterhede Plantage,DNK,Denmark,DNK.2_1,Midtjylland,DNK.2.8_1,...,1,2,0,46,105,144,240,400,300,b'kx\x81mz\x83p}\x86t\x81\x8aw\x84\x8dy\x86\x8...


In [78]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on class_id so both parts keep the class
# proportions. random_state pins the shuffle so the written train/val files
# are the same on every run instead of changing with each execution.
train, val = train_test_split(
    full_df,
    test_size=0.2,
    stratify=full_df.class_id,
    random_state=42,
)

In [79]:
# Convert both dev splits to Arrow tables (dropping the pandas index) and
# write one parquet file per split.
_dev_train, _dev_val = (
    pa.Table.from_pandas(split_df, preserve_index=False) for split_df in (train, val)
)
pq.write_table(_dev_train, "../data/dev_train.parquet")
pq.write_table(_dev_val, "../data/dev_val.parquet")