In [28]:
import pandas as pd
import pygeohash as pgh
from tqdm import tqdm

In [40]:
# Locations of the three per-split parquet files (relative to this notebook).
DINO_TRAIN = "../production_dino_cls_DF_300_train.parquet"
DINO_VAL = "../production_dino_cls_DF_300_valid.parquet"
DINO_TEST = "../production_dino_cls_DF_300_test.parquet"

# Read each split into its own frame; the individual frames stay available
# for cells that only need one split.
train_df, val_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (DINO_TRAIN, DINO_VAL, DINO_TEST)
)

# Stack the splits row-wise in train / valid / test order. Index labels from
# each file are kept as they are (no ignore_index).
df_all = pd.concat([train_df, val_df, test_df])

In [37]:
# Geohash work:
# pgh.encode turns a (latitude, longitude) pair into a geohash string; the
# sample below comes out 12 characters long. Only a short slice of that string
# is kept downstream.
# NOTE(review): this note originally said "the 5 middle characters", but the
# encoding loop later in this notebook slices gh[1:5], i.e. 4 characters
# (positions 1-4) -- confirm which was intended.

pgh.encode(train_df.Latitude[4], train_df.Longitude[4])

'u1z753uvk6t5'

In [38]:
def geohash_base32_to_int(geohash):
    """Read a geohash string as a base-32 number and return it as an int.

    Characters are consumed left to right, most significant first, using the
    alphabet ``0123456789bcdefghjkmnpqrstuvwxyz``. An empty input yields 0.

    Raises:
        ValueError: if a character is not found in the alphabet.
    """
    alphabet = '0123456789bcdefghjkmnpqrstuvwxyz'
    value = 0
    for symbol in geohash:
        # Each symbol contributes 5 bits; shifting left by 5 equals multiplying by 32.
        value = (value << 5) + alphabet.index(symbol)
    return value

In [41]:
# Encode every row's coordinates as a geohash, keep characters at positions
# 1-4 (four characters, skipping the first), and convert that slice to an int.
# Walking the two coordinate columns with zip avoids building a Series per row
# the way iterrows() does, and passing `total` lets tqdm show a percentage/ETA.
geohash_list = [
    geohash_base32_to_int(pgh.encode(lat, lon)[1:5])
    for lat, lon in tqdm(
        zip(df_all.Latitude, df_all.Longitude),
        total=len(df_all),
    )
]

# Values are assigned positionally, in the same row order as df_all.
df_all['geohash_int'] = geohash_list


350974it [00:09, 36388.67it/s]


In [75]:
# Names of the categorical columns that get one-hot encoded (kept as a tuple).
input_onehot_columns = ("Substrate", "Habitat", "MetaSubstrate")

In [85]:
# One indicator column per category value, named "<prefix>_<value>" and cast
# to integers, for each of the three categorical columns.
substrate_onehot = pd.get_dummies(
    df_all["Substrate"], prefix="substrate"
).astype("int")
metasubstrate_onehot = pd.get_dummies(
    df_all["MetaSubstrate"], prefix="metasubstrate"
).astype("int")
habitat_onehot = pd.get_dummies(
    df_all["Habitat"], prefix="habitat"
).astype("int")

In [103]:
# Divide the integer geohash by the largest value present in df_all, i.e. the
# maximum is taken over the train, valid and test rows together.
df_all["geohash_int_normalized"] = df_all["geohash_int"].div(df_all["geohash_int"].max())

In [106]:
# Columns copied from df_all as model inputs (the one-hot columns are added separately).
INPUT_COLUMNS = [
    "image_path",
    "observationID",
    "embedding",
    "month",
    "geohash_int_normalized",
]

# Label / bookkeeping columns copied from df_all; "dataset" is what the export
# step uses to separate the splits again.
OUTPUT_COLUMNS = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
    "poisonous",
    "class_id",
    "dataset",
]

In [110]:
# Assemble the final table column-wise: inputs, the three one-hot blocks, then
# the label columns.
# Every piece is derived from df_all and so carries the same index, which lets
# concat(axis=1) line rows up one-to-one. Chained .join() calls match on index
# labels instead, and would multiply rows whenever a label repeats -- which can
# happen because df_all stacks three frames without resetting their index.
# verify_integrity=True raises ValueError if a column name would appear twice,
# matching join's refusal to merge overlapping column names.
df_preprocessed = pd.concat(
    [
        df_all[INPUT_COLUMNS],
        substrate_onehot,
        metasubstrate_onehot,
        habitat_onehot,
        df_all[OUTPUT_COLUMNS],
    ],
    axis=1,
    verify_integrity=True,
)

# Sanity check: assembling columns must never change the number of rows.
assert len(df_preprocessed) == len(df_all)

In [111]:
# Write one parquet file per split, selecting rows by the value in the
# "dataset" column.
for split_name in ("train", "valid", "test"):
    split_rows = df_preprocessed[df_preprocessed["dataset"] == split_name]
    split_rows.to_parquet(f"../DF_300_metadata_{split_name}.parquet")