In [None]:
# Turn off Jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False
# Load the autoreload extension and use mode 2, which reloads imported modules
# before each cell runs so edits to source files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

## Steps
1. Join the Kaggle competition - https://www.kaggle.com/competitions/planttraits2024
2. Install the Kaggle CLI - https://github.com/Kaggle/kaggle-api/blob/main/docs/README.md
3. Download the data - `kaggle competitions download -c planttraits2024`
4. Unzip the data
5. Install the FGVC repo - run `pip install -e .` and `pip install -r requirement.txt` in the desired env
6. Train the model

## Setup

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from typing import List, Union, Callable

import fiftyone as fo
import fiftyone.core.fields as fof
import fiftyone.brain as fob
import fiftyone.zoo as foz
from fiftyone import ViewField as F
from fiftyone.core.labels import Attribute


In [None]:
# Names of the target columns in train.csv that the rest of the notebook filters, bins and visualizes.
label_col = [
    'X4_mean',
    'X11_mean',
    'X18_mean',
    'X26_mean',
    'X50_mean',
    'X3112_mean',
]

In [None]:
# Training metadata: read the competition CSV, attach the absolute image path built
# from each row's `id`, and persist the augmented table next to the raw data.
df_train = pd.read_csv('/home/ubuntu/FGVC11/data/PlantTrait/train.csv').assign(
    path=lambda frame: '/home/ubuntu/FGVC11/data/PlantTrait/train_images/' + frame['id'].astype(str) + '.jpeg'
)
df_train.to_csv('/home/ubuntu/FGVC11/data/PlantTrait/df_train.csv', index=False)

# Test metadata: same treatment, pointing at the test image folder.
df_test = pd.read_csv('/home/ubuntu/FGVC11/data/PlantTrait/test.csv').assign(
    path=lambda frame: '/home/ubuntu/FGVC11/data/PlantTrait/test_images/' + frame['id'].astype(str) + '.jpeg'
)
df_test.to_csv('/home/ubuntu/FGVC11/data/PlantTrait/df_test.csv', index=False)

In [None]:
# Trim the upper tail of every label column: keep only rows strictly below that
# column's 98th percentile.
#
# NOTE: `df_train` is reassigned inside the loop, so each column's percentile is
# computed on the rows that survived the previous columns' filters, and re-running
# this cell trims the frame further. Rows whose value is NaN fail the comparison
# and are dropped as well.
#
# A positivity filter and a log transform of the labels were tried here earlier
# and are currently disabled.
for trait in label_col:
    cutoff = df_train[trait].quantile(0.98)
    below_cutoff = df_train[trait].lt(cutoff)
    df_train = df_train.loc[below_cutoff]

In [None]:
# Start from a clean slate: if a "plant_trait" dataset is already registered, remove it
# so the dataset can be created again below.
if "plant_trait" in fo.list_datasets():
    fo.load_dataset("plant_trait").delete()

# Build one FiftyOne sample per training row: the image path becomes the sample's
# filepath and every label column is stored as a field of the same name.
samples = []
for _, record in tqdm(df_train.iterrows(), total=len(df_train)):
    image_sample = fo.Sample(filepath=record['path'])
    for trait in label_col:
        image_sample[trait] = record[trait]
    samples.append(image_sample)

# Create the dataset and load all samples (restart the notebook if you face an error).
dataset = fo.Dataset("plant_trait")
dataset.add_samples(samples)

In [None]:
# Open the FiftyOne (Voxel51) app on the dataset; the app is served on port 5151.
session = fo.launch_app(
    dataset,
    port=5151,
)

In [None]:
# Model from the FiftyOne zoo used to generate the image embeddings.
model = foz.load_zoo_model("clip-vit-base32-torch")

# Compute a 2-D visualization of the dataset from the model's embeddings and store
# the result on the dataset under the brain key "emb". The seed is fixed so that
# re-running the cell uses the same random state.
emb = fob.compute_visualization(
    model=model,
    samples=dataset,
    num_dims=2,
    brain_key="emb",  # plain literal: the previous f-string had no placeholders
    verbose=True,
    seed=51,
)

In [None]:
# Bin every label column into 5 equal-frequency (quantile) bins and store, in a new
# "<col>_bin" column, a string describing the value range of the bin each row falls in.
#   - rows with a value >= 0 get a label such as "(0.12, 0.34)" (bin edges rounded to 2 decimals)
#   - rows with a value < 0 get "negative"
#   - all remaining rows (value is NaN) keep the default "unknown"
#
# All writes go through a single `.loc[mask, column]` call. The earlier chained form
# (`df[column][mask] = ...`) triggers SettingWithCopyWarning and does not update the
# frame when pandas Copy-on-Write is enabled. The mapping from bin index to range
# label is applied only to the binned rows, so the "unknown" default is no longer
# turned into NaN by the mapping step.
for col in label_col:
    bin_column_name = f"{col}_bin"
    non_negative = df_train[col] >= 0
    negative = df_train[col] < 0

    df_train[bin_column_name] = "unknown"

    # qcut cannot bin an empty selection, so skip it when no row qualifies.
    if non_negative.any():
        bin_codes, cats = pd.qcut(
            df_train.loc[non_negative, col], q=5, labels=False, precision=3, retbins=True)
        # turn the bin edges into human-readable "(low, high)" range labels
        cats = np.round(cats, 2)
        cats = [f"({cats[i]}, {cats[i+1]})" for i in range(len(cats)-1)]
        # bin_codes holds the integer bin index per row; translate it to its range label
        df_train.loc[non_negative, bin_column_name] = (
            bin_codes.map(dict(enumerate(cats))).to_numpy()
        )

    df_train.loc[negative, bin_column_name] = "negative"

In [None]:
# Copy each label column's bin label from df_train onto the matching FiftyOne sample.
bin_field_names = [f"{col}_bin" for col in label_col]

# Build the path -> {bin column: label} lookup once instead of scanning the whole
# frame for every sample. If a path occurs more than once, the first row wins,
# which matches taking `.values[0]` of a filtered slice.
bins_by_path = (
    df_train.drop_duplicates(subset='path')
    .set_index('path')[bin_field_names]
    .to_dict(orient='index')
)

for sample in tqdm(dataset):
    # Raises KeyError if the sample's file is not listed in df_train.
    sample_bins = bins_by_path[sample.filepath]
    for bin_column_name in bin_field_names:
        sample[bin_column_name] = sample_bins[bin_column_name]
    sample.save()

In [None]:
# Names of the per-label bin columns: each label column name with a "_bin" suffix.
bin_columns = [f"{label_name}_bin" for label_name in label_col]

In [None]:
# Composite key per row: the string form of every bin column joined with "_".
# NOTE(review): despite the name, "species" is derived only from the bin labels.
bin_labels_as_text = df_train[bin_columns].astype(str)
df_train['species'] = bin_labels_as_text.apply('_'.join, axis=1)


In [None]:
# Show how many rows fall into each composite "species" key.
df_train['species'].value_counts()

In [None]:
# Copy the composite "species" key from df_train onto the matching FiftyOne sample.
# The path -> species lookup is built once instead of scanning the whole frame for
# every sample. If a path occurs more than once, the first row wins, which matches
# taking `.values[0]` of a filtered slice.
species_by_path = (
    df_train.drop_duplicates(subset='path')
    .set_index('path')['species']
    .to_dict()
)

for sample in tqdm(dataset):
    # Raises KeyError if the sample's file is not listed in df_train.
    sample["species"] = species_by_path[sample.filepath]
    sample.save()

In [None]:
# Persist the filtered training table, including the bin and species columns, without the index.
df_train.to_csv(
    '/home/ubuntu/FGVC11/data/PlantTrait/df_train_binned.csv',
    index=False,
)