In [1]:
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

## Steps
1. Join the kaggle contest - https://www.kaggle.com/competitions/planttraits2024
2. Install kaggle cli - https://github.com/Kaggle/kaggle-api/blob/main/docs/README.md
3. Download the data - `kaggle competitions download -c planttraits2024`
4. Unzip the data
5. Install FGVC repo - `pip install -e .` and `pip install -r requirement.txt` in the desired env
5. Train the model

## Setup

In [2]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from typing import List, Union, Callable 

import fiftyone as fo
import fiftyone.core.fields as fof
import fiftyone.brain as fob
import fiftyone.zoo as foz
from fiftyone import ViewField as F
from fiftyone.core.labels import Attribute
from torchmetrics.functional import r2_score


In [3]:
label_col = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
label_col_pred = ['X4_mean_pred', 'X11_mean_pred', 'X18_mean_pred', 'X50_mean_pred', 'X26_mean_pred', 'X3112_mean_pred']
delta_cols = ['X4_delta', 'X11_delta', 'X18_delta', 'X50_delta', 'X26_delta', 'X3112_delta']
target_col = ['X4', 'X11', 'X18', 'X50', 'X26', 'X3112']

In [4]:
df_complete = pd.read_csv('/home/ubuntu/FGVC11/data/PlantTrait/df_complete.csv')
df_complete[delta_cols] = abs(df_complete[label_col].values - df_complete[label_col_pred].values)
df_test = pd.read_csv('/home/ubuntu/FGVC11/data/PlantTrait/test.csv')
df_test['path'] = '/home/ubuntu/FGVC11/data/PlantTrait/test_images/' + df_test['id'].astype(str) + '.jpeg'
pred_test = pd.read_csv('mapped_sub.csv')
pred_test["path"] = df_test["path"]
# convert pred test columns from target columns to pred Columns name
pred_test[label_col_pred] = pred_test[target_col]

In [25]:
df_complete[label_col].describe()

Unnamed: 0,X4_mean,X11_mean,X18_mean,X50_mean,X26_mean,X3112_mean
count,5687.0,5687.0,5687.0,5687.0,5687.0,5687.0
mean,0.598138,1154.986,192183.4,118.129772,33554.84,3865243.0
std,0.356051,39915.61,7213103.0,4236.061114,771547.1,285889700.0
min,-2.431157,6.78e-05,2.33e-08,9.7e-05,5.5e-07,7.69e-08
25%,0.443771,6.635434,0.6394602,1.079218,2.126297,327.3334
50%,0.590883,11.67053,6.075979,1.706577,14.76781,1965.398
75%,0.768214,23.13592,18.63785,2.621856,296.7606,7858.094
max,4.475172,1504254.0,272049400.0,159759.8977,31065550.0,21559110000.0


In [26]:
df_proc = pd.read_csv("/home/ubuntu/FGVC11/data/PlantTrait/df_processed.csv")

In [27]:
df_proc[label_col].describe()

Unnamed: 0,X4_mean,X11_mean,X18_mean,X50_mean,X26_mean,X3112_mean
count,55554.0,55554.0,55554.0,55554.0,55554.0,55554.0
mean,0.452789,13.907804,2.252462,1.393314,16.969752,1300.792687
std,0.209653,8.189753,4.04912,0.748973,49.908438,1951.040578
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.368824,8.590398,0.20196,1.022802,0.202938,143.250464
50%,0.473778,14.343725,0.510616,1.393763,1.45699,519.442038
75%,0.598398,18.934532,2.013909,1.806646,8.489919,1605.33266
max,0.8566,40.478892,23.294858,3.584879,492.559161,12978.600313


In [7]:
df_train = df_complete.copy()
take_index = []
for col in label_col:
    upper_quantile = df_train[col].quantile(0.98)  
    take_index += list(df_train[(df_train[col] > upper_quantile)].index.values)
    take_index += list(df_train[(df_train[col] < 0)].index.values)  
    # break

# r2_score(torch.tensor(df_train[label_col_pred].values), torch.tensor(df_train[label_col].values))

In [8]:
len(take_index)

6813

In [9]:
take_index = set(take_index)
len(take_index)

5687

In [11]:
take_index = list(take_index)
df_train = df_train.iloc[take_index] 

In [14]:
df_train.reset_index(drop=True, inplace=True)

In [16]:
r2_score(torch.tensor(df_train[label_col_pred].values), torch.tensor(df_train[label_col].values))

tensor(0.0147, dtype=torch.float64)

In [None]:
df_complete[label_col][df_complete.path == "/home/ubuntu/FGVC11/data/PlantTrait/train_images/196678324.jpeg"]

In [None]:
df_complete

In [None]:
df_complete.shape, df_test.shape

In [17]:
if "plant_trait" in fo.list_datasets():
    dataset = fo.load_dataset("plant_trait")
    dataset.delete()
    
samples = []
df_complete = df_train.copy()
# go through all the tiles
for _, row in tqdm(df_complete.iterrows(), total=len(df_complete)):
    
    # add images
    sample = fo.Sample(filepath=row['path'])
    for col in label_col + label_col_pred + delta_cols:
        sample[col] = row[col]
    sample["split"] = "train"
    samples.append(sample)

for _, row in tqdm(pred_test.iterrows(), total=len(pred_test)):
    
    # add images
    sample = fo.Sample(filepath=row['path'])
    for col in label_col_pred:
        sample[col] = row[col]
    sample["split"] = "test"
    samples.append(sample)

# initialize the dataset (restart the notebook if you face and error) 
dataset = fo.Dataset("plant_trait")
dataset.add_samples(samples)

100%|██████████| 5687/5687 [00:01<00:00, 4048.85it/s]
100%|██████████| 6545/6545 [00:00<00:00, 7652.54it/s]


 100% |█████████████| 12232/12232 [3.1s elapsed, 0s remaining, 4.3K samples/s]      


['66209b9856727b419866e40a',
 '66209b9856727b419866e40b',
 '66209b9856727b419866e40c',
 '66209b9856727b419866e40d',
 '66209b9856727b419866e40e',
 '66209b9856727b419866e40f',
 '66209b9856727b419866e410',
 '66209b9856727b419866e411',
 '66209b9856727b419866e412',
 '66209b9856727b419866e413',
 '66209b9856727b419866e414',
 '66209b9856727b419866e415',
 '66209b9856727b419866e416',
 '66209b9856727b419866e417',
 '66209b9856727b419866e418',
 '66209b9856727b419866e419',
 '66209b9856727b419866e41a',
 '66209b9856727b419866e41b',
 '66209b9856727b419866e41c',
 '66209b9856727b419866e41d',
 '66209b9856727b419866e41e',
 '66209b9856727b419866e41f',
 '66209b9856727b419866e420',
 '66209b9856727b419866e421',
 '66209b9856727b419866e422',
 '66209b9856727b419866e423',
 '66209b9856727b419866e424',
 '66209b9856727b419866e425',
 '66209b9856727b419866e426',
 '66209b9856727b419866e427',
 '66209b9856727b419866e428',
 '66209b9856727b419866e429',
 '66209b9856727b419866e42a',
 '66209b9856727b419866e42b',
 '66209b985672

In [18]:
# launch voxel51 on the desired port
session = fo.launch_app(dataset, port=5151)

In [23]:
# # choose the model for generating the embeddings
# model = foz.load_zoo_model("dinov2-vitl14-torch")

# emb = fob.compute_visualization(
#     model=model,
#     samples=dataset,
#     num_dims=2,
#     brain_key=f"dino",
#     verbose=True,
#     seed=51,
# )

In [20]:
# choose the model for generating the embeddings
model = foz.load_zoo_model("clip-vit-base32-torch")

emb = fob.compute_visualization(
    model=model,
    samples=dataset,
    num_dims=2,
    brain_key=f"clip",
    verbose=True,
    seed=51,
)

Computing embeddings...
 100% |█████████████| 12232/12232 [1.3m elapsed, 0s remaining, 137.5 samples/s]      
Generating visualization...


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


UMAP(random_state=51, verbose=True)
Thu Apr 18 04:06:41 2024 Construct fuzzy simplicial set
Thu Apr 18 04:06:41 2024 Finding Nearest Neighbors
Thu Apr 18 04:06:41 2024 Building RP forest with 11 trees
Thu Apr 18 04:06:44 2024 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	 4  /  14
	 5  /  14
	Stopping threshold met -- exiting after 5 iterations
Thu Apr 18 04:06:54 2024 Finished Nearest Neighbor Search
Thu Apr 18 04:06:56 2024 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Thu Apr 18 04:07:02 2024 Finished embedding


In [None]:
/home/ubuntu/FGVC11/data/PlantTrait/test_images/202152252.jpeg
/home/ubuntu/FGVC11/data/PlantTrait/test_images/202380494.jpeg

In [None]:
202152252	0.4458284815	12.00753201	5.890396804	1.764895206	44.55121753	3587.80359325
202380494	0.4183258879	12.69283430	4.525760049	1.611317929	47.24508166	3347.0889295

In [None]:
fob.compute_similarity(
    dataset,
    model="clip-vit-base32-torch",
    brain_key="um",
    # backend="qdrant"
)

In [None]:
# bin df_train["X4_mean"] into 10 bins while keeping the distribution same in each
for col in label_col:
    bin_column_name = f"{col}_bin"
    df_train[bin_column_name] = "unknown"
    df_train[bin_column_name][df_train[col]>=0], cats = pd.qcut(
        df_train[col][df_train[col]>=0], q=5, labels=False, precision=3, retbins=True)
    # use cats to change the values of bin column with range
    cats = np.round(cats, 2)
    cats = [f"({cats[i]}, {cats[i+1]})" for i in range(len(cats)-1)]
    df_train[bin_column_name] = df_train[bin_column_name].map({i: cats[i] for i in range(len(cats))}) 
    df_train[bin_column_name][df_train[col]<0] = "negative"

In [None]:
for sample in tqdm(dataset):
    df_slice = df_train.loc[df_train['path'] == sample.filepath]
    for col in label_col:
        bin_column_name = f"{col}_bin"
        sample[bin_column_name] = df_slice[bin_column_name].values[0]
    sample.save()    

In [None]:
bin_columns = [f"{col}_bin" for col in label_col]

In [None]:
df_train['species'] = df_train[bin_columns].apply(lambda x: '_'.join(x.astype(str)), axis=1)


In [None]:
df_train.species.value_counts()

In [None]:
for sample in tqdm(dataset):
    df_slice = df_train.loc[df_train['path'] == sample.filepath]
    sample["species"] = df_slice["species"].values[0]
    sample.save()    

In [None]:
df_train.to_csv('/home/ubuntu/FGVC11/data/PlantTrait/df_train_binned.csv', index=False)

In [None]:
sub  = pd.read_csv('edit_sub.csv')

In [None]:
sub[sub.columns[:-1]].to_csv('edit_sub_corr.csv', index=False)

In [None]:
!kaggle competitions submit -c planttraits2024 -f edit_sub_corr.csv -m "manual addition of corrpted data"