In [None]:
# Notebook setup: switch Jedi off for the tab completer and load the
# autoreload extension in mode 2, so edits to imported modules are picked up
# without restarting the kernel.
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

In [None]:
import hydra
import matplotlib.pyplot as plt
import numpy as np
import omegaconf
import pandas as pd
import seaborn as sns
import torch
import xgboost as xgb
from lightning import Callback, LightningDataModule, LightningModule, Trainer
from omegaconf import DictConfig
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from fgvc.data.plant_traits_data import *

In [None]:
# Two rows are considered the same species only when all of these trait means
# are identical.
trait_columns = [
    'X4_mean',
    'X11_mean',
    'X18_mean',
    'X50_mean',
    'X26_mean',
    'X3112_mean',
]
# Companion columns: the same names with "mean" swapped for "sd".
aux_columns = [trait_name.replace("mean", "sd") for trait_name in trait_columns]

In [None]:
# Root folder of the PlantTrait dataset. Defined once so the location can be
# changed in a single place instead of in every path built below.
DATA_DIR = '/home/ubuntu/FGVC11/data/PlantTrait'

# Training table, plus a `path` column pointing at each record's image
# (<DATA_DIR>/train_images/<id>.jpeg).
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_train['path'] = f'{DATA_DIR}/train_images/' + df_train['id'].astype(str) + '.jpeg'

# Test table with the same `path` convention, rooted at test_images/.
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_test['path'] = f'{DATA_DIR}/test_images/' + df_test['id'].astype(str) + '.jpeg'

In [None]:
# Metadata columns are whatever remains in df_train once the identifiers
# ("id", "path"), the trait columns and their "sd" companions are dropped.
non_metadata_cols = ["id", "path"] + trait_columns + aux_columns
metadata_cols = df_train.drop(columns=non_metadata_cols).columns

### Cleaning DataFrames

In [None]:
# Outlier filter: for each trait keep only the rows whose value is strictly
# between 0 and that trait's 98th percentile.
# df_train is reassigned on every pass, so each percentile is computed over
# the rows that survived the filters of the traits processed before it.
for trait in trait_columns:
    cutoff = df_train[trait].quantile(0.98)
    keep_rows = (df_train[trait] < cutoff) & (df_train[trait] > 0)
    df_train = df_train[keep_rows]

### Train/Val/Test Splits

In [None]:
# Tag each row with a species id: rows sharing the same values in every trait
# column get the same group number, which is stored as a string.
species_group = df_train.groupby(trait_columns).ngroup()
df_train['species'] = species_group.astype(str)
species_counts = df_train['species'].nunique()

print(f"{species_counts} unique species found in {len(df_train)} records")

In [None]:
# Number of distinct species ids; shown as the cell output.
df_train['species'].nunique()

In [None]:
# Per-species train/validation split.
# Every row starts as 'train'; then, for each species, a random sample of
# VAL_FRACTION of its rows is relabelled 'val'. The sample size is rounded
# down, so a species with fewer than 4 rows contributes no validation rows.
VAL_FRACTION = 0.3
SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the same split
split_rng = np.random.default_rng(SPLIT_SEED)

df_train['split'] = 'train'

# species id -> index labels of the rows drawn for validation.
# `groupby(...).groups` yields each species' row labels in a single pass
# instead of re-scanning the whole frame with a boolean mask per species.
species_indices = {}
species_rows = df_train.groupby('species', sort=False).groups
for species, row_labels in tqdm(species_rows.items(), total=len(species_rows)):
    n_val = int(len(row_labels) * VAL_FRACTION)
    species_indices[species] = split_rng.choice(
        row_labels.to_numpy(), size=n_val, replace=False
    )

# Relabel the sampled rows. Guarded because np.concatenate raises on an empty
# list, which is what an empty df_train would produce.
if species_indices:
    val_labels = np.concatenate(list(species_indices.values()))
    df_train.loc[val_labels, 'split'] = 'val'


In [None]:
# Row count per split label; shown as the cell output.
df_train['split'].value_counts()

In [None]:
# Label every test row, then stack the train/val rows and the test rows into
# a single frame with a fresh 0..n-1 index.
df_test["split"] = "test"
df_full = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [None]:
# Replace missing values with 0, in place.
# NOTE(review): the fill is applied to the whole frame, i.e. to every column
# of df_full and not only to the metadata columns. Presumably this also
# zero-fills columns that exist only in df_train for the test rows — confirm
# that downstream code expects that.
df_full.fillna(0, inplace=True)

### Normalize Metadata Columns

In [None]:
# Scaler used to standardise the metadata columns.
# StandardScaler comes from the notebook's top import cell; the duplicate
# import that used to sit in this cell was redundant.
scale = StandardScaler()

In [None]:
# Standardise the metadata columns, overwriting them in df_full. The scaler is
# fitted on all rows of df_full, i.e. on the train, val and test rows together.
df_full[metadata_cols] = scale.fit_transform(df_full[metadata_cols])

In [None]:
# Persist the processed frame (splits assigned, NA filled, metadata scaled)
# as CSV, without writing the index.
processed_csv_path = '/home/ubuntu/FGVC11/data/PlantTrait/df_processed.csv'
df_full.to_csv(processed_csv_path, index=False)

### Label Encoder

In [None]:
from fgvc.models.plant_traits_model import LabelEncoder

In [None]:
# Instantiate the project's LabelEncoder (imported from
# fgvc.models.plant_traits_model) with its default arguments.
le = LabelEncoder()

In [None]:
# Visual check of the label transform: one histogram grid for the trait
# columns as they are in df_train ...
df_train[trait_columns].hist(bins=50, figsize=(10, 8))
plt.show()

# ... and one for the same columns after `le.transform`.
# The input tensor is built with torch.tensor and an explicit dtype instead of
# the legacy torch.Tensor constructor; the default dtype is used so the result
# matches what torch.Tensor produced.
X = torch.tensor(df_train[trait_columns].values, dtype=torch.get_default_dtype())
# presumably a tensor/array with one column per trait — TODO confirm against LabelEncoder
transformed = le.transform(X)
t = pd.DataFrame(transformed, columns=trait_columns)
t.hist(bins=50, figsize=(10, 8))
plt.show()