In [1]:
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

In [2]:
import hydra
import matplotlib.pyplot as plt
import numpy as np
import omegaconf
import pandas as pd
import seaborn as sns
import torch
import xgboost as xgb
from lightning import Callback, LightningDataModule, LightningModule, Trainer
from omegaconf import DictConfig
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from fgvc.data.plant_traits_data import *

In [3]:
# Rows whose values agree in all of these trait columns are considered the same species.
trait_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
# Companion columns: the same names with "mean" replaced by "sd".
aux_columns = [name.replace("mean", "sd") for name in trait_columns]

In [4]:
# Load the previously processed table and summarise the trait columns.
processed_csv = '/home/ubuntu/FGVC11/data/PlantTrait/df_processed.csv'
df_full = pd.read_csv(processed_csv)
df_full.loc[:, trait_columns].describe()

Unnamed: 0,X4_mean,X11_mean,X18_mean,X50_mean,X26_mean,X3112_mean
count,55554.0,55554.0,55554.0,55554.0,55554.0,55554.0
mean,0.452789,13.907804,2.252462,1.393314,16.969752,1300.792687
std,0.209653,8.189753,4.04912,0.748973,49.908438,1951.040578
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.368824,8.590398,0.20196,1.022802,0.202938,143.250464
50%,0.473778,14.343725,0.510616,1.393763,1.45699,519.442038
75%,0.598398,18.934532,2.013909,1.806646,8.489919,1605.33266
max,0.8566,40.478892,23.294858,3.584879,492.559161,12978.600313


In [None]:
# Read the train and test tables and attach an image path built from each row's id.
df_train = pd.read_csv('/home/ubuntu/FGVC11/data/PlantTrait/train.csv').assign(
    path=lambda frame: '/home/ubuntu/FGVC11/data/PlantTrait/train_images/' + frame['id'].astype(str) + '.jpeg'
)

df_test = pd.read_csv('/home/ubuntu/FGVC11/data/PlantTrait/test.csv').assign(
    path=lambda frame: '/home/ubuntu/FGVC11/data/PlantTrait/test_images/' + frame['id'].astype(str) + '.jpeg'
)

In [None]:
# Metadata columns are whatever remains once the id, the image path,
# the trait means and the trait sds are removed.
non_metadata = ["id", "path"] + trait_columns + aux_columns
metadata_cols = df_train.columns.drop(non_metadata)

### Cleaning DataFrames

In [None]:
# Summary statistics of the trait columns as loaded from train.csv, before any filtering.
df_train[trait_columns].describe()

In [None]:
# Trim outliers one trait at a time: keep only rows strictly between that
# trait's 0.5% and 99% quantiles. The quantiles are computed on the rows that
# survived the trimming of the previous traits, so the column order matters.
for trait in trait_columns:
    values = df_train[trait]
    low_cut = values.quantile(0.005)
    high_cut = values.quantile(0.99)
    df_train = df_train[(values < high_cut) & (values > low_cut)]

In [None]:
# Same summary after the quantile filter, to inspect its effect on the trait ranges.
df_train[trait_columns].describe()

In [None]:
# Summary statistics of the companion "*_sd" columns on the rows kept by the filter.
df_train[aux_columns].describe()

In [None]:
# Drop rows with a missing value in any trait column.
# `log_columns` is only defined in commented-out code, so referencing it raised
# NameError on a fresh kernel; the trait columns themselves are used instead.
# Reassigning (rather than inplace=True) avoids mutating a filtered slice.
df_train = df_train.dropna(subset=trait_columns)

### Train/Val/Test Splits

In [None]:
# Number each distinct combination of trait values and store that number,
# as a string, in a "species" column.
df_train['species'] = df_train.groupby(trait_columns).ngroup().astype(str)
species_counts = df_train['species'].nunique()

print(f"{species_counts} unique species found in {len(df_train)} records")

In [None]:
# Number of distinct species ids (the same quantity stored in species_counts above).
df_train['species'].nunique()

In [None]:
# Add a split column and hold out a validation subset within every species.
VAL_FRACTION = 0.3  # share of each species' rows moved to validation
SPLIT_SEED = 42     # fixed seed so the split that gets saved is reproducible
split_rng = np.random.default_rng(SPLIT_SEED)

df_train['split'] = 'train'

# Map each species to the row labels drawn for validation. Grouping once
# replaces filtering the whole frame twice per species.
species_indices = {}
for species, row_labels in tqdm(df_train.groupby('species', sort=False).groups.items()):
    # int() floors, so a species with fewer than 4 rows contributes no validation rows.
    n_val = int(len(row_labels) * VAL_FRACTION)
    species_indices[species] = split_rng.choice(row_labels, size=n_val, replace=False)

# Mark the drawn rows as validation. The guard covers an empty frame,
# where np.concatenate would raise on an empty list.
if species_indices:
    val_index = np.concatenate(list(species_indices.values()))
    df_train.loc[val_index, 'split'] = 'val'


In [None]:
# Row counts per split value in df_train.
df_train['split'].value_counts()

In [None]:
# Tag the test rows, then stack them under the train/val rows with a fresh 0..n-1 index.
df_test = df_test.assign(split="test")
df_full = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [None]:
# Replace every missing value in df_full with 0.
# NOTE(review): this applies to all columns, not only the metadata columns;
# presumably columns absent from test.csv (traits, species) also become 0
# for the test rows. Confirm this is intended.
df_full = df_full.fillna(0)

In [None]:
# Row counts per split value after the test rows were appended.
df_full['split'].value_counts()

### Normalize Metadata Columns

In [None]:
# # Shift each column in metadata_cols so all values are positive
# for col in metadata_cols:
#     min_value = df_full[col].min()
#     if min_value < 0:
#         df_full[col] += abs(min_value)

In [None]:
# StandardScaler is already imported in the first import cell; MinMaxScaler is
# imported here but not used in the visible cells.
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Scaler applied to the metadata columns in the next cell.
scale = StandardScaler()

In [None]:
# Standardise the metadata columns in place. The scaler is fit on every row of
# df_full, i.e. on the train, val and test rows together.
df_full[metadata_cols] = scale.fit_transform(df_full[metadata_cols])

In [None]:
# Sanity check: display any rows that still contain a missing value.
df_full[df_full.isnull().any(axis=1)]

In [None]:
# Write the processed frame to disk; index=False leaves the row index out of the file.
df_full.to_csv('/home/ubuntu/FGVC11/data/PlantTrait/df_processed_v2.csv', index=False)

### Label Encoder

In [None]:
# Per-trait maximum of log10(value + 1e-6) over the non-test rows, as a list.
labelled_traits = df_full.loc[df_full.split != "test", trait_columns]
list(np.log10(labelled_traits + 1e-6).max())

In [None]:
from fgvc.models.plant_traits_model import LabelEncoder, MinMaxLabelEncoder

In [None]:
# Instantiate the project's LabelEncoder with its default arguments.
le = LabelEncoder()

In [None]:
# Compare the trait distributions of the non-test rows before and after the
# label transform. The row/column selection is done once with .loc instead of
# repeating chained indexing.
labelled_traits = df_full.loc[df_full.split != "test", trait_columns]

labelled_traits.hist(bins=50, figsize=(10, 8))
plt.show()

# torch.tensor with an explicit dtype replaces the legacy torch.Tensor
# constructor; the result is float32 either way.
X = torch.tensor(labelled_traits.values, dtype=torch.float32)
t = le.transform(X)
# NOTE(review): assumes le.transform returns one column per trait in a form
# pd.DataFrame accepts. Confirm against LabelEncoder.
t = pd.DataFrame(t, columns=trait_columns)
t.hist(bins=50, figsize=(10, 8))
plt.show()

In [None]:
# Column labels at positions 1..163.
# NOTE(review): the slice bounds are hard-coded; presumably this is meant to
# span the metadata columns. Confirm against metadata_cols.
df_full.columns[1:164]

In [None]:
# Replace the in-memory df_full with the table stored on disk.
# NOTE(review): this reads df_processed.csv, not the df_processed_v2.csv written
# earlier in this notebook. Confirm which file is intended.
df_full = pd.read_csv('/home/ubuntu/FGVC11/data/PlantTrait/df_processed.csv')

In [None]:
# Summary statistics of the trait columns in the reloaded table.
# (The original called the misspelled `.descirb()`, which raises AttributeError.)
df_full[trait_columns].describe()

In [None]:
# One row per species: the first non-null value of each trait within the
# species group, with "species" restored as an ordinary column.
first_per_species = df_full.groupby('species')[trait_columns].first()
df_species_traits = first_per_species.reset_index()

In [None]:
# Write the per-species trait table to disk without the row index.
df_species_traits.to_csv('/home/ubuntu/FGVC11/data/PlantTrait/species_traits.csv', index=False)

In [None]:
# Trait values as a float32 tensor: one row per row of df_species_traits,
# one column per entry of trait_columns.
specie_traits = torch.tensor(df_species_traits[trait_columns].values, dtype=torch.float32)

In [None]:
# Serialize the trait tensor to disk with torch.save.
torch.save(specie_traits, '/home/ubuntu/FGVC11/data/PlantTrait/specie_traits.pt')

In [None]:
# Preview the first rows of the trait tensor.
# The original indexed with `a`, a name never defined in this notebook
# (leftover kernel state), which raises NameError on a fresh run.
specie_traits[:5]

In [None]:
# Number of validation rows per species.
df_full["species"][df_full.split == "val"].value_counts()

In [None]:
# Smallest and largest species id among the non-test rows.
labelled_species = df_full.loc[df_full.split != "test", "species"].unique()
min(labelled_species), max(labelled_species)

In [None]:
# Number of distinct species ids among the non-test rows.
df_full.loc[df_full.split != "test", "species"].nunique()