In [None]:
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

In [None]:
import csv
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pickle

import torch
import hydra
import omegaconf
import pyrootutils
from tqdm import tqdm
from glob import glob

import matplotlib.pyplot as plt
from fgvc.data.plant_clef_data import PlantCLEFDataset, PlantSPECIESDataset

In [None]:
# Training metadata table ("LQ" CSV): ';'-separated, with '/' used as the escape character.
df = pd.read_csv(
    "/home/ubuntu/FGVC11/data/PlantClef/PlantCLEFTrainLQ.csv",
    sep=";",
    escapechar="/",
)

In [None]:
# Distinct species ids, in order of first appearance in the table.
species = df.species_id.unique()

In [None]:
# Image paths of one arbitrarily chosen species (position 30 in `species`).
images = df[df.species_id == species[30]]["path"].values

In [None]:
# Preview the first 10 images of the selected species, one figure per image.
for img in images[:10]:
    # The context manager closes the underlying file once the image has been drawn,
    # so repeated runs of this cell do not accumulate open file handles.
    with Image.open(img) as im:
        plt.imshow(im)
        plt.show()

In [None]:
# Rows per (species_id, learn_tag) pair, flattened into a table with a "count" column.
counts = df.groupby("species_id")["learn_tag"].value_counts()
sp_df = pd.DataFrame(counts)
sp_df.columns = ["count"]
sp_df = sp_df.reset_index()

In [None]:
# Number of rows per species, most frequent first.
df["species_id"].value_counts()

### Species Dataset

In [None]:
# Load the pickled encoder from disk.
# NOTE: pickle.load can execute arbitrary code from the file; only use it on trusted files.
with open("/home/ubuntu/FGVC11/data/PlantClef/le.pkl", "rb") as f: 
    le = pickle.load(f)

In [None]:
# `datamodule` is otherwise only created much further down (in "Test Dataset Class"), so on a
# fresh kernel this cell failed with a NameError. Build it here from the same data config so
# the notebook runs top to bottom.
data_cfg = omegaconf.OmegaConf.load("/home/ubuntu/FGVC11/configs/data/plant_clef_data.yaml")
datamodule = hydra.utils.instantiate(data_cfg)
datamodule.setup()

# Species dataset over the "train" rows only; the index is reset so positions run from 0.
ds = PlantSPECIESDataset(
    df=df[df.learn_tag == "train"].reset_index(drop=True),
    label_encoder=le,
    transform=datamodule.transform,
    n_repeat=10,
    )
len(ds)

In [None]:
# Pull one sample from the species dataset and display it.
data = ds[7806]
# permute(1, 2, 0) moves the first axis last; presumably CHW -> HWC for imshow — TODO confirm.
plt.imshow(data["image"].permute(1, 2, 0))
plt.show()
# Show the raw label, its encoded form, and the encoded form decoded back through ds.le.
data["label"], data["encoded_label"], ds.le.inverse_transform(data["encoded_label"].numpy().reshape(1, -1))

### Mosaic Dataset

In [None]:
from fgvc.data.plant_clef_data import PlantMosaicDataset

In [None]:
# Reload the pickled encoder (same file as in the "Species Dataset" section).
# NOTE: pickle.load can execute arbitrary code from the file; only use it on trusted files.
with open("/home/ubuntu/FGVC11/data/PlantClef/le.pkl", "rb") as f:
    le = pickle.load(f)

In [None]:
# Mosaic dataset built from the full frame (no learn_tag filter is applied here) and the encoder.
# NOTE: rebinds `ds`, which previously held the species dataset.
ds = PlantMosaicDataset(df=df, label_encoder=le)

In [None]:
# Rows per split. value_counts() reports every learn_tag that is present, so no split is
# counted twice or left out and no split name has to be hard-coded.
df["learn_tag"].value_counts()

In [None]:
# Length of the dataset's own `df` attribute, for comparison with the split sizes and len(ds).
len(ds.df)

In [None]:
# Number of items the mosaic dataset exposes.
len(ds)

In [None]:
# First item of the mosaic dataset, drawn with the first tensor axis moved last for imshow.
data = ds[0]
plt.imshow(data['image'].permute(1, 2, 0))

In [None]:
# Encoded label and the sum over its entries
# (the number of active classes if it is a 0/1 vector — TODO confirm).
data["encoded_label"], sum(data["encoded_label"])

In [None]:
# The "species" entry of the mosaic sample fetched above.
data["species"]

In [None]:
# (rows, columns) of the metadata table currently bound to `df`.
df.shape

### Setup Training Data

In [None]:
# NOTE: rebinds `df`, which until here held the PlantCLEFTrainLQ.csv table; every cell
# below works on this single-plant training table instead.
df = pd.read_csv("/home/ubuntu/FGVC11/data/PlantClef/PlantCLEF2024singleplanttrainingdata.csv",  delimiter=";", escapechar="/")

In [None]:
# Build each image path as <images_max_side_800 root>/<species_id>/<image_name>.
# This adds (or overwrites) the "path" column on `df` in place.
df["path"] = "/home/ubuntu/FGVC11/data/PlantClef/images_train/images_max_side_800/" + df.species_id.astype(str) + "/" + df.image_name

In [None]:
# df["path"] = "/home/ubuntu/FGVC11/data/PlantClef/images_train/PlantCLEF2024/" + df.learn_tag + "/" + df.species_id.astype(str) + "/" + df.image_name
# df.to_csv("/home/ubuntu/FGVC11/data/PlantClef/PlantCLEFTrainHQ.csv", sep=';', index=False, quoting=csv.QUOTE_NONE, escapechar='/')

# df["path"] = "/home/ubuntu/FGVC11/data/PlantClef/images_train/images_max_side_800/" + df.species_id.astype(str) + "/" + df.image_name
# df.to_csv("/home/ubuntu/FGVC11/data/PlantClef/PlantCLEFTrainLQ.csv", sep=';', index=False, quoting=csv.QUOTE_NONE, escapechar='/')

### One Hot Encoder

In [None]:
# build a one hot encoder
# Species ids reshaped to a single column, the 2-D layout passed to fit_transform below.
y = df['species_id'].values.reshape(-1, 1)
# NOTE: despite the name `le`, this is a OneHotEncoder, not a LabelEncoder.
le = OneHotEncoder()
y_trf = le.fit_transform(y)

In [None]:
# check the encoder
# Round trip for row i: the encoded row, made dense, should decode back to y[i].
i = 10
print(y_trf[i].toarray())
print(y[i], le.inverse_transform(y_trf[i].toarray()))

In [None]:
# save the decoder
# Persist the fitted encoder; the other cells in this notebook read it back from this same path.
with open("/home/ubuntu/FGVC11/data/PlantClef/le.pkl", "wb") as f: 
    pickle.dump(le, f)

In [None]:
# Flat (1-D) array of species ids; it is reshaped to a column only when handed to the encoder.
y = df['species_id'].values
# load and check the decoder
with open("/home/ubuntu/FGVC11/data/PlantClef/le.pkl", "rb") as f: 
    le = pickle.load(f)
i = 10000
y_trf = le.transform(y.reshape(-1, 1))
# Row i decoded back through the reloaded encoder should match y[i].
print(y[i], le.inverse_transform(y_trf[i].toarray()))

### Test Dataset Class

In [None]:
# Scratch arithmetic; the meaning of the three factors is not stated anywhere in this
# notebook — TODO document or remove.
64*4*169

In [None]:
# Row count of the training table.
len(df)

In [None]:
# Load the data YAML config, let Hydra instantiate the object it describes, then run its setup().
cfg = omegaconf.OmegaConf.load("/home/ubuntu/FGVC11/configs/data/plant_clef_data.yaml")
datamodule = hydra.utils.instantiate(cfg)
datamodule.setup()

In [None]:
# Size of the data module's training set next to the row count of the raw table.
len(datamodule.data_train), len(df)

In [None]:
# Evaluates as (len // 256) * 50. Presumably full batches per epoch at batch size 256,
# times 50 epochs — TODO confirm what 256 and 50 stand for.
len(datamodule.data_train)//256 * 50

In [None]:
# Grab the first training batch for inspection. next() raises StopIteration if the loader
# is empty, instead of silently leaving `batch` undefined (or stale from an earlier run).
batch = next(iter(datamodule.train_dataloader()))

In [None]:
# First image of the batch, with its first axis moved last for imshow.
plt.imshow(batch["image"][0].permute(1, 2, 0))

In [None]:
# Sum over the entries of the first encoded label in the batch.
sum(batch["encoded_label"][0])

In [None]:
# load and check the decoder
with open("/home/ubuntu/FGVC11/data/PlantClef/le.pkl", "rb") as f: 
    le = pickle.load(f)
i = 10
# Decode the whole batch of encoded labels, then compare entry i with the label stored in the batch.
print(batch["label"][i], le.inverse_transform(batch["encoded_label"])[i])

### Test model Class

In [None]:
# Load the model YAML config and let Hydra instantiate the object it describes.
# NOTE: rebinds `cfg`, which previously held the data config.
cfg = omegaconf.OmegaConf.load("/home/ubuntu/FGVC11/configs/model/plant_clef_model.yaml")
model = hydra.utils.instantiate(cfg)

In [None]:
from torchsummary import summary

### Run Submission

In [None]:
# Assuming `model` is a Lightning module: load_from_checkpoint is a classmethod that builds
# and returns a NEW module from the checkpoint; it does not change the instance it is called
# through. The result therefore has to be bound, otherwise inference below runs on the
# freshly initialised weights. (Relies on the checkpoint carrying the hyperparameters
# needed to rebuild the module — TODO confirm for this run.)
model = type(model).load_from_checkpoint("/home/ubuntu/FGVC11/logs/train/runs/test_edgenext_small_cross_entropy_specie_based_dataset_10/checkpoints/epoch_198.ckpt")

In [None]:
# One row per test image: its path, the plot id derived from the file name, and an
# (initially empty) column for the predicted species ids.
test_image_paths = glob("/home/ubuntu/FGVC11/data/PlantClef/images/*.jpg")
submission_df = pd.DataFrame(columns=["path", "plot_id", "species_ids"])
submission_df["path"] = test_image_paths
# plot_id = file name without directories, cut at its first '.'.
submission_df["plot_id"] = submission_df["path"].apply(
    lambda p: p.rsplit("/", 1)[-1].partition(".")[0]
)

In [None]:
# Inference dataset over the submission images, using the data module's test transform and
# encoder. Per the return_* flags, only images are requested (no labels, no metadata).
test_ds = PlantCLEFDataset(
    df=submission_df,
    transform=datamodule.test_transform,
    label_encoder=datamodule.le,
    return_image=True,
    return_labels=False,
    return_metadata=False, 
    )

In [None]:
# First test image after the test transform, with its first axis moved last for imshow.
plt.imshow(test_ds[0]["image"].permute(1, 2, 0))

In [None]:
# Loader for the submission images. shuffle=False keeps batches in submission_df row order,
# which the later `submission_df["species_ids"] = test_labels` assignment relies on.
test_dl = torch.utils.data.DataLoader(
    dataset=test_ds,
    batch_size=32,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
    persistent_workers=True,
)

In [None]:
# Move the model to the GPU and switch it to evaluation mode; the trailing ';' suppresses
# the cell output. The inference cells below also hard-code "cuda".
model.to("cuda");
model.eval();

In [None]:
# Pull a single batch from the data module's test loader: the loop exits right after
# binding `batch`, so nothing else is iterated.
for batch in tqdm(datamodule.test_dataloader()):
    break

In [None]:
# Shape of the encoded labels in that batch.
batch["encoded_label"].shape

In [None]:
# Collect targets and sigmoid scores from the data module's test loader.
# NOTE: the `break` at the end stops after the first batch, so both lists end up holding
# exactly one batch.
cats = datamodule.le.categories_[0]
test_targets = []
test_preds = []
for batch in tqdm(datamodule.test_dataloader()):
    image = batch["image"].to("cuda")
    test_targets.append(batch["encoded_label"].cpu().numpy())
    # Gradients are not needed for inference.
    with torch.no_grad():
        output = torch.sigmoid(model(image))
    test_preds.append(output.cpu().numpy())
    break

In [None]:
# Shape of the scores from the last forward pass.
output.shape

In [None]:
# test_targets = np.concatenate(test_targets, axis=0)
# test_preds = np.concatenate(test_preds, axis=0)

In [None]:
# test_preds.shape, test_targets.shape

In [None]:
# from sklearn.metrics import f1_score

In [None]:
# f1_score(test_targets, test_preds > 0.5, average="samples")

In [None]:
# f1_score(test_targets, test_preds, average="samples")

In [None]:
# thresh = []
# f1 = []
# for i in range(test_targets.shape[0]):
#     tgt = test_targets[:, i]
#     prd = test_preds[:, i]
    
    # get the best threshold between 0 and 1 based on the f1 score
    # f1 = []
    # for t in np.linspace(0, 1, 11):
    #     f1.append(f1_score(tgt, prd > t))
    # # print(np.linspace(0, 1, 10)[np.argmax(f1)], f1[np.argmax(f1)])
    # thresh.append(np.linspace(0, 1, 10)[np.argmax(f1)])
    # break

In [None]:
# thresh = [i if i > 0.6 else 0.6 for i in thresh]


In [None]:
# thresh = np.array(thresh)

In [None]:
# plt.hist(thresh)

In [None]:
# Predict species for every submission image and format each prediction as a list string.
cats = datamodule.le.categories_[0]
thresh = 0.65  # minimum sigmoid score for a class to be reported
k = 20         # at most this many classes per image
test_labels = []
with torch.no_grad():
    for batch in tqdm(test_dl):
        output = torch.sigmoid(model(batch["image"].to("cuda")))
        scores = output.cpu().numpy()
        # Zero everything under the threshold so it can be filtered out after the top-k cut.
        scores[scores < thresh] = 0
        for row in scores:
            top_k = np.argsort(row)[-k:]   # indices of the k largest scores, ascending
            kept = top_k[row[top_k] > 0]   # drop the entries that were zeroed above
            test_labels.append(str(list(cats[kept].astype(object))))

In [None]:
# Spot-check the first few predictions instead of dumping one entry per test image.
test_labels[:10]

In [None]:
# Attach the predictions (one string per row, in loader order) and write the two submission
# columns to my_run.csv: ';'-separated, no index, no quoting.
submission_df["species_ids"] = test_labels
submission_df[["plot_id", "species_ids"]].to_csv("my_run.csv", sep=';', index=False, quoting=csv.QUOTE_NONE)

In [None]:
# Toy multilabel batch: 2 samples x 4 classes of 0/1 targets (`y`) and scores (`y_`).
# NOTE: `y` previously held the species-id array; it is rebound here.
y = torch.tensor([[0, 1, 0, 1], [1, 0, 0, 0]], dtype=torch.float32)
y_ = torch.tensor(
    [[0.1, 0.9, 0.9, 0.1], [0.1, 0.9, 0.9, 0.1]],
    dtype=torch.float32,
)
y.shape, y_.shape

In [None]:
# Index of the largest entry in each row of the targets.
torch.argmax(y, dim=1)

In [None]:
def expected_positive_regularizer(preds, expected_num_pos, norm='2'):
    """Penalize the gap between the mean predicted positives per sample and a target count.

    The scores are summed over dim 1, the sums are averaged over dim 0, and the
    distance of that mean from ``expected_num_pos`` is returned.

    Args:
        preds: Tensor of scores, assumed to lie in [0, 1].
        expected_num_pos: Target number of positives per sample.
        norm: ``'1'`` (or ``1``) for the absolute gap, ``'2'`` (or ``2``) for the
            squared gap.

    Returns:
        The penalty as a tensor.

    Raises:
        NotImplementedError: If ``norm`` is neither 1 nor 2.
    """
    # Lets callers pass the ints 1/2 as well as the original string forms '1'/'2'.
    norm_key = str(norm)
    if norm_key not in ('1', '2'):
        raise NotImplementedError(f"unsupported norm {norm!r}; expected '1' or '2'")

    gap = preds.sum(1).mean(0) - expected_num_pos
    if norm_key == '1':
        return torch.abs(gap)
    return gap ** 2

In [None]:
# With the toy scores above every row sums to about 2, so with a target of 1 and the
# default norm '2' this is about (2 - 1)**2 = 1.
expected_positive_regularizer(y_, 1)

In [None]:
# Per-sample sum of the scores (the quantity the regularizer averages).
y_.sum(1)

In [None]:
# NOTE(review): SmoothBCELoss is not imported or defined anywhere in the visible notebook,
# so this raises NameError on a fresh kernel — TODO add the import.
loss = SmoothBCELoss(num_classes=4)

In [None]:
# Evaluate the loss on the toy batch, passing the scores first and the targets second.
# NOTE(review): if SmoothBCELoss is a torch.nn.Module, calling loss(y_, y) is the usual
# form rather than .forward() — confirm.
loss.forward(y_, y)

In [None]:
from torchmetrics.classification import MultilabelF1Score

In [None]:
# Macro-averaged multilabel F1 over 4 labels at threshold 0.5, in "samplewise" and "global" variants.
# NOTE(review): torchmetrics documents multidim_average="samplewise" for inputs with extra
# dimensions beyond (N, C); confirm it accepts the 2-D toy tensors used in this notebook.
macro_per_sample = MultilabelF1Score(num_labels=4, threshold=0.5, average="macro", multidim_average="samplewise")
macro_per_class = MultilabelF1Score(num_labels=4, threshold=0.5, average="macro", multidim_average="global")

In [None]:
# Micro-averaged multilabel F1 over 4 labels at threshold 0.5, in "samplewise" and "global" variants.
# NOTE(review): torchmetrics documents multidim_average="samplewise" for inputs with extra
# dimensions beyond (N, C); confirm it accepts the 2-D toy tensors used in this notebook.
micro_per_sample = MultilabelF1Score(num_labels=4, threshold=0.5, average="micro", multidim_average="samplewise")
micro_per_class = MultilabelF1Score(num_labels=4, threshold=0.5, average="micro", multidim_average="global")

In [None]:
# Macro-averaged multilabel F1 with multidim_average left at the library default.
m = MultilabelF1Score(num_labels=4, threshold=0.5, average="macro")

In [None]:
# Macro F1 on the toy batch (no trailing comma, so the value is shown directly
# rather than wrapped in a 1-tuple).
m(y_, y)

In [None]:
# First 100 rows of the training table.
df[:100]

In [None]:
# Micro F1 (global) on the toy batch (no trailing comma, so the value is shown directly
# rather than wrapped in a 1-tuple).
micro_per_class(y_, y)

In [None]:
# Micro F1 (samplewise) on the toy batch (no trailing comma, so the value is shown directly
# rather than wrapped in a 1-tuple).
# NOTE(review): y_ and y are 2-D here; torchmetrics documents "samplewise" for inputs with
# extra dimensions beyond (N, C), so this call may raise — TODO confirm.
micro_per_sample(y_, y)