In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from fungiclef.model.dataset import ImageDataset, EmbeddingDataset
from fungiclef.model.wrapper import FungiModel
from fungiclef.model.transforms import get_transforms
from fungiclef.utils import get_spark, spark_resource, read_config
import pandas as pd

from torch.utils.data import DataLoader
import lightning as L

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config = read_config(path='../fungiclef/config.json')


In [4]:
TRAIN_PQ_LOCATION = config['gs_paths']['production']['dataset_resnet']['train']
VALID_PQ_LOCATION = config['gs_paths']['production']['dataset_resnet']['val']
TEST_PQ_LOCATION = config['gs_paths']['production']['dataset_resnet']['test']

In [5]:
# Fetch data parquet
train_df = pd.read_parquet(TRAIN_PQ_LOCATION)
valid_df = pd.read_parquet(VALID_PQ_LOCATION)

In [15]:
# Load it as torch dataset
train_dataset = EmbeddingDataset(train_df)
valid_dataset = EmbeddingDataset(valid_df)

In [16]:
# Define model. Here we use a simple stupid linear layer layer
from fungiclef.model.init_models import init_embedding_classifier

N_CLASSES = len(train_df.class_id.unique()) # This should be 1605 - 1604 classes + 1 unknown class

model = init_embedding_classifier(n_classes=N_CLASSES, embedding_size=1000)

In [17]:
# Load it to dataloader
BATCH_SIZE = 32
# Adjust BATCH_SIZE and ACCUMULATION_STEPS to values that if multiplied results in 64
ACCUMULATION_STEPS = 64 // BATCH_SIZE
EPOCHS = 1
WORKERS = 4

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=3)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=3)

In [24]:
# Use our wrapper module to get a PyTorch Lightning trainer

module = FungiModel(model)
trainer = L.Trainer(accelerator="gpu", max_epochs=1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [25]:
trainer.fit(module, train_loader, valid_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type   | Params
---------------------------------
0 | model | Linear | 1.6 M 
---------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.426     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Epoch 0: 100%|██████████| 9855/9855 [00:57<00:00, 171.82it/s, v_num=12]     

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 9855/9855 [00:57<00:00, 171.59it/s, v_num=12]


In [22]:
# Evaluate model on test set
test_df = pd.read_parquet(TEST_PQ_LOCATION)
test_dataset = EmbeddingDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=3)

In [26]:
from fungiclef.evaluate.inference import generate_logits, predict_class
from fungiclef.evaluate.scoring import score_model

logits = generate_logits(model, test_loader)
predicted_class = predict_class(logits)


gt_df = test_df[['observationID', 'class_id']]
score_model(predicted_class, gt_df)


Using device: cuda


  0%|          | 0/618 [00:00<?, ?it/s]

100%|██████████| 618/618 [00:01<00:00, 397.31it/s]


Evaluated scores: {'F1 Score': 10.29, 'Track 1: Classification Error': 0.8272, 'Track 2: Cost for Poisonousness Confusion': 1.7833, 'Track 3: User-Focused Loss': 2.6104, 'Track 4: Classification Error with Special Cost for Unknown': 0.818}


[{'test_split': {'F1 Score': 10.29,
   'Track 1: Classification Error': 0.8272,
   'Track 2: Cost for Poisonousness Confusion': 1.7833,
   'Track 3: User-Focused Loss': 2.6104,
   'Track 4: Classification Error with Special Cost for Unknown': 0.818}}]