# Notebook for Training & Tracking Models using NeptuneAI


In [None]:
!pip install torch torchvision --q

In [None]:
!pip install timm wwf fastai neptune neptune-fastai --q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.5/481.5 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.4/196.4 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.9/137.9 kB[0m [31m6.8 M

In [None]:
import os

# load data set & unzip
os.environ['KAGGLE_USERNAME'] = 'felixn95'
os.environ['KAGGLE_KEY'] = 'INSERT-KEY'

!kaggle competitions download -c widsdatathon2019
!unzip -q widsdatathon2019.zip
!unzip -q train_images.zip
!unzip -q leaderboard_test_data.zip

# Define Paths
train_images = '/content/train_images'
test_images =  '/content/leaderboard_test_data'
train_labels = '/content/traininglabels.csv'
test_labels = '/content/testlabels.csv'


Downloading widsdatathon2019.zip to /content
 98% 449M/460M [00:04<00:00, 102MB/s]
100% 460M/460M [00:05<00:00, 95.9MB/s]


The image IDs in the label CSVs do not match the image file names: each ID carries a four-digit year directly before `.jpg`. We therefore strip the year from every row before the labels are used.

In [None]:
import pandas as pd
import os

def adjust_image_ids(input_csv_path, output_csv_path):
    """
    Rewrite the 'image_id' column of a label CSV without the year digits.

    Reads `input_csv_path`, drops the four digits that directly precede
    '.jpg' in every 'image_id' value, and writes the resulting table
    (without the index) to `output_csv_path`.
    """
    year_before_extension = r'\d{4}\.jpg'

    cleaned_labels = pd.read_csv(input_csv_path).assign(
        image_id=lambda frame: frame['image_id'].str.replace(
            year_before_extension, '.jpg', regex=True
        )
    )

    cleaned_labels.to_csv(output_csv_path, index=False)


# Always start from the raw label files and write to absolute paths.
# `train_labels` / `test_labels` are rebound further down, so feeding them in
# here would re-process already adjusted IDs (and strip four more digits)
# whenever this cell is executed a second time.
adjust_image_ids('/content/traininglabels.csv', '/content/traininglabels_processed.csv')
adjust_image_ids('/content/testlabels.csv', '/content/testlabels_processed.csv')

# From here on the label paths refer to the ID-adjusted files
train_labels = '/content/traininglabels_processed.csv'
test_labels = '/content/testlabels_processed.csv'

def filter_missing_images(csv_file, image_dir, target_file):
    """
    Keep only the label rows whose image file exists in `image_dir`.

    Reads `csv_file`, checks for every 'image_id' whether
    `image_dir/<image_id>` exists on disk, and writes the surviving rows
    (without the index) to `target_file`.
    """
    labels = pd.read_csv(csv_file)

    def _is_on_disk(image_id):
        # An image counts as present when its path inside image_dir exists
        return os.path.exists(os.path.join(image_dir, image_id))

    present_mask = labels['image_id'].apply(_is_on_disk)
    labels[present_mask].to_csv(target_file, index=False)

# Drop training label rows without an image, then point train_labels at the result
filter_missing_images(
    csv_file=train_labels,
    image_dir=train_images,
    target_file='/content/train_labels_filtered.csv',
)
train_labels = '/content/train_labels_filtered.csv'

# Same for the test labels
filter_missing_images(
    csv_file=test_labels,
    image_dir=test_images,
    target_file='/content/test_labels_filtered.csv',
)
test_labels = '/content/test_labels_filtered.csv'


# Training & Tracking

In [None]:
import os
import neptune
import torch
import wwf
import fastai
import timm
from wwf.vision.timm import timm_learner
from fastai.callback.all import SaveModelCallback
from fastai.vision.all import *
from neptune.integrations.fastai import NeptuneCallback
from neptune.types import File

def get_x(row, train_images):
    """Return the path of the row's image: `train_images` joined with row['image_id']."""
    image_name = row['image_id']
    return os.path.join(train_images, image_name)

def get_y(row):
    """Return the value stored under 'has_oilpalm' for the given label row."""
    label = row['has_oilpalm']
    return label

def create_dataloader(item_size, batch_size=64):
    """
    Build the fastai DataLoaders for the labelled training images.

    Args:
        item_size: Size passed to the per-item `Resize` transform.
        batch_size: Batch size passed to `dataloaders` (default 64).

    Returns:
        The DataLoaders created from the CSV at the module-level
        `train_labels` path, with image paths resolved against the
        module-level `train_images` directory.
    """
    # Bind the image directory so the DataBlock can call the getter with a row only
    image_path_getter = partial(get_x, train_images=train_images)

    # Image -> category block with a fixed 80/20 random split (seed 42)
    block = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_x=image_path_getter,
        get_y=get_y,
        splitter=RandomSplitter(valid_pct=0.2, seed=42),
        item_tfms=Resize(item_size),
        batch_tfms=aug_transforms(),
    )

    labels_frame = pd.read_csv(train_labels)
    return block.dataloaders(labels_frame, bs=batch_size)


## Helper Methods for Easier Training and Tracking

In [None]:
# Method to train and track a model with a specified model and the number of epochs
def train_model_with_neptune(model, epochs, item_size, project_name, tag):
    """
    Train one fastai vision learner and log the training to a Neptune run.

    Args:
        model: Architecture handed to `vision_learner` as `arch`.
        epochs: Number of epochs handed to `fit_one_cycle`.
        item_size: Image size forwarded to `create_dataloader`.
        project_name: Neptune project the run is created in.
        tag: Tag attached to the Neptune run; also used as the file name
            of the exported learner (`./<tag>`).

    The Neptune API token is read from the `NEPTUNE_API_TOKEN` environment
    variable instead of being stored in the notebook.
    """
    # Credentials must not live in source.
    # NOTE(review): the token that used to be embedded here has been shared
    # with the notebook and should be treated as leaked and rotated.
    api_token = os.environ.get("NEPTUNE_API_TOKEN")

    # Initialize a run
    run = neptune.init_run(project=project_name, api_token=api_token, tags=tag)

    try:
        # Neptune callback object
        neptune_cbk = NeptuneCallback(run=run)

        # Create dls
        dls = create_dataloader(item_size)

        # Create a FastAI learner
        learner = vision_learner(dls, arch=model, cbs=[neptune_cbk], metrics=accuracy)

        # Train the model
        learner.fit_one_cycle(epochs)
    finally:
        # Stop the Neptune run even when data loading, learner creation or
        # training fails, so no run is left open in the background.
        run.stop()

    # Detach the callback (it references the already stopped run) before exporting
    learner.remove_cb(neptune_cbk)
    learner.export(f"./{tag}")  # export learner

def train_multiple_models_with_neptune(models, epochs_list, item_size, project_name):
    """
    Run `train_model_with_neptune` once for every (model, epochs) combination.

    Models are iterated in the outer position and epoch counts in the inner
    one; each run gets the tag '<model>_<epochs>_epochs'
    (e.g. resnet18_3_epochs).
    """
    # Lazily enumerate the combinations in the same order as two nested loops
    combinations = ((arch, n_epochs) for arch in models for n_epochs in epochs_list)

    for arch, n_epochs in combinations:
        run_tag = f'{arch}_{n_epochs}_epochs'
        train_model_with_neptune(arch, n_epochs, item_size, project_name, run_tag)

In [None]:
# Available pretrained models from fast.ai, grouped by architecture family
model_set = {
    # ResNet
    "resnet18",
    "resnet34",
    "resnet50",
    "resnet101",
    "resnet152",
    # SqueezeNet
    "squeezenet1_0",
    "squeezenet1_1",
    # DenseNet
    "densenet121",
    "densenet161",
    "densenet169",
    "densenet201",
    # VGG (batch-norm variants)
    "vgg16_bn",
    "vgg19_bn",
    # AlexNet
    "alexnet",
}

In [None]:
# Experiment configuration
project_name = 'oilpalm-detection'
item_size = 256
models = ["resnet50"]  # example models
epochs_list = [3]  # example epochs

# One tracked training run per (model, epochs) combination
train_multiple_models_with_neptune(
    models=models,
    epochs_list=epochs_list,
    item_size=item_size,
    project_name=project_name,
)


  run = neptune.init_run(project=project_name, api_token=api_token, tags=tag)


https://app.neptune.ai/tds-ws23/oilpalm-detection/e/OIL-41


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]



epoch,train_loss,valid_loss,accuracy,time
0,0.628126,0.312072,0.894387,02:04
1,0.289648,0.222957,0.930588,02:03
2,0.258947,0.21903,0.931916,02:04


Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 33 operations to synchronize with Neptune. Do not kill this process.
All 33 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/tds-ws23/oilpalm-detection/e/OIL-41/metadata
