In [1]:
#%env CUDA_VISIBLE_DEVICES=0

In [2]:
from datetime import datetime

import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import Callback, LearningRateMonitor
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import torch
from torch.utils.data import DataLoader
from torchinfo import summary
from sklearn.model_selection import train_test_split

from era_data import TabletPeriodDataset, get_IDS
from era_model import EraClassifier  # also used for periods

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# Hyperparameters

In [4]:
LR = 5e-5
EPOCHS = 15
BATCH_SIZE = 16
SUFFIX = '-resnet50'
DATE = datetime.now().strftime("%B%d")
RUN_NAME_SUFFIX = '_preprocessed-masked' # ''
IMG_DIR = 'output/images_preprocessed'
IDS = get_IDS(IMG_DIR=IMG_DIR)
print(len(IDS))
VERSION_NAME = f'period_clf_bs{BATCH_SIZE}_lr{LR}_{EPOCHS}epochs{SUFFIX}-{len(IDS)}_samples{RUN_NAME_SUFFIX}_{DATE}-1000_test_val'
print(VERSION_NAME)

94936
period_clf_bs16_lr5e-05_15epochs-resnet50-94936_samples_preprocessed-masked_March29-1000_test_val


# Load data

In [5]:
#! du -h {IMG_DIR}

In [6]:
train_ids, test_ids = train_test_split(IDS, test_size=1000, random_state=0)
len(train_ids), len(test_ids)

(93936, 1000)

In [7]:
train_ids, val_ids = train_test_split(train_ids, test_size=1000, random_state=0)
len(train_ids), len(val_ids)

(92936, 1000)

In [8]:
ds_train = TabletPeriodDataset(IDS=train_ids, IMG_DIR=IMG_DIR, mask=True)
ds_val = TabletPeriodDataset(IDS=val_ids, IMG_DIR=IMG_DIR, mask=True)
ds_test = TabletPeriodDataset(IDS=test_ids, IMG_DIR=IMG_DIR, mask=True)

Filtering 94936 IDS down to provided 92936...
Filtering 94936 IDS down to provided 1000...
Filtering 94936 IDS down to provided 1000...


In [9]:
def collate_fn(batch):
    data = torch.stack([torch.from_numpy(sample[1]) for sample in batch])
    labels = torch.tensor([sample[2] for sample in batch])

    return data, labels

In [10]:
dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True, num_workers=4)
dl_val = DataLoader(ds_val, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=False, num_workers=4)
dl_test = DataLoader(ds_test, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=False, num_workers=4)

In [11]:
# save model IDs so we can keep track of what data it was trained on
pd.Series(train_ids).to_csv(f'output/clf_ids/period-train-{VERSION_NAME}.csv', index=False, header=None)
pd.Series(val_ids).to_csv(f'output/clf_ids/period-val-{VERSION_NAME}.csv', index=False, header=None)
pd.Series(test_ids).to_csv(f'output/clf_ids/period-test-{VERSION_NAME}.csv', index=False, header=None)

# Create Model

In [12]:
num_classes = len(TabletPeriodDataset.PERIOD_INDICES) + 2
num_classes

24

In [13]:
model = EraClassifier(LR=LR, num_classes=num_classes)



In [14]:
summary(model, input_size=(BATCH_SIZE, 512, 512))

Layer (type:depth-idx)                        Output Shape              Param #
EraClassifier                                 [16, 24]                  --
├─Conv2d: 1-1                                 [16, 3, 512, 512]         6
├─ResNet: 1-2                                 [16, 24]                  --
│    └─Conv2d: 2-1                            [16, 64, 256, 256]        9,408
│    └─BatchNorm2d: 2-2                       [16, 64, 256, 256]        128
│    └─ReLU: 2-3                              [16, 64, 256, 256]        --
│    └─MaxPool2d: 2-4                         [16, 64, 128, 128]        --
│    └─Sequential: 2-5                        [16, 256, 128, 128]       --
│    │    └─Bottleneck: 3-1                   [16, 256, 128, 128]       75,008
│    │    └─Bottleneck: 3-2                   [16, 256, 128, 128]       70,400
│    │    └─Bottleneck: 3-3                   [16, 256, 128, 128]       70,400
│    └─Sequential: 2-6                        [16, 512, 64, 64]         --
│    

# Train Model

In [15]:
lr_monitor = LearningRateMonitor(logging_interval='step')

In [16]:
logger = pl.loggers.TensorBoardLogger(
    save_dir='.',
    name='lightning_logs',
    version=VERSION_NAME
)

In [17]:
early_stop_callback = EarlyStopping(
    monitor='val_loss',  
    min_delta=0.00001,      
    patience=3,          
    verbose=10,       
    mode='min',
    check_on_train_epoch_end=True
)

In [18]:
class PrintMetricsCallback(Callback):
    def on_validation_epoch_end(self, trainer, pl_module):
        metrics = trainer.callback_metrics
        train_loss = metrics.get('train_loss', torch.tensor(0.0)).item()
        val_loss = metrics.get('val_loss', torch.tensor(0.0)).item()
        train_acc = metrics.get('train_acc', torch.tensor(0.0)).item()
        val_acc = metrics.get('val_acc', torch.tensor(0.0)).item()

        print(f"\nEpoch {trainer.current_epoch} Metrics:")
        print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
              f"Train Acc: {train_acc * 100:.2f}%, Val Acc: {val_acc * 100:.2f}%")

In [19]:
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    accelerator='gpu',
    devices='auto',
    val_check_interval=0.2,
    callbacks=[lr_monitor, early_stop_callback, PrintMetricsCallback()],
    logger=logger
)    

  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [20]:
print('Logs to:', VERSION_NAME)

Logs to: period_clf_bs16_lr5e-05_15epochs-resnet50-94936_samples_preprocessed-masked_March29-1000_test_val


In [21]:
trainer.fit(model, dl_train, dl_val)

  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type               | Params
------------------------------------------------------
0 | gray_to_triple | Conv2d             | 6     
1 | core           | ResNet             | 23.6 M
2 | objective      | CrossEntropyLoss   | 0     
3 | train_acc      | MulticlassAccuracy | 0     
4 | val_acc        | MulticlassAccuracy | 0     
------------------------------------------------------
23.6 M    Trainable params
0         Non-trainable params
23.6 M    Total params
94.229    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]


Epoch 0 Metrics:
Train Loss: 0.0000, Val Loss: 3.2530, Train Acc: 0.00%, Val Acc: 0.00%


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]


Epoch 0 Metrics:
Train Loss: 0.0000, Val Loss: 1.0626, Train Acc: 0.00%, Val Acc: 42.53%


Validation: 0it [00:00, ?it/s]


Epoch 0 Metrics:
Train Loss: 0.0000, Val Loss: 0.9562, Train Acc: 0.00%, Val Acc: 46.27%


Validation: 0it [00:00, ?it/s]


Epoch 0 Metrics:
Train Loss: 0.0000, Val Loss: 0.9596, Train Acc: 0.00%, Val Acc: 46.24%


Validation: 0it [00:00, ?it/s]


Epoch 0 Metrics:
Train Loss: 0.0000, Val Loss: 0.9207, Train Acc: 0.00%, Val Acc: 49.99%


Validation: 0it [00:00, ?it/s]


Epoch 0 Metrics:
Train Loss: 0.0000, Val Loss: 0.8845, Train Acc: 0.00%, Val Acc: 50.04%


Metric val_loss improved. New best score: 0.885


Validation: 0it [00:00, ?it/s]


Epoch 1 Metrics:
Train Loss: 1.0464, Val Loss: 0.9060, Train Acc: 50.25%, Val Acc: 47.51%


Validation: 0it [00:00, ?it/s]


Epoch 1 Metrics:
Train Loss: 1.0464, Val Loss: 0.8821, Train Acc: 50.25%, Val Acc: 49.23%


Validation: 0it [00:00, ?it/s]


Epoch 1 Metrics:
Train Loss: 1.0464, Val Loss: 0.8344, Train Acc: 50.25%, Val Acc: 51.58%


Validation: 0it [00:00, ?it/s]


Epoch 1 Metrics:
Train Loss: 1.0464, Val Loss: 0.8132, Train Acc: 50.25%, Val Acc: 54.68%


Validation: 0it [00:00, ?it/s]


Epoch 1 Metrics:
Train Loss: 1.0464, Val Loss: 0.7966, Train Acc: 50.25%, Val Acc: 55.29%


Metric val_loss improved by 0.088 >= min_delta = 1e-05. New best score: 0.797


Validation: 0it [00:00, ?it/s]


Epoch 2 Metrics:
Train Loss: 0.8342, Val Loss: 0.8607, Train Acc: 57.58%, Val Acc: 49.66%


Validation: 0it [00:00, ?it/s]


Epoch 2 Metrics:
Train Loss: 0.8342, Val Loss: 0.8303, Train Acc: 57.58%, Val Acc: 52.43%


Validation: 0it [00:00, ?it/s]


Epoch 2 Metrics:
Train Loss: 0.8342, Val Loss: 0.8675, Train Acc: 57.58%, Val Acc: 47.18%


Validation: 0it [00:00, ?it/s]


Epoch 2 Metrics:
Train Loss: 0.8342, Val Loss: 0.8278, Train Acc: 57.58%, Val Acc: 52.06%


Validation: 0it [00:00, ?it/s]


Epoch 2 Metrics:
Train Loss: 0.8342, Val Loss: 0.8063, Train Acc: 57.58%, Val Acc: 51.32%


Validation: 0it [00:00, ?it/s]


Epoch 3 Metrics:
Train Loss: 0.7286, Val Loss: 0.8253, Train Acc: 61.61%, Val Acc: 56.06%


Validation: 0it [00:00, ?it/s]


Epoch 3 Metrics:
Train Loss: 0.7286, Val Loss: 0.8741, Train Acc: 61.61%, Val Acc: 55.46%


Validation: 0it [00:00, ?it/s]


Epoch 3 Metrics:
Train Loss: 0.7286, Val Loss: 0.8150, Train Acc: 61.61%, Val Acc: 57.45%


Validation: 0it [00:00, ?it/s]


Epoch 3 Metrics:
Train Loss: 0.7286, Val Loss: 0.7728, Train Acc: 61.61%, Val Acc: 51.75%


Validation: 0it [00:00, ?it/s]


Epoch 3 Metrics:
Train Loss: 0.7286, Val Loss: 0.8699, Train Acc: 61.61%, Val Acc: 46.62%


Validation: 0it [00:00, ?it/s]


Epoch 4 Metrics:
Train Loss: 0.6310, Val Loss: 0.8679, Train Acc: 65.51%, Val Acc: 49.75%


Validation: 0it [00:00, ?it/s]


Epoch 4 Metrics:
Train Loss: 0.6310, Val Loss: 0.8088, Train Acc: 65.51%, Val Acc: 51.48%


Validation: 0it [00:00, ?it/s]


Epoch 4 Metrics:
Train Loss: 0.6310, Val Loss: 0.8355, Train Acc: 65.51%, Val Acc: 52.22%


Validation: 0it [00:00, ?it/s]


Epoch 4 Metrics:
Train Loss: 0.6310, Val Loss: 0.8237, Train Acc: 65.51%, Val Acc: 52.88%


Validation: 0it [00:00, ?it/s]


Epoch 4 Metrics:
Train Loss: 0.6310, Val Loss: 0.8399, Train Acc: 65.51%, Val Acc: 58.32%


Monitored metric val_loss did not improve in the last 3 records. Best score: 0.797. Signaling Trainer to stop.
