In [None]:
%env CUDA_VISIBLE_DEVICES=1

In [1]:
import torch

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# Hyperparameters

In [4]:
LR = 5e-5
EPOCHS = 3
BATCH_SIZE = 16
SUFFIX = '-resnet50'

# Load data

In [129]:
! du -h /sise/mickyfi-group/kapond/output/images/

384K	/sise/mickyfi-group/kapond/output/images/.ipynb_checkpoints
9.8G	/sise/mickyfi-group/kapond/output/images/


In [5]:
from era_data import TabletEraDataset, get_IDS
from collections import Counter
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt
import pandas as pd

In [6]:
IDS = get_IDS(era=True)
len(IDS)

97012

In [7]:
VERSION_NAME = f'era_clf_bs{BATCH_SIZE}_lr{LR}_{EPOCHS}epochs{SUFFIX}-{len(IDS)}_samples'
VERSION_NAME

'era_clf_bs16_lr5e-05_3epochs-resnet50-97012_samples'

In [8]:
train_ids, test_ids = train_test_split(IDS, test_size=500, random_state=0)
len(train_ids), len(test_ids)

(96512, 500)

In [9]:
! pwd

/sise/mickyfi-group/kapond


In [10]:
ds_train = TabletEraDataset(IDS=train_ids)
ds_test = TabletEraDataset(IDS=test_ids)

Filtering 103384 IDS down to provided 96512...
Filtering 103384 IDS down to provided 500...


In [11]:
dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
dl_test = DataLoader(ds_test, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

In [12]:
# save model IDs so we can keep track of what data it was trained on
pd.Series(train_ids).to_csv(f'output/clf_ids/era-train-{VERSION_NAME}.csv', index=False, header=None)
pd.Series(test_ids).to_csv(f'output/clf_ids/era-test-{VERSION_NAME}.csv', index=False, header=None)

# Create Model

In [13]:
from era_model import EraClassifier
from torchinfo import summary

In [14]:
model = EraClassifier(LR=LR)

  f"The parameter '{pretrained_param}' is deprecated since 0.13 and may be removed in the future, "


In [15]:
summary(model, input_size=(BATCH_SIZE, 512, 512))

Layer (type:depth-idx)                        Output Shape              Param #
EraClassifier                                 [16, 3]                   --
├─Conv2d: 1-1                                 [16, 3, 512, 512]         6
├─ResNet: 1-2                                 [16, 3]                   --
│    └─Conv2d: 2-1                            [16, 64, 256, 256]        9,408
│    └─BatchNorm2d: 2-2                       [16, 64, 256, 256]        128
│    └─ReLU: 2-3                              [16, 64, 256, 256]        --
│    └─MaxPool2d: 2-4                         [16, 64, 128, 128]        --
│    └─Sequential: 2-5                        [16, 256, 128, 128]       --
│    │    └─Bottleneck: 3-1                   [16, 256, 128, 128]       75,008
│    │    └─Bottleneck: 3-2                   [16, 256, 128, 128]       70,400
│    │    └─Bottleneck: 3-3                   [16, 256, 128, 128]       70,400
│    └─Sequential: 2-6                        [16, 512, 64, 64]         --
│    

# Train Model

In [16]:
# pip install lightning[extra]

In [17]:
# import warnings
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor

In [18]:
# warnings.filterwarnings(
#     "ignore", ".*Trying to infer the `batch_size` from an ambiguous collection.*"
# )

In [19]:
lr_monitor = LearningRateMonitor(logging_interval='step')

In [20]:
logger = pl.loggers.TensorBoardLogger(
    save_dir='.',
    name='lightning_logs',
    version=VERSION_NAME
)

In [21]:
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    accelerator='gpu',
    devices=1,
    auto_select_gpus=True, # use whichever GPU(s) is free
#     profiler='simple',
#     log_every_n_steps=100, # only log metrics for every n'th batch (default: 50)
    val_check_interval=0.2,
    # ^ val_check_interval: if float, % of epoch; if int, # of batches per validation
    
    callbacks=[lr_monitor],
    #logger=logger
    
    # dir name (default: lightning_logs)
    
    # DEBUGGING:
#     fast_dev_run=5, # only use for debugging
#     limit_val_batches=4,
#     limit_train_batches=1000,
)

  "The Trainer argument `auto_select_gpus` has been deprecated in v1.9.0 and will be removed in v2.0.0."
Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
print('Logs to:', VERSION_NAME)

Logs to: era_clf_bs16_lr5e-05_3epochs-resnet50-97012_samples


In [23]:
trainer.fit(model, dl_train, dl_test)

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type               | Params
------------------------------------------------------
0 | gray_to_triple | Conv2d             | 6     
1 | core           | ResNet             | 23.5 M
2 | objective      | CrossEntropyLoss   | 0     
3 | train_acc      | MulticlassAccuracy | 0     
4 | val_acc        | MulticlassAccuracy | 0     
------------------------------------------------------
23.5 M    Trainable params
0         Non-trainable params
23.5 M    Total params
94.057    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.
