In [1]:
#%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [1]:
from datetime import datetime
from pathlib import Path

import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import LearningRateMonitor, EarlyStopping
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

from era_data import TabletPeriodDataset, get_IDS
from era_model import SimpleCNN

In [2]:
# Pick the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

# Hyperparameters

In [3]:
# Training hyperparameters and run-naming constants.
LR = 1e-5
EPOCHS = 30
BATCH_SIZE = 16
SEED = 0  # same value as the random_state used for the train/test/val splits
SUFFIX = '-vanillaCNN'
DATE = datetime.now().strftime("%B%d")
RUN_NAME_SUFFIX = '-preprocessed' # ''
IMG_DIR = 'output/images_preprocessed'

# Seed Python, NumPy and PyTorch RNGs (and DataLoader workers) so that weight
# initialisation, batch shuffling and dropout are reproducible across
# Restart & Run All.
pl.seed_everything(SEED, workers=True)

# IDs of all samples available under IMG_DIR; their count is part of the run name.
IDS = get_IDS(IMG_DIR=IMG_DIR)
print(len(IDS))

# Unique run name: used as the TensorBoard version and in the saved ID-list filenames.
VERSION_NAME = f'period_clf_bs{BATCH_SIZE}_lr{LR}_{EPOCHS}epochs{SUFFIX}-{len(IDS)}_samples{RUN_NAME_SUFFIX}-{DATE}-80-10-10_train-test-val'
VERSION_NAME

94936


'period_clf_bs16_lr1e-05_30epochs-vanillaCNN-94936_samples-preprocessed-April16-80-10-10_train-test-val'

# Load data

In [4]:
#! du -h {IMG_DIR}

In [5]:
# 80/20 split of all IDs with a fixed seed; the 20% hold-out is divided into
# test and validation sets in the following cell.
train_ids, test_ids = train_test_split(
    IDS,
    test_size=0.2,
    random_state=0,
)
(len(train_ids), len(test_ids))

(75948, 18988)

In [6]:
# Split the 20% hold-out evenly into test and validation sets (10% / 10% of all IDs).
# The hold-out is re-derived from IDS with the same seed as the train split rather
# than read from (and written back to) `test_ids`, so re-running this cell is
# idempotent: it cannot halve the test set again or move samples between test and val.
_, holdout_ids = train_test_split(IDS, test_size=.2, random_state=0)
test_ids, val_ids = train_test_split(holdout_ids, test_size=.5, random_state=0)
len(test_ids), len(val_ids)

(9494, 9494)

In [7]:
# Build one dataset per split (train, val, test — constructed in that order),
# all reading images from the same IMG_DIR.
ds_train, ds_val, ds_test = (
    TabletPeriodDataset(IDS=split_ids, IMG_DIR=IMG_DIR)
    for split_ids in (train_ids, val_ids, test_ids)
)

Filtering 94936 IDS down to provided 75948...
Filtering 94936 IDS down to provided 9494...
Filtering 94936 IDS down to provided 9494...


In [8]:
def collate_fn(batch):
    """Collate a list of dataset samples into a ``(data, labels)`` tensor pair.

    Each sample is indexed positionally: ``sample[1]`` must be a NumPy array
    and ``sample[2]`` its label; ``sample[0]`` is not used here.

    Args:
        batch: sequence of samples as produced by the dataset.

    Returns:
        data: tensor of shape ``(len(batch), 1, *array_shape)`` — every array
            gets a leading singleton axis (presumably the channel dimension
            expected by the CNN) before stacking.
        labels: 1-D tensor holding the labels in batch order.
    """
    images = []
    targets = []
    for item in batch:
        # from_numpy shares memory with the array; unsqueeze adds the leading axis.
        images.append(torch.from_numpy(item[1]).unsqueeze(0))
        targets.append(item[2])

    return torch.stack(images), torch.tensor(targets)

In [9]:
# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=4)

dl_train = DataLoader(ds_train, shuffle=True, **loader_kwargs)
dl_val = DataLoader(ds_val, shuffle=False, **loader_kwargs)
dl_test = DataLoader(ds_test, shuffle=False, **loader_kwargs)

In [10]:
# save model IDs so we can keep track of what data it was trained on
ids_dir = Path('output/clf_ids')
# to_csv does not create missing directories, so make sure the target exists
# (otherwise this cell fails on a fresh checkout).
ids_dir.mkdir(parents=True, exist_ok=True)

# One header-less, index-less CSV per split: period-<split>-<VERSION_NAME>.csv
for split_name, split_ids in (('train', train_ids), ('val', val_ids), ('test', test_ids)):
    pd.Series(split_ids).to_csv(
        ids_dir / f'period-{split_name}-{VERSION_NAME}.csv',
        index=False,
        header=False,
    )

In [11]:
# Number of output classes = number of entries in TabletPeriodDataset.PERIOD_INDICES.
num_classes = len(TabletPeriodDataset.PERIOD_INDICES)
num_classes

22

In [12]:
# Instantiate the classifier with one output per class and the configured learning rate.
model = SimpleCNN(
    num_classes=num_classes,
    learning_rate=LR,
)

In [13]:
# TensorBoard logger: this run is stored under ./lightning_logs/<VERSION_NAME>/.
logger = pl.loggers.TensorBoardLogger(
    save_dir='.',
    name='lightning_logs',
    version=VERSION_NAME
)
# Log the learning rate at every training step.
lr_monitor = LearningRateMonitor(logging_interval='step')

# Stop training once val_loss fails to improve by at least min_delta.
# NOTE(review): patience=1 stops after the first non-improving epoch, so the
# weights at the end of training come from that worse epoch — consider a
# checkpoint callback monitoring val_loss if the best weights are needed.
early_stop_callback = EarlyStopping(
    monitor='val_loss',
    min_delta=0.00001,
    patience=1,
    verbose=True,  # documented as a bool flag (was the truthy int 10)
    mode='min',
    check_on_train_epoch_end=True
)

trainer = pl.Trainer(
    max_epochs=EPOCHS,
    # Follow the device detected earlier instead of hard-coding 'gpu', so the
    # notebook also runs on machines without CUDA.
    accelerator='gpu' if device == 'cuda' else 'cpu',
    devices='auto',
    callbacks=[lr_monitor, early_stop_callback],
    logger=logger
)


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
# Show the run name that the TensorBoard logs are filed under.
print(f'Logs to: {VERSION_NAME}')

Logs to: period_clf_bs16_lr1e-05_30epochs-vanillaCNN-94936_samples-preprocessed-April16-80-10-10_train-test-val


In [15]:
# Train on the training loader, validating on the validation loader each epoch.
trainer.fit(
    model,
    train_dataloaders=dl_train,
    val_dataloaders=dl_val,
)

  rank_zero_warn(
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name    | Type        | Params
-----------------------------------------
0  | conv1   | Conv2d      | 320   
1  | bn1     | BatchNorm2d | 64    
2  | conv2   | Conv2d      | 18.5 K
3  | bn2     | BatchNorm2d | 128   
4  | conv3   | Conv2d      | 73.9 K
5  | bn3     | BatchNorm2d | 256   
6  | conv4   | Conv2d      | 295 K 
7  | bn4     | BatchNorm2d | 512   
8  | pool    | MaxPool2d   | 0     
9  | dropout | Dropout     | 0     
10 | fc1     | Linear      | 268 M 
11 | fc2     | Linear      | 22.6 K
-----------------------------------------
268 M     Trainable par

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 1.036


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.102 >= min_delta = 1e-05. New best score: 0.934


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.049 >= min_delta = 1e-05. New best score: 0.885


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.034 >= min_delta = 1e-05. New best score: 0.851


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.034 >= min_delta = 1e-05. New best score: 0.817


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.027 >= min_delta = 1e-05. New best score: 0.789


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.029 >= min_delta = 1e-05. New best score: 0.760


Validation: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 1 records. Best score: 0.760. Signaling Trainer to stop.
