In [1]:
#%env CUDA_VISIBLE_DEVICES=0

In [1]:
from datetime import datetime

import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import Callback, LearningRateMonitor
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import torch
from torch.utils.data import DataLoader
from torchinfo import summary
from sklearn.model_selection import train_test_split

from era_data import TabletPeriodDataset, get_IDS
from era_model import EraClassifier  # also used for periods

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# Hyperparameters

In [3]:
LR = 1e-5
EPOCHS = 20
BATCH_SIZE = 16
SUFFIX = '-resnet50'
DATE = datetime.now().strftime("%B%d")
RUN_NAME_SUFFIX = '_preprocessed-masked' # ''
IMG_DIR = 'output/images_preprocessed'
IDS = get_IDS(IMG_DIR=IMG_DIR)
print(len(IDS))
VERSION_NAME = f'period_clf_bs{BATCH_SIZE}_lr{LR}_{EPOCHS}epochs{SUFFIX}-{len(IDS)}_samples{RUN_NAME_SUFFIX}_{DATE}-80-10-10_train_test_val-2'
print(VERSION_NAME)

94936
period_clf_bs16_lr1e-05_20epochs-resnet50-94936_samples_preprocessed-masked_April16-80-10-10_train_test_val-2


# Load data

In [4]:
#! du -h {IMG_DIR}

In [5]:
train_ids, test_ids = train_test_split(IDS, test_size=.2, random_state=0)
len(train_ids), len(test_ids)

(75948, 18988)

In [6]:
test_ids, val_ids = train_test_split(test_ids, test_size=.5, random_state=0)
len(test_ids), len(val_ids)

(9494, 9494)

In [7]:
ds_train = TabletPeriodDataset(IDS=train_ids, IMG_DIR=IMG_DIR, mask=True)
ds_val = TabletPeriodDataset(IDS=val_ids, IMG_DIR=IMG_DIR, mask=True)
ds_test = TabletPeriodDataset(IDS=test_ids, IMG_DIR=IMG_DIR, mask=True)

Filtering 94936 IDS down to provided 75948...
Filtering 94936 IDS down to provided 9494...
Filtering 94936 IDS down to provided 9494...


In [8]:
def collate_fn(batch):
    data = torch.stack([torch.from_numpy(sample[1]) for sample in batch])
    labels = torch.tensor([sample[2] for sample in batch])

    return data, labels

In [9]:
dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True, num_workers=4)
dl_val = DataLoader(ds_val, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=False, num_workers=4)
dl_test = DataLoader(ds_test, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=False, num_workers=4)

In [10]:
# save model IDs so we can keep track of what data it was trained on
pd.Series(train_ids).to_csv(f'output/clf_ids/period-train-{VERSION_NAME}.csv', index=False, header=None)
pd.Series(val_ids).to_csv(f'output/clf_ids/period-val-{VERSION_NAME}.csv', index=False, header=None)
pd.Series(test_ids).to_csv(f'output/clf_ids/period-test-{VERSION_NAME}.csv', index=False, header=None)

# Create Model

In [11]:
num_classes = len(TabletPeriodDataset.PERIOD_INDICES) + 2
num_classes

24

In [12]:
model = EraClassifier(LR=LR, num_classes=num_classes)



In [13]:
summary(model, input_size=(BATCH_SIZE, 512, 512))

Layer (type:depth-idx)                        Output Shape              Param #
EraClassifier                                 [16, 24]                  --
├─Conv2d: 1-1                                 [16, 3, 512, 512]         6
├─ResNet: 1-2                                 [16, 24]                  --
│    └─Conv2d: 2-1                            [16, 64, 256, 256]        9,408
│    └─BatchNorm2d: 2-2                       [16, 64, 256, 256]        128
│    └─ReLU: 2-3                              [16, 64, 256, 256]        --
│    └─MaxPool2d: 2-4                         [16, 64, 128, 128]        --
│    └─Sequential: 2-5                        [16, 256, 128, 128]       --
│    │    └─Bottleneck: 3-1                   [16, 256, 128, 128]       75,008
│    │    └─Bottleneck: 3-2                   [16, 256, 128, 128]       70,400
│    │    └─Bottleneck: 3-3                   [16, 256, 128, 128]       70,400
│    └─Sequential: 2-6                        [16, 512, 64, 64]         --
│    

# Train Model

In [14]:
lr_monitor = LearningRateMonitor(logging_interval='step')

In [15]:
logger = pl.loggers.TensorBoardLogger(
    save_dir='.',
    name='lightning_logs',
    version=VERSION_NAME
)

In [16]:
early_stop_callback = EarlyStopping(
    monitor='val_loss',  
    min_delta=0.00001,      
    patience=1,          
    verbose=10,       
    mode='min',
    check_on_train_epoch_end=True
)

In [17]:
class PrintMetricsCallback(Callback):
    def on_validation_epoch_end(self, trainer, pl_module):
        metrics = trainer.callback_metrics
        train_loss = metrics.get('train_loss', torch.tensor(0.0)).item()
        val_loss = metrics.get('val_loss', torch.tensor(0.0)).item()
        train_acc = metrics.get('train_acc', torch.tensor(0.0)).item()
        val_acc = metrics.get('val_acc', torch.tensor(0.0)).item()

        print(f"\nEpoch {trainer.current_epoch} Metrics:")
        print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
              f"Train Acc: {train_acc * 100:.2f}%, Val Acc: {val_acc * 100:.2f}%")

In [12]:
# def evaluate_model(model, val_loader):
#     model.to(device)
#     model.eval() 
#     true_labels = []
#     predictions = []
    
#     with torch.no_grad():
#         for data, targets in val_loader:
#             data, targets = data.to(device), targets.to(device)
#             outputs = model(data)
#             _, predicted = torch.max(outputs.data, 1)
#             true_labels.extend(targets.cpu().numpy())
#             predictions.extend(predicted.cpu().numpy())
            
#     return true_labels, predictions

# def save_fold_report(report, filepath=f'output/{VERSION_NAME}_fold_reports.json'):
#     try:
#         with open(filepath, 'r+') as file:
#             data = json.load(file)
#             data.append(report)
#             file.seek(0)
#             json.dump(data, file, indent=4)
#     except FileNotFoundError:
#         with open(filepath, 'w') as file:
#             json.dump([report], file, indent=4)

In [13]:
# from sklearn.model_selection import KFold
# from tqdm.notebook import tqdm
# from torch.utils.data import DataLoader, SubsetRandomSampler
# import json

# kf = KFold(n_splits=10, shuffle=True, random_state=42)  # For reproducibility
# fold_performance = []

# for fold, (train_idx, val_idx) in enumerate(tqdm(kf.split(train_ids), total=kf.get_n_splits(), desc="Folds Progress")):

#     train_loader = DataLoader(ds_train, sampler=SubsetRandomSampler(train_idx), collate_fn=collate_fn, batch_size=BATCH_SIZE, num_workers=16)
#     val_loader = DataLoader(ds_train, sampler=SubsetRandomSampler(val_idx), collate_fn=collate_fn, batch_size=BATCH_SIZE, num_workers=16)
    
#     model = EraClassifier(LR=LR, num_classes=num_classes)    
#     fold_logger = pl.loggers.TensorBoardLogger(save_dir='.', name='lightning_logs', version=f"{VERSION_NAME}_fold_{fold}-4")
#     trainer = pl.Trainer(
#         max_epochs=EPOCHS,
#         accelerator='gpu',
#         devices='auto',
#         val_check_interval=0.2,
#         callbacks=[lr_monitor, early_stop_callback],
#         logger=fold_logger
#     )
    
#     # Train the model
#     trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
    
#     # Evaluate and store performance metrics for this fold
#     performance = evaluate_model(model, val_loader)
#     fold_performance.append(performance)
    
#     true_labels, predictions = evaluate_model(model, val_loader)
        
#     # Generate classification report
#     report_dict = classification_report(true_labels, predictions, output_dict=True)
    
#     # Optionally, convert the report to include the fold number or other details
#     report_with_details = {
#         'fold': fold,
#         'report': report_dict
#     }
    
#     # Save this fold's report
#     save_fold_report(report_with_details)

# # Analyze cross-validation results
# average_performance = np.mean(fold_performance)
# print(f"Average Performance over 10 folds: {average_performance}")

In [18]:
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    accelerator='gpu',
    devices='auto',
    val_check_interval=0.3,
    callbacks=[lr_monitor, early_stop_callback, PrintMetricsCallback()],
    logger=logger
)    

  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [19]:
print('Logs to:', VERSION_NAME)

Logs to: period_clf_bs16_lr1e-05_20epochs-resnet50-94936_samples_preprocessed-masked_April16-80-10-10_train_test_val-2


In [20]:
trainer.fit(model, dl_train, dl_val)

  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type               | Params
------------------------------------------------------
0 | gray_to_triple | Conv2d             | 6     
1 | core           | ResNet             | 23.6 M
2 | objective      | CrossEntropyLoss   | 0     
3 | train_acc      | MulticlassAccuracy | 0     
4 | val_acc        | MulticlassAccuracy | 0     
------------------------------------------------------
23.6 M    Trainable params
0         Non-trainable params
23.6 M    Total params
94.229    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]


Epoch 0 Metrics:
Train Loss: 0.0000, Val Loss: 3.2165, Train Acc: 0.00%, Val Acc: 0.00%


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]


Epoch 0 Metrics:
Train Loss: 0.0000, Val Loss: 1.1818, Train Acc: 0.00%, Val Acc: 46.20%


Validation: 0it [00:00, ?it/s]


Epoch 0 Metrics:
Train Loss: 0.0000, Val Loss: 1.0707, Train Acc: 0.00%, Val Acc: 47.63%


Validation: 0it [00:00, ?it/s]


Epoch 0 Metrics:
Train Loss: 0.0000, Val Loss: 1.0124, Train Acc: 0.00%, Val Acc: 50.05%


Metric val_loss improved. New best score: 1.012


Validation: 0it [00:00, ?it/s]


Epoch 1 Metrics:
Train Loss: 1.1872, Val Loss: 0.9516, Train Acc: 46.01%, Val Acc: 49.92%


Validation: 0it [00:00, ?it/s]


Epoch 1 Metrics:
Train Loss: 1.1872, Val Loss: 0.9445, Train Acc: 46.01%, Val Acc: 51.24%


Validation: 0it [00:00, ?it/s]


Epoch 1 Metrics:
Train Loss: 1.1872, Val Loss: 0.8999, Train Acc: 46.01%, Val Acc: 49.25%


Metric val_loss improved by 0.113 >= min_delta = 1e-05. New best score: 0.900


Validation: 0it [00:00, ?it/s]


Epoch 2 Metrics:
Train Loss: 0.9156, Val Loss: 0.8782, Train Acc: 54.82%, Val Acc: 51.71%


Validation: 0it [00:00, ?it/s]


Epoch 2 Metrics:
Train Loss: 0.9156, Val Loss: 0.8578, Train Acc: 54.82%, Val Acc: 50.61%


Validation: 0it [00:00, ?it/s]


Epoch 2 Metrics:
Train Loss: 0.9156, Val Loss: 0.8510, Train Acc: 54.82%, Val Acc: 50.39%


Metric val_loss improved by 0.049 >= min_delta = 1e-05. New best score: 0.851


Validation: 0it [00:00, ?it/s]


Epoch 3 Metrics:
Train Loss: 0.7825, Val Loss: 0.8499, Train Acc: 59.42%, Val Acc: 49.45%


Validation: 0it [00:00, ?it/s]


Epoch 3 Metrics:
Train Loss: 0.7825, Val Loss: 0.8499, Train Acc: 59.42%, Val Acc: 51.41%


Validation: 0it [00:00, ?it/s]


Epoch 3 Metrics:
Train Loss: 0.7825, Val Loss: 0.8546, Train Acc: 59.42%, Val Acc: 49.27%


Monitored metric val_loss did not improve in the last 1 records. Best score: 0.851. Signaling Trainer to stop.
