### Using models module in torchvision

In [1]:
import torch
import torch.nn as nn
from torchvision import models

def create_resnet_model(model_name, num_classes=10, weights=None):
    """Build a torchvision ResNet-family classifier with a replaced output layer.

    Args:
        model_name: Architecture to build; 'resnet50' or 'resnext50_32x4d'.
        num_classes: Number of output units of the new fully connected head.
        weights: Forwarded unchanged to the torchvision builder (e.g. 'DEFAULT'
            to load pretrained weights, None for random initialization).

    Returns:
        The model, whose ``fc`` layer has been replaced by a fresh
        ``nn.Linear`` producing ``num_classes`` outputs.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == 'resnet50':
        model = models.resnet50(weights=weights)
    elif model_name == 'resnext50_32x4d':
        model = models.resnext50_32x4d(weights=weights)
    else:
        # Fail here with a clear message instead of leaving model as None and
        # crashing below with an AttributeError on `model.fc`.
        raise ValueError(
            f"Unsupported model_name {model_name!r}; "
            "expected 'resnet50' or 'resnext50_32x4d'"
        )

    # Swap the original classification head for one sized to this task.
    num_in_features = model.fc.in_features
    model.fc = nn.Linear(in_features=num_in_features, out_features=num_classes)

    return model
    
# Build a ResNet-50 with pretrained weights (weights='DEFAULT') and a 2-class head.
# 'resnext50_32x4d' is the other model_name accepted by create_resnet_model.
model = create_resnet_model('resnet50', num_classes=2, weights='DEFAULT') #resnet50, resnext50_32x4d

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 211MB/s]


### Download cat and dog dataset and create meta data

In [2]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import train_test_split

def create_cnd_meta_df(file_dir):
    """Walk ``file_dir`` and build a metadata DataFrame for the cat/dog images.

    Every file whose name contains '.jpg' becomes one row with these columns:

    * ``path``    - directory name + '/' + file name.
    * ``dataset`` - 'train' if the path contains '/training_set/', 'test' if it
      contains '/test_set/', otherwise 'N/A'.
    * ``label``   - 'DOG' if the path contains 'dogs', 'CAT' if it contains
      'cats', otherwise 'N/A'.
    * ``target``  - 0 for DOG, 1 for CAT (any other label maps to NaN).
    """
    def _split_of(path):
        # The split is decided by which top-level image folder is in the path.
        if '/training_set/' in path:
            return 'train'
        if '/test_set/' in path:
            return 'test'
        return 'N/A'

    def _label_of(path):
        # 'dogs' is checked first, exactly like the original if/elif chain.
        if 'dogs' in path:
            return 'DOG'
        if 'cats' in path:
            return 'CAT'
        return 'N/A'

    # os.walk() visits every sub-directory; non-image files are filtered out.
    image_paths = [
        dirname + '/' + filename
        for dirname, _, filenames in os.walk(file_dir)
        for filename in filenames
        if '.jpg' in filename
    ]

    data_df = pd.DataFrame({
        'path': image_paths,
        'dataset': [_split_of(path) for path in image_paths],
        'label': [_label_of(path) for path in image_paths],
    })
    # Numeric encoding of the label used as the training target.
    data_df['target'] = data_df['label'].map({'DOG': 0, 'CAT': 1})

    return data_df


### create dataloader

In [3]:
# Scan everything under the Kaggle input directory and build the image metadata table
# (columns: path, dataset, label, target); the printed shape is (rows, columns).
data_df = create_cnd_meta_df(file_dir='/kaggle/input/')
print(data_df.shape)

(10028, 4)


In [4]:
# Separate the rows by the split recorded in the 'dataset' column.
train_df = data_df.loc[data_df['dataset'] == 'train']
test_df = data_df.loc[data_df['dataset'] == 'test']

# Keep a stratified half of the training rows; the other half is discarded.
train_df_temp, _ = train_test_split(
    train_df,
    test_size=0.5,
    stratify=train_df['target'],
    random_state=2025,
)

# Split the retained half evenly, again stratified on the target, into the
# rows used for training and the rows used for validation.
tr_df, val_df = train_test_split(
    train_df_temp,
    test_size=0.5,
    stratify=train_df_temp['target'],
    random_state=2025,
)

print(data_df.shape, train_df.shape, tr_df.shape, val_df.shape)

(10028, 4) (8005, 4) (2001, 4) (2001, 4)


In [5]:
import torch
from torch.utils.data import Dataset
from PIL import Image

class CnD_Dataset(Dataset):
    """Map-style dataset that loads cat/dog images from disk on demand.

    Args:
        image_paths: Sequence of image file paths.
        targets: Optional sequence of target values aligned with
            ``image_paths``; omit it for unlabeled data.
        transform: Optional callable applied to the loaded RGB PIL image
            (typically a torchvision transform pipeline ending in a tensor).
            When it is None the RGB PIL image itself is returned.
    """

    def __init__(self, image_paths, targets=None, transform=None):
        self.image_paths = image_paths
        self.targets = targets
        self.transform = transform

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for ``idx``, or just ``image`` when no targets were given."""
        # Force three channels so grayscale / palette / RGBA files do not break
        # a 3-channel Normalize; the context manager closes the file handle
        # once the pixel data has been copied by convert().
        with Image.open(self.image_paths[idx]) as opened:
            image = opened.convert('RGB')

        # transform defaults to None, so it must not be called unconditionally.
        if self.transform is not None:
            image = self.transform(image)

        # Unlabeled (e.g. inference-only) data yields the image alone.
        if self.targets is None:
            return image

        # Convert the individual target value to a tensor.
        target = torch.tensor(self.targets[idx])
        return image, target


In [6]:
from torch.utils.data import DataLoader
from torchvision import transforms as T

# Mini-batch size of the training loader (the validation loader uses twice this).
BATCH_SIZE = 16
# Side length, in pixels, that every image is resized to.
IMG_SIZE = 224
# Per-channel mean / std passed to T.Normalize. These are the standard ImageNet
# statistics used with torchvision's pretrained classification weights.
IMG_MEANS = [0.485, 0.456, 0.406] 
IMG_STD = [0.229, 0.224, 0.225]

def create_tr_val_loader(tr_df, val_df, transform):
    """Wrap the train / validation metadata frames in DataLoaders.

    Both frames must provide 'path' and 'target' columns; the same
    ``transform`` is applied to both datasets.

    Returns:
        ``(tr_loader, val_loader)``. The training loader shuffles and uses
        ``BATCH_SIZE``; the validation loader keeps the row order and uses
        twice that batch size.
    """
    def _to_dataset(frame):
        # One CnD_Dataset per frame, built from its path / target columns.
        return CnD_Dataset(image_paths=frame['path'].to_list(),
                           targets=frame['target'].to_list(),
                           transform=transform)

    # Loader options shared by both splits.
    worker_opts = dict(num_workers=4, pin_memory=True)

    tr_loader = DataLoader(_to_dataset(tr_df), batch_size=BATCH_SIZE,
                           shuffle=True, **worker_opts)
    val_loader = DataLoader(_to_dataset(val_df), batch_size=2 * BATCH_SIZE,
                            shuffle=False, **worker_opts)

    return tr_loader, val_loader

# Preprocessing shared by the training and validation datasets:
# resize to a square, convert to a tensor, then normalize each channel.
transform_01 = T.Compose([
    T.Resize(size=(IMG_SIZE, IMG_SIZE)),
    T.ToTensor(),
    T.Normalize(mean=IMG_MEANS, std=IMG_STD),
])

tr_loader, val_loader = create_tr_val_loader(
    tr_df=tr_df,
    val_df=val_df,
    transform=transform_01,
)

### Trainer Class Creation – Apply ModelCheckpoint and Early Stopping Logic

* Add `checkpoint_dir` and `early_patience` arguments to the constructor.
* Apply ModelCheckpoint and Early Stopping logic to the `fit()` method.
* In PyTorch, saving a model:

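  * `torch.save(model, 'filename.pt')`: Pickles the entire model object, i.e. both the architecture and the parameters (weights). The optimizer state is not included; save `optimizer.state_dict()` separately if training must be resumed.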
  * `torch.save(model.state_dict(), 'model_state_filename.pt')`: Saves only the parameters. More commonly used.
* Loading a PyTorch model:

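  * `model = torch.load('filename.pt')`: Loads the entire model (architecture and parameters together). On recent PyTorch versions, where `weights_only` defaults to `True`, this requires `weights_only=False`; only do that for files you trust.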
  * `model.load_state_dict(torch.load('model_state_filename.pt'))`


In [7]:
from tqdm import tqdm
import torch.nn.functional as F

class Trainer_01:
    """Training loop with built-in best-model checkpointing and early stopping.

    Each epoch runs one pass over ``train_loader`` and, when validation data is
    available, one pass over ``val_loader``. Whenever the validation loss beats
    the best value seen so far, the model's ``state_dict`` is written to
    ``checkpoint_dir``; after ``early_patience`` consecutive epochs without
    improvement, training stops.
    """

    def __init__(self, model, loss_fn, optimizer, train_loader, val_loader, scheduler=None,
                 checkpoint_dir='/kaggle/working/checkpoints', early_patience=5, device=None):
        """Store the training components and prepare the checkpoint directory.

        Args:
            model: Module to train; it is moved to ``device``.
            loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
            optimizer: Optimizer already bound to the model's parameters.
            train_loader: Iterable of ``(inputs, targets)`` training batches.
            val_loader: Iterable of ``(inputs, targets)`` validation batches;
                when it is None or empty, validation, checkpointing and early
                stopping are skipped.
            scheduler: Optional learning-rate scheduler. ``ReduceLROnPlateau``
                is stepped with the validation loss; any other scheduler is
                stepped once per training epoch.
            checkpoint_dir: Directory that receives the checkpoint files.
            early_patience: Number of consecutive non-improving epochs that
                triggers early stopping.
            device: Device for the model and every batch.
        """
        self.model = model.to(device)
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Learning rate of the first parameter group, reported in the epoch log.
        self.current_lr = self.optimizer.param_groups[0]['lr']
        # Checkpoint / early-stopping state.
        self.checkpoint_dir = checkpoint_dir
        self.early_patience = early_patience
        # Smaller loss is better, so start from infinity.
        self.best_val_loss = float('inf')
        self.early_stopping_counter = 0

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(checkpoint_dir, exist_ok=True)

    def train_epoch(self, epoch):
        """Run one optimization pass over ``train_loader``.

        Returns:
            ``(running_avg_loss, accuracy)``: the mean of the per-batch losses
            and the fraction of correctly classified samples in this epoch.
        """
        self.model.train()

        # Running average of the per-batch losses.
        accu_loss = 0.0
        running_avg_loss = 0.0
        # Sample count and accumulated number of correct predictions.
        num_total = 0.0
        accu_num_correct = 0.0
        accuracy = 0.0
        # Visualize training progress with tqdm.
        with tqdm(total=len(self.train_loader), desc=f"Epoch {epoch+1} [Training..]", leave=True) as progress_bar:
            for batch_idx, (inputs, targets) in enumerate(self.train_loader):
                # Use the trainer's own device, not a global `device` variable.
                inputs = inputs.to(self.device)
                targets = targets.to(self.device)

                # Forward pass
                outputs = self.model(inputs)
                loss = self.loss_fn(outputs, targets)

                # Backward pass
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                # Accumulate the batch loss and refresh the running average.
                accu_loss += loss.item()
                running_avg_loss = accu_loss / (batch_idx + 1)

                # Accuracy: predicted class (argmax over the last dim) vs. target.
                num_correct = (outputs.argmax(-1) == targets).sum().item()
                num_total += inputs.shape[0]
                accu_num_correct += num_correct
                accuracy = accu_num_correct / num_total

                # Refresh the bar's postfix every 20 batches and on the last batch.
                progress_bar.update(1)
                if batch_idx % 20 == 0 or (batch_idx + 1) == progress_bar.total:
                    progress_bar.set_postfix({"Loss": running_avg_loss,
                                              "Accuracy": accuracy})

        # Epoch-level schedulers step here; ReduceLROnPlateau needs the
        # validation loss and is therefore stepped in validate_epoch().
        if (self.scheduler is not None) and (not isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)):
            self.scheduler.step()
            self.current_lr = self.scheduler.get_last_lr()[0]

        return running_avg_loss, accuracy

    def validate_epoch(self, epoch):
        """Evaluate the model on ``val_loader`` without gradient tracking.

        Returns:
            ``(running_avg_loss, accuracy)``, or None when there is no
            validation data (``val_loader`` is None or empty).
        """
        if not self.val_loader:
            return None

        self.model.eval()

        # Running average of the per-batch losses.
        accu_loss = 0
        running_avg_loss = 0
        # Sample count and accumulated number of correct predictions.
        num_total = 0.0
        accu_num_correct = 0.0
        accuracy = 0.0
        with tqdm(total=len(self.val_loader), desc=f"Epoch {epoch+1} [Validating]", leave=True) as progress_bar:
            with torch.no_grad():
                for batch_idx, (inputs, targets) in enumerate(self.val_loader):
                    inputs = inputs.to(self.device)
                    targets = targets.to(self.device)

                    outputs = self.model(inputs)

                    loss = self.loss_fn(outputs, targets)
                    accu_loss += loss.item()
                    running_avg_loss = accu_loss / (batch_idx + 1)

                    num_correct = (outputs.argmax(-1) == targets).sum().item()
                    num_total += inputs.shape[0]
                    accu_num_correct += num_correct
                    accuracy = accu_num_correct / num_total

                    # Refresh the bar's postfix every 40 batches and on the last batch.
                    progress_bar.update(1)
                    if batch_idx % 40 == 0 or (batch_idx + 1) == progress_bar.total:
                        progress_bar.set_postfix({"Loss": running_avg_loss,
                                                  "Accuracy": accuracy})

        # Feed the epoch-level validation loss to ReduceLROnPlateau.
        if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            self.scheduler.step(running_avg_loss)
            # Read the learning rate back from the optimizer; this does not
            # depend on the scheduler exposing get_last_lr().
            self.current_lr = self.optimizer.param_groups[0]['lr']

        return running_avg_loss, accuracy

    def fit(self, epochs):
        """Train for up to ``epochs`` epochs with checkpointing and early stopping.

        Returns:
            A history dict with per-epoch lists under 'train_loss',
            'train_acc', 'val_loss', 'val_acc' and 'lr'. The validation entries
            are None for epochs that ran without validation data.
        """
        history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': [], 'lr': []}
        for epoch in range(epochs):
            train_loss, train_acc = self.train_epoch(epoch)
            # validate_epoch() returns None when there is nothing to validate
            # on; unpacking that directly would raise a TypeError.
            val_result = self.validate_epoch(epoch)
            val_loss, val_acc = val_result if val_result is not None else (None, None)
            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f} Train Accuracy: {train_acc:.4f}",
                  f", Val Loss: {val_loss:.4f} Val Accuracy: {val_acc:.4f}" if val_loss is not None else "",
                  f", Current lr:{self.current_lr:.6f}")
            # Record the epoch's results, including the learning rate.
            history['train_loss'].append(train_loss); history['train_acc'].append(train_acc)
            history['val_loss'].append(val_loss); history['val_acc'].append(val_acc)
            history['lr'].append(self.current_lr)

            # Without a validation loss there is nothing to monitor, so neither
            # checkpointing nor early stopping applies.
            if val_loss is None:
                continue

            if val_loss < self.best_val_loss:
                # New best: remember it, save the weights, reset the counter.
                self.best_val_loss = val_loss
                self.save_checkpoint(epoch, val_loss)
                self.early_stopping_counter = 0
            else:
                self.early_stopping_counter += 1
                if self.early_stopping_counter >= self.early_patience:
                    print("Early Stopping happens and training stops")
                    break

        return history

    def save_checkpoint(self, epoch, val_loss):
        """Write the model's ``state_dict`` to a file named after the epoch and loss."""
        checkpoint_path = os.path.join(self.checkpoint_dir, f'checkpoint_epoch_{epoch+1}_loss_{val_loss:.4f}.pt')
        torch.save(self.model.state_dict(), checkpoint_path)
        print(f"Saved model checkpoint at {checkpoint_path}")

    def get_trained_model(self):
        """Return the model in its current state (weights after the last epoch that ran)."""
        return self.model


#### Train using the modified Trainer class


In [8]:
import torch
import torch.nn as nn
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained ResNet-50 with a 2-class head; every parameter is optimized.
model = create_resnet_model('resnet50', num_classes=2, weights='DEFAULT')
loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)  # model.fc.parameters(), weight_decay=0.9

# Halve the learning rate after 5 epochs without a meaningful drop in the
# monitored validation loss (patience increased to 5).
scheduler = ReduceLROnPlateau(
    optimizer=optimizer,
    mode='min',
    factor=0.5,
    patience=5,
    threshold=0.01,
    min_lr=0.00001,
)

# Pass the directory where checkpoint files are saved and the early-stop patience.
trainer = Trainer_01(
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_loader=tr_loader,
    val_loader=val_loader,
    scheduler=scheduler,
    checkpoint_dir='checkpoints',
    early_patience=3,
    device=device,
)

# Training and evaluation
history = trainer.fit(30)


Epoch 1 [Training..]: 100%|██████████| 126/126 [00:13<00:00,  9.52it/s, Loss=0.332, Accuracy=0.873]
Epoch 1 [Validating]: 100%|██████████| 63/63 [00:05<00:00, 11.23it/s, Loss=0.48, Accuracy=0.846]


Epoch 1/30, Train Loss: 0.3315 Train Accuracy: 0.8726 , Val Loss: 0.4799 Val Accuracy: 0.8456 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_1_loss_0.4799.pt


Epoch 2 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.58it/s, Loss=0.202, Accuracy=0.92]
Epoch 2 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.70it/s, Loss=0.826, Accuracy=0.812]


Epoch 2/30, Train Loss: 0.2020 Train Accuracy: 0.9195 , Val Loss: 0.8264 Val Accuracy: 0.8116 , Current lr:0.001000


Epoch 3 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.59it/s, Loss=0.173, Accuracy=0.94]
Epoch 3 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.93it/s, Loss=0.278, Accuracy=0.9]


Epoch 3/30, Train Loss: 0.1726 Train Accuracy: 0.9395 , Val Loss: 0.2779 Val Accuracy: 0.8996 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_3_loss_0.2779.pt


Epoch 4 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.58it/s, Loss=0.187, Accuracy=0.929]
Epoch 4 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.59it/s, Loss=0.258, Accuracy=0.906]


Epoch 4/30, Train Loss: 0.1870 Train Accuracy: 0.9285 , Val Loss: 0.2582 Val Accuracy: 0.9055 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_4_loss_0.2582.pt


Epoch 5 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.58it/s, Loss=0.214, Accuracy=0.919]
Epoch 5 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 15.03it/s, Loss=0.192, Accuracy=0.922]


Epoch 5/30, Train Loss: 0.2143 Train Accuracy: 0.9185 , Val Loss: 0.1924 Val Accuracy: 0.9220 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_5_loss_0.1924.pt


Epoch 6 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.57it/s, Loss=0.153, Accuracy=0.948]
Epoch 6 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.95it/s, Loss=0.193, Accuracy=0.919]


Epoch 6/30, Train Loss: 0.1534 Train Accuracy: 0.9480 , Val Loss: 0.1928 Val Accuracy: 0.9185 , Current lr:0.001000


Epoch 7 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.61it/s, Loss=0.186, Accuracy=0.931]
Epoch 7 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.49it/s, Loss=0.259, Accuracy=0.897]


Epoch 7/30, Train Loss: 0.1858 Train Accuracy: 0.9305 , Val Loss: 0.2587 Val Accuracy: 0.8966 , Current lr:0.001000


Epoch 8 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.60it/s, Loss=0.0891, Accuracy=0.972]
Epoch 8 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 15.00it/s, Loss=0.197, Accuracy=0.932]

Epoch 8/30, Train Loss: 0.0891 Train Accuracy: 0.9715 , Val Loss: 0.1970 Val Accuracy: 0.9320 , Current lr:0.001000
Early Stopping happens and training stops





#### Predict

In [9]:
class Predictor:
    """Inference helper wrapping a trained classification model.

    The model is moved to ``device`` on construction and switched to eval mode
    before every evaluation or prediction.
    """

    def __init__(self, model, device):
        self.device = device
        self.model = model.to(device)

    def evaluate(self, loader):
        """Return the accuracy of the model over all ``(inputs, targets)`` batches of ``loader``.

        An empty loader yields 0.0.
        """
        self.model.eval()
        accuracy = 0.0
        samples_seen = 0.0
        correct_so_far = 0.0

        with tqdm(total=len(loader), desc="[Evaluating]", leave=True) as progress_bar, torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(loader):
                inputs, targets = inputs.to(self.device), targets.to(self.device)
                logits = self.model(inputs)

                # Running accuracy over every sample seen so far.
                correct_so_far += (logits.argmax(-1) == targets).sum().item()
                samples_seen += inputs.shape[0]
                accuracy = correct_so_far / samples_seen

                # Refresh the bar's postfix every 20 batches and on the last batch.
                progress_bar.update(1)
                is_last_batch = (batch_idx + 1) == progress_bar.total
                if is_last_batch or batch_idx % 20 == 0:
                    progress_bar.set_postfix({"Accuracy": accuracy})

        return accuracy

    def predict_proba(self, inputs):
        """Return softmax probabilities over the last dimension of the model output."""
        self.model.eval()
        with torch.no_grad():
            logits = self.model(inputs.to(self.device))
            return F.softmax(logits, dim=-1)

    def predict(self, inputs):
        """Return the index of the most probable class for each input."""
        return torch.argmax(self.predict_proba(inputs), dim=-1)

In [10]:
# The test split goes through the same preprocessing steps as the training data:
# resize, tensor conversion, per-channel normalization.
IMG_SIZE=224
test_transform = T.Compose([
    T.Resize(size=(IMG_SIZE, IMG_SIZE)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

test_image_paths = test_df['path'].to_list()
test_targets = test_df['target'].to_list()
test_dataset = CnD_Dataset(
    image_paths=test_image_paths,
    targets=test_targets,
    transform=test_transform,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

# Score the model held by the trainer, i.e. its weights after the last epoch that ran.
trained_model = trainer.get_trained_model()
predictor = Predictor(model=trained_model, device=device)
eval_metric = predictor.evaluate(test_loader)
print(f'test dataset evaluation:{eval_metric:.4f}')

[Evaluating]: 100%|██████████| 64/64 [00:05<00:00, 11.49it/s, Accuracy=0.936]

test dataset evaluation:0.9357





### Checkpoint File Verification

* Check the files generated in the `/kaggle/working/checkpoints` directory.


In [11]:
!ls /kaggle/working/checkpoints

checkpoint_epoch_1_loss_0.4799.pt  checkpoint_epoch_4_loss_0.2582.pt
checkpoint_epoch_3_loss_0.2779.pt  checkpoint_epoch_5_loss_0.1924.pt


In [12]:
import os
import glob
import re
import torch

# Directory holding the checkpoint files written during training.
# NOTE(review): the trainer above was given the relative path 'checkpoints', so this
# absolute path matches only when the working directory is /kaggle/working - confirm.
ckpt_dir = "/kaggle/working/checkpoints"

# Get all checkpoint files from the directory
checkpoint_files = glob.glob(os.path.join(ckpt_dir, "checkpoint_epoch_*.pt"))

# Function to extract the epoch number from the filename
def extract_epoch_num(filename):
    """Return the integer that follows 'epoch_' in ``filename``, or -1 if there is none."""
    found = re.search(r"epoch_(\d+)", filename)
    if found is None:
        return -1
    return int(found.group(1))

# Select the checkpoint with the highest epoch number. Fail with a clear
# message when nothing matched, instead of max() raising on an empty list.
if not checkpoint_files:
    raise FileNotFoundError(
        f"No 'checkpoint_epoch_*.pt' files found in {ckpt_dir}"
    )
latest_ckpt = max(checkpoint_files, key=extract_epoch_num)
print("Loading checkpoint:", latest_ckpt)

# Load the state_dict from the latest checkpoint. map_location='cpu' keeps the
# load working on a CPU-only runtime even if the file was saved from a GPU
# model; weights_only=True restricts unpickling to tensors and plain containers.
state_dict = torch.load(latest_ckpt, map_location='cpu', weights_only=True)

# Create a model without pretrained weights; every parameter comes from the checkpoint.
new_model = create_resnet_model('resnet50', num_classes=2, weights=None)

# Load the parameters from the checkpoint into the model
new_model.load_state_dict(state_dict)


Loading checkpoint: /kaggle/working/checkpoints/checkpoint_epoch_5_loss_0.1924.pt


<All keys matched successfully>

In [13]:
# Evaluate the model restored from the checkpoint file on the same test loader,
# for comparison with the score of the in-memory model held by the trainer.
predictor = Predictor(model=new_model, device=device)
eval_metric = predictor.evaluate(test_loader)
print(f'test dataset evaluation:{eval_metric:.4f}')

[Evaluating]: 100%|██████████| 64/64 [00:04<00:00, 14.91it/s, Accuracy=0.913]

test dataset evaluation:0.9135





### Create ModelCheckpoint class

In [14]:
import os
import torch

class ModelCheckpoint:
    """Callback that writes a model's ``state_dict`` to ``checkpoint_dir``.

    With ``save_interval == 1`` a file is written only when the monitored value
    improves on the best value seen so far ('min' mode: smaller is better; any
    other mode: larger is better). With ``save_interval > 1`` a file is written
    every ``save_interval`` epochs regardless of the value.
    """

    def __init__(self, checkpoint_dir='checkpoints', monitor='val_loss', mode='min', save_interval=1, verbose=1):
        self.checkpoint_dir = checkpoint_dir
        self.monitor = monitor
        self.mode = mode
        self.save_interval = save_interval
        self.verbose = verbose
        # Start from the worst possible value so the first epoch always counts as an improvement.
        self.best_value = -float('inf') if mode != 'min' else float('inf')
        self._make_checkpoint_dir_unless()

    def _make_checkpoint_dir_unless(self):
        """Create the checkpoint directory unless it already exists."""
        if os.path.exists(self.checkpoint_dir):
            return
        os.makedirs(self.checkpoint_dir)

    def is_improvement(self, value):
        """Tell whether ``value`` is strictly better than the best value so far."""
        if self.mode != 'min':
            return value > self.best_value
        return value < self.best_value

    def update_best_value(self, value):
        """Record ``value`` as the new best; meant to follow a successful is_improvement()."""
        self.best_value = value

    def save(self, model, epoch, value):
        """Save a checkpoint for ``epoch`` (0-based) if the configured policy asks for one."""
        if self.save_interval == 1:
            # Improvement-driven saving.
            if not self.is_improvement(value):
                return
            self._checkpoint_save(model, epoch, value)
            self.update_best_value(value)
            return

        # Periodic saving: every save_interval-th epoch, counted from 1.
        is_due = self.save_interval > 1 and (epoch + 1) % self.save_interval == 0
        if is_due:
            self._checkpoint_save(model, epoch, value)

    def _checkpoint_save(self, model, epoch, value):
        """Write the state_dict to a file named after the epoch, monitor name and value."""
        file_name = f'checkpoint_epoch_{epoch+1}_{self.monitor}_{value:.4f}.pt'
        checkpoint_path = os.path.join(self.checkpoint_dir, file_name)
        torch.save(model.state_dict(), checkpoint_path)
        if self.verbose:
            print(f"Saved model checkpoint at {checkpoint_path}")


#### Modify the Trainer class to apply the ModelCheckpoint


In [15]:
from tqdm import tqdm
import torch.nn.functional as F

class Trainer_02:
    """Training loop that delegates checkpointing to an optional callback.

    ``checkpoint_cb`` is expected to expose a ``monitor`` attribute
    ('val_loss' or 'val_acc') and a ``save(model, epoch, value)`` method, as
    the ModelCheckpoint class does.
    """

    def __init__(self, model, loss_fn, optimizer, train_loader, val_loader, scheduler=None,
                 checkpoint_cb=None, device=None):
        """Store the training components.

        Args:
            model: Module to train; it is moved to ``device``.
            loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
            optimizer: Optimizer already bound to the model's parameters.
            train_loader: Iterable of ``(inputs, targets)`` training batches.
            val_loader: Iterable of ``(inputs, targets)`` validation batches;
                when it is None or empty, validation and checkpointing are skipped.
            scheduler: Optional learning-rate scheduler. ``ReduceLROnPlateau``
                is stepped with the validation loss; any other scheduler is
                stepped once per training epoch.
            checkpoint_cb: Optional checkpoint callback (see class docstring).
            device: Device for the model and every batch.
        """
        self.model = model.to(device)
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Learning rate of the first parameter group, reported in the epoch log.
        self.current_lr = self.optimizer.param_groups[0]['lr']
        self.checkpoint_cb = checkpoint_cb

    def train_epoch(self, epoch):
        """Run one optimization pass over ``train_loader``.

        Returns:
            ``(running_avg_loss, accuracy)``: the mean of the per-batch losses
            and the fraction of correctly classified samples in this epoch.
        """
        self.model.train()

        accu_loss = 0.0
        running_avg_loss = 0.0
        num_total = 0.0
        accu_num_correct = 0.0
        accuracy = 0.0
        with tqdm(total=len(self.train_loader), desc=f"Epoch {epoch+1} [Training..]", leave=True) as progress_bar:
            for batch_idx, (inputs, targets) in enumerate(self.train_loader):
                # Use the trainer's own device, not a global `device` variable.
                inputs = inputs.to(self.device)
                targets = targets.to(self.device)

                # Forward pass
                outputs = self.model(inputs)
                loss = self.loss_fn(outputs, targets)

                # Backward pass
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                accu_loss += loss.item()
                running_avg_loss = accu_loss / (batch_idx + 1)

                num_correct = (outputs.argmax(-1) == targets).sum().item()
                num_total += inputs.shape[0]
                accu_num_correct += num_correct
                accuracy = accu_num_correct / num_total

                # Refresh the bar's postfix every 20 batches and on the last batch.
                progress_bar.update(1)
                if batch_idx % 20 == 0 or (batch_idx + 1) == progress_bar.total:
                    progress_bar.set_postfix({"Loss": running_avg_loss,
                                              "Accuracy": accuracy})

        # Epoch-level schedulers step here; ReduceLROnPlateau needs the
        # validation loss and is therefore stepped in validate_epoch().
        if (self.scheduler is not None) and (not isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)):
            self.scheduler.step()
            self.current_lr = self.scheduler.get_last_lr()[0]

        return running_avg_loss, accuracy

    def validate_epoch(self, epoch):
        """Evaluate the model on ``val_loader`` without gradient tracking.

        Returns:
            ``(running_avg_loss, accuracy)``, or None when there is no
            validation data (``val_loader`` is None or empty).
        """
        if not self.val_loader:
            return None

        self.model.eval()

        accu_loss = 0
        running_avg_loss = 0
        num_total = 0.0
        accu_num_correct = 0.0
        accuracy = 0.0
        with tqdm(total=len(self.val_loader), desc=f"Epoch {epoch+1} [Validating]", leave=True) as progress_bar:
            with torch.no_grad():
                for batch_idx, (inputs, targets) in enumerate(self.val_loader):
                    inputs = inputs.to(self.device)
                    targets = targets.to(self.device)

                    outputs = self.model(inputs)

                    loss = self.loss_fn(outputs, targets)
                    accu_loss += loss.item()
                    running_avg_loss = accu_loss / (batch_idx + 1)

                    num_correct = (outputs.argmax(-1) == targets).sum().item()
                    num_total += inputs.shape[0]
                    accu_num_correct += num_correct
                    accuracy = accu_num_correct / num_total

                    # Refresh the bar's postfix every 20 batches and on the last batch.
                    progress_bar.update(1)
                    if batch_idx % 20 == 0 or (batch_idx + 1) == progress_bar.total:
                        progress_bar.set_postfix({"Loss": running_avg_loss,
                                                  "Accuracy": accuracy})

        # Feed the epoch-level loss computed on the validation data to the scheduler.
        if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            self.scheduler.step(running_avg_loss)
            # Read the learning rate back from the optimizer; this does not
            # depend on the scheduler exposing get_last_lr().
            self.current_lr = self.optimizer.param_groups[0]['lr']

        return running_avg_loss, accuracy

    def fit(self, epochs):
        """Train for ``epochs`` epochs, invoking the checkpoint callback after each one.

        Returns:
            A history dict with per-epoch lists under 'train_loss',
            'train_acc', 'val_loss', 'val_acc' and 'lr'. The validation entries
            are None for epochs that ran without validation data.
        """
        history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': [], 'lr': []}
        for epoch in range(epochs):
            train_loss, train_acc = self.train_epoch(epoch)
            # validate_epoch() returns None when there is nothing to validate
            # on; unpacking that directly would raise a TypeError.
            val_result = self.validate_epoch(epoch)
            val_loss, val_acc = val_result if val_result is not None else (None, None)
            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f} Train Accuracy: {train_acc:.4f}",
                  f", Val Loss: {val_loss:.4f} Val Accuracy: {val_acc:.4f}" if val_loss is not None else "",
                  f", Current lr:{self.current_lr:.6f}")
            history['train_loss'].append(train_loss); history['train_acc'].append(train_acc)
            history['val_loss'].append(val_loss); history['val_acc'].append(val_acc)
            history['lr'].append(self.current_lr)

            # If a checkpoint callback was supplied, call save() with the value
            # it monitors. Without validation metrics there is nothing to hand over.
            if self.checkpoint_cb and val_loss is not None:
                if self.checkpoint_cb.monitor == 'val_loss':
                    self.checkpoint_cb.save(self.model, epoch, val_loss)
                elif self.checkpoint_cb.monitor == 'val_acc':
                    self.checkpoint_cb.save(self.model, epoch, val_acc)

        return history

    def get_trained_model(self):
        """Return the model in its current state (weights after the last epoch that ran)."""
        return self.model

In [16]:
import torch
import torch.nn as nn
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained ResNet-50 with a 2-class head; every parameter is optimized.
model = create_resnet_model('resnet50', num_classes=2, weights='DEFAULT')
loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)  # model.fc.parameters(), weight_decay=0.9
scheduler = ReduceLROnPlateau(
    optimizer=optimizer,
    mode='min',
    factor=0.5,
    patience=5,
    threshold=0.01,
    min_lr=0.00001,
)

# ModelCheckpoint: save whenever the validation loss improves.
checkpoint_cb = ModelCheckpoint(
    'checkpoints', monitor='val_loss', mode='min', save_interval=1, verbose=1,
)
# checkpoint_cb = ModelCheckpoint('checkpoints', monitor='val_acc', mode='max', save_interval=1, verbose=1)

trainer = Trainer_02(
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_loader=tr_loader,
    val_loader=val_loader,
    scheduler=scheduler,
    checkpoint_cb=checkpoint_cb,
    device=device,
)

# Training and evaluation.
history = trainer.fit(10)

Epoch 1 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.55it/s, Loss=0.293, Accuracy=0.885]
Epoch 1 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.68it/s, Loss=0.397, Accuracy=0.837]


Epoch 1/10, Train Loss: 0.2931 Train Accuracy: 0.8846 , Val Loss: 0.3974 Val Accuracy: 0.8371 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_1_val_loss_0.3974.pt


Epoch 2 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.56it/s, Loss=0.266, Accuracy=0.888]
Epoch 2 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.06it/s, Loss=0.177, Accuracy=0.932]


Epoch 2/10, Train Loss: 0.2663 Train Accuracy: 0.8876 , Val Loss: 0.1767 Val Accuracy: 0.9315 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_2_val_loss_0.1767.pt


Epoch 3 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.57it/s, Loss=0.166, Accuracy=0.938]
Epoch 3 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.61it/s, Loss=0.214, Accuracy=0.914]


Epoch 3/10, Train Loss: 0.1662 Train Accuracy: 0.9380 , Val Loss: 0.2143 Val Accuracy: 0.9135 , Current lr:0.001000


Epoch 4 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.56it/s, Loss=0.138, Accuracy=0.952]
Epoch 4 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.77it/s, Loss=0.231, Accuracy=0.906]


Epoch 4/10, Train Loss: 0.1379 Train Accuracy: 0.9515 , Val Loss: 0.2314 Val Accuracy: 0.9055 , Current lr:0.001000


Epoch 5 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.56it/s, Loss=0.125, Accuracy=0.954]
Epoch 5 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.28it/s, Loss=0.227, Accuracy=0.925]


Epoch 5/10, Train Loss: 0.1250 Train Accuracy: 0.9535 , Val Loss: 0.2270 Val Accuracy: 0.9245 , Current lr:0.001000


Epoch 6 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.57it/s, Loss=0.119, Accuracy=0.958]
Epoch 6 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.64it/s, Loss=0.176, Accuracy=0.93]


Epoch 6/10, Train Loss: 0.1192 Train Accuracy: 0.9575 , Val Loss: 0.1760 Val Accuracy: 0.9300 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_6_val_loss_0.1760.pt


Epoch 7 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.57it/s, Loss=0.137, Accuracy=0.954]
Epoch 7 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.60it/s, Loss=0.17, Accuracy=0.935]


Epoch 7/10, Train Loss: 0.1371 Train Accuracy: 0.9540 , Val Loss: 0.1696 Val Accuracy: 0.9350 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_7_val_loss_0.1696.pt


Epoch 8 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.57it/s, Loss=0.318, Accuracy=0.867]
Epoch 8 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.40it/s, Loss=0.238, Accuracy=0.905]


Epoch 8/10, Train Loss: 0.3178 Train Accuracy: 0.8671 , Val Loss: 0.2379 Val Accuracy: 0.9045 , Current lr:0.001000


Epoch 9 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.58it/s, Loss=0.158, Accuracy=0.951]
Epoch 9 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.42it/s, Loss=0.204, Accuracy=0.923]


Epoch 9/10, Train Loss: 0.1579 Train Accuracy: 0.9505 , Val Loss: 0.2039 Val Accuracy: 0.9225 , Current lr:0.001000


Epoch 10 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.58it/s, Loss=0.164, Accuracy=0.938]
Epoch 10 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.64it/s, Loss=0.158, Accuracy=0.939]


Epoch 10/10, Train Loss: 0.1644 Train Accuracy: 0.9380 , Val Loss: 0.1584 Val Accuracy: 0.9385 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_10_val_loss_0.1584.pt


### Create EarlyStopping class

In [17]:
class EarlyStopping:
    """Signal that training should stop once a monitored metric stops improving.

    Args:
        monitor: name of the metric the caller should feed in
            (e.g. 'val_loss' or 'val_acc'); stored only, not interpreted here.
        mode: 'min' treats smaller values as better; any other value treats
            larger values as better.
        early_patience: number of consecutive non-improving checks after which
            a stop is signalled.
        verbose: truthy to print the counter and the stop message.
    """

    def __init__(self, monitor='val_loss', mode='min', early_patience=5, verbose=1):
        self.monitor = monitor
        self.mode = mode
        self.early_patience = early_patience
        self.verbose = verbose
        # consecutive checks without a new best value
        self.counter = 0
        # start from the worst possible value so the first check always improves
        if mode == 'min':
            self.best_value = float('inf')
        else:
            self.best_value = -float('inf')

    def is_improvement(self, value):
        """Return whether `value` is strictly better than the best value seen so far."""
        lower_is_better = self.mode == 'min'
        return value < self.best_value if lower_is_better else value > self.best_value

    def check_early_stop(self, value):
        """Record `value` and return True when patience has run out, else False."""
        # a new best resets the patience counter
        if self.is_improvement(value):
            self.best_value = value
            self.counter = 0
            return False

        # no improvement: consume one unit of patience
        self.counter += 1
        if self.verbose:
            print(f"EarlyStopping counter: {self.counter}/{self.early_patience}")

        patience_exhausted = self.counter >= self.early_patience
        if not patience_exhausted:
            return False

        if self.verbose:
            print("Early stopping happens and train stops")
        return True
        

### Trainer class with EarlyStopping and ModelCheckpoint callbacks

In [18]:
from tqdm import tqdm
import torch.nn.functional as F

class Trainer:
    """Epoch-based training/validation loop for a classifier, with optional
    learning-rate scheduling, model checkpointing and early stopping.

    Args:
        model: module to train; it is moved to `device` on construction.
        loss_fn: callable invoked as `loss_fn(outputs, targets)`; its result must
            support `.backward()` and `.item()`.
        optimizer: optimizer whose first param group supplies the reported lr.
        train_loader: iterable of `(inputs, targets)` training batches.
        val_loader: iterable of `(inputs, targets)` validation batches; when it is
            None/empty, validation (and everything that depends on it) is skipped.
        scheduler: optional lr scheduler. A ReduceLROnPlateau is stepped with the
            validation loss after each validation pass; any other scheduler is
            stepped once after each training epoch.
        callbacks: optional list of ModelCheckpoint / EarlyStopping instances.
        device: device that the model and every batch are moved to.
    """

    def __init__(self, model, loss_fn, optimizer, train_loader, val_loader, scheduler=None, 
                 callbacks=None, device=None):
        self.model = model.to(device)
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # learning rate of the first param group; refreshed whenever the scheduler steps
        self.current_lr = self.optimizer.param_groups[0]['lr']
        # ModelCheckpoint / EarlyStopping instances, executed at the end of every epoch
        self.callbacks = callbacks

    def train_epoch(self, epoch):
        """Train for one epoch.

        Args:
            epoch: zero-based epoch index (only used for the progress-bar label).

        Returns:
            (running_avg_loss, accuracy): mean of the per-batch losses and the
            fraction of samples whose argmax prediction equals the target.
        """
        self.model.train()

        # running average loss
        accu_loss = 0.0
        running_avg_loss = 0.0
        # cumulative sample count / correct count used for accuracy
        num_total = 0.0
        accu_num_correct = 0.0
        accuracy = 0.0
        # tqdm shows the progress of the batch loop in real time
        with tqdm(total=len(self.train_loader), desc=f"Epoch {epoch+1} [Training..]", leave=True) as progress_bar:
            for batch_idx, (inputs, targets) in enumerate(self.train_loader):
                # use self.device (not a notebook-global `device`)
                inputs = inputs.to(self.device)
                targets = targets.to(self.device)

                # forward pass
                outputs = self.model(inputs)
                loss = self.loss_fn(outputs, targets)

                # backward pass
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                # cumulative loss and running average over the batches seen so far
                accu_loss += loss.item()
                running_avg_loss = accu_loss / (batch_idx + 1)

                # accuracy from cumulative correct predictions / cumulative samples
                num_correct = (outputs.argmax(-1) == targets).sum().item()
                num_total += inputs.shape[0]
                accu_num_correct += num_correct
                accuracy = accu_num_correct / num_total

                progress_bar.update(1)
                # refresh the postfix only every 20 batches and on the last batch
                if batch_idx % 20 == 0 or (batch_idx + 1) == progress_bar.total:
                    progress_bar.set_postfix({"Loss": running_avg_loss,
                                              "Accuracy": accuracy})

        # ReduceLROnPlateau needs the validation loss, so it is stepped in validate_epoch instead
        if (self.scheduler is not None) and (not isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)):
            self.scheduler.step()
            self.current_lr = self.scheduler.get_last_lr()[0]

        return running_avg_loss, accuracy

    def validate_epoch(self, epoch):
        """Evaluate on the validation loader without gradient tracking.

        Args:
            epoch: zero-based epoch index (only used for the progress-bar label).

        Returns:
            None when there is no (or an empty) validation loader, otherwise
            (running_avg_loss, accuracy) computed like in `train_epoch`.
        """
        if not self.val_loader:
            return None

        self.model.eval()

        # running average loss
        accu_loss = 0.0
        running_avg_loss = 0.0
        # cumulative sample count / correct count used for accuracy
        num_total = 0.0
        accu_num_correct = 0.0
        accuracy = 0.0
        with tqdm(total=len(self.val_loader), desc=f"Epoch {epoch+1} [Validating]", leave=True) as progress_bar:
            with torch.no_grad():
                for batch_idx, (inputs, targets) in enumerate(self.val_loader):
                    inputs = inputs.to(self.device)
                    targets = targets.to(self.device)

                    outputs = self.model(inputs)
                    loss = self.loss_fn(outputs, targets)

                    # cumulative loss and running average over the batches seen so far
                    accu_loss += loss.item()
                    running_avg_loss = accu_loss / (batch_idx + 1)

                    # accuracy from cumulative correct predictions / cumulative samples
                    num_correct = (outputs.argmax(-1) == targets).sum().item()
                    num_total += inputs.shape[0]
                    accu_num_correct += num_correct
                    accuracy = accu_num_correct / num_total

                    progress_bar.update(1)
                    # refresh the postfix only every 20 batches and on the last batch
                    if batch_idx % 20 == 0 or (batch_idx + 1) == progress_bar.total:
                        progress_bar.set_postfix({"Loss": running_avg_loss,
                                                  "Accuracy": accuracy})

        # ReduceLROnPlateau is driven by the epoch-level validation loss
        if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            self.scheduler.step(running_avg_loss)
            # Read the lr back from the optimizer (same source as in __init__):
            # get_last_lr() may not exist on ReduceLROnPlateau in older PyTorch releases.
            self.current_lr = self.optimizer.param_groups[0]['lr']

        return running_avg_loss, accuracy

    def fit(self, epochs):
        """Run up to `epochs` epochs of training (plus validation when available).

        Args:
            epochs: maximum number of epochs; EarlyStopping may end the loop sooner.

        Returns:
            dict with per-epoch lists under 'train_loss', 'train_acc', 'val_loss',
            'val_acc' and 'lr'. The validation entries are None for epochs that ran
            without a validation loader.
        """
        history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': [], 'lr': []}
        for epoch in range(epochs):
            train_loss, train_acc = self.train_epoch(epoch)

            # validate_epoch returns a bare None (not a tuple) when there is no
            # validation loader, so it must not be unpacked directly
            val_result = self.validate_epoch(epoch)
            has_validation = val_result is not None
            val_loss, val_acc = val_result if has_validation else (None, None)

            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f} Train Accuracy: {train_acc:.4f}",
                  f", Val Loss: {val_loss:.4f} Val Accuracy: {val_acc:.4f}" if val_loss is not None else "",
                  f", Current lr:{self.current_lr:.6f}")

            # record this epoch's results
            history['train_loss'].append(train_loss)
            history['train_acc'].append(train_acc)
            history['val_loss'].append(val_loss)
            history['val_acc'].append(val_acc)
            history['lr'].append(self.current_lr)

            # The callbacks monitor validation metrics, so they only run when
            # validation produced values; stop the loop if early stopping fires.
            if self.callbacks and has_validation:
                if self._execute_callbacks(self.callbacks, self.model, epoch, val_loss, val_acc):
                    break

        return history

    def _execute_callbacks(self, callbacks, model, epoch, val_loss, val_acc):
        """Run every ModelCheckpoint / EarlyStopping callback for the finished epoch.

        Each callback receives the metric named by its `monitor` attribute
        ('val_loss' or 'val_acc'); callbacks monitoring anything else are skipped.

        Returns:
            True if any EarlyStopping callback asked to stop training, else False.
        """
        monitored = {'val_loss': val_loss, 'val_acc': val_acc}
        is_early_stopped = False

        for callback in callbacks:
            if isinstance(callback, ModelCheckpoint):
                if callback.monitor in monitored:
                    callback.save(model, epoch, monitored[callback.monitor])
            if isinstance(callback, EarlyStopping):
                if callback.monitor in monitored:
                    # keep an earlier stop decision even if a later callback does not trigger
                    if callback.check_early_stop(monitored[callback.monitor]):
                        is_early_stopped = True

        return is_early_stopped

    def get_trained_model(self):
        """Return the model held by the trainer in its current (most recently trained) state."""
        return self.model


In [19]:
import torch
import torch.nn as nn
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Build a fresh ResNet-50 (weights='DEFAULT') with a 2-class head for this run.
model = create_resnet_model('resnet50', num_classes=2, weights='DEFAULT')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The optimizer receives model.parameters(), i.e. every layer is updated, not only the new fc head.
optimizer = Adam(model.parameters(), lr=0.001) # model.fc.parameters(), weight_decay=0.9
loss_fn = nn.CrossEntropyLoss()
  
# Optional plateau scheduler, left disabled for this run (the Trainer below gets scheduler=None).
# scheduler = ReduceLROnPlateau(
#             optimizer=optimizer, mode='min', factor=0.5, patience=5, threshold=0.01, min_lr=0.00001)

# Both callbacks monitor validation accuracy (mode='max'); the commented-out lines are the
# equivalent validation-loss variants.
checkpoint_cb = ModelCheckpoint('checkpoints', monitor='val_acc', mode='max', save_interval=1, verbose=1)
#checkpoint_cb = ModelCheckpoint('checkpoints', monitor='val_loss', mode='min', save_interval=1, verbose=1)
earlystop_cb = EarlyStopping(monitor='val_acc', mode='max', early_patience=5, verbose=1)
#earlystop_cb = EarlyStopping(monitor='val_loss', mode='min', early_patience=5, verbose=1)


# Pass None instead of the list to train without checkpointing / early stopping.
callbacks = [checkpoint_cb, earlystop_cb] # None
trainer = Trainer(model=model, loss_fn=loss_fn, optimizer=optimizer,
                  train_loader=tr_loader, val_loader=val_loader, scheduler=None, 
                  callbacks=callbacks,
                  device=device)
# Train for at most 30 epochs; EarlyStopping ends the loop after 5 consecutive epochs
# without a new best val_acc.
history = trainer.fit(30)

Epoch 1 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.58it/s, Loss=0.325, Accuracy=0.872]
Epoch 1 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.44it/s, Loss=0.281, Accuracy=0.871]


Epoch 1/30, Train Loss: 0.3248 Train Accuracy: 0.8716 , Val Loss: 0.2812 Val Accuracy: 0.8711 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_1_val_acc_0.8711.pt


Epoch 2 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.58it/s, Loss=0.227, Accuracy=0.899]
Epoch 2 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.52it/s, Loss=0.295, Accuracy=0.888]


Epoch 2/30, Train Loss: 0.2270 Train Accuracy: 0.8991 , Val Loss: 0.2951 Val Accuracy: 0.8876 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_2_val_acc_0.8876.pt


Epoch 3 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.60it/s, Loss=0.167, Accuracy=0.937]
Epoch 3 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.45it/s, Loss=0.288, Accuracy=0.89]


Epoch 3/30, Train Loss: 0.1666 Train Accuracy: 0.9365 , Val Loss: 0.2884 Val Accuracy: 0.8896 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_3_val_acc_0.8896.pt


Epoch 4 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.57it/s, Loss=0.165, Accuracy=0.939]
Epoch 4 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.34it/s, Loss=0.341, Accuracy=0.886]


Epoch 4/30, Train Loss: 0.1646 Train Accuracy: 0.9385 , Val Loss: 0.3405 Val Accuracy: 0.8861 , Current lr:0.001000
EarlyStopping counter: 1/5


Epoch 5 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.57it/s, Loss=0.182, Accuracy=0.936]
Epoch 5 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.66it/s, Loss=0.337, Accuracy=0.888]


Epoch 5/30, Train Loss: 0.1818 Train Accuracy: 0.9355 , Val Loss: 0.3374 Val Accuracy: 0.8876 , Current lr:0.001000
EarlyStopping counter: 2/5


Epoch 6 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.59it/s, Loss=0.199, Accuracy=0.923]
Epoch 6 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.59it/s, Loss=0.333, Accuracy=0.91]


Epoch 6/30, Train Loss: 0.1988 Train Accuracy: 0.9225 , Val Loss: 0.3330 Val Accuracy: 0.9100 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_6_val_acc_0.9100.pt


Epoch 7 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.58it/s, Loss=0.139, Accuracy=0.944]
Epoch 7 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.69it/s, Loss=0.495, Accuracy=0.839]


Epoch 7/30, Train Loss: 0.1393 Train Accuracy: 0.9435 , Val Loss: 0.4952 Val Accuracy: 0.8386 , Current lr:0.001000
EarlyStopping counter: 1/5


Epoch 8 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.56it/s, Loss=0.108, Accuracy=0.964]
Epoch 8 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.55it/s, Loss=0.31, Accuracy=0.923]


Epoch 8/30, Train Loss: 0.1082 Train Accuracy: 0.9635 , Val Loss: 0.3096 Val Accuracy: 0.9230 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_8_val_acc_0.9230.pt


Epoch 9 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.56it/s, Loss=0.0603, Accuracy=0.98]
Epoch 9 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.51it/s, Loss=0.34, Accuracy=0.915]


Epoch 9/30, Train Loss: 0.0603 Train Accuracy: 0.9795 , Val Loss: 0.3396 Val Accuracy: 0.9145 , Current lr:0.001000
EarlyStopping counter: 1/5


Epoch 10 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.59it/s, Loss=0.0909, Accuracy=0.969]
Epoch 10 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.57it/s, Loss=0.22, Accuracy=0.927]


Epoch 10/30, Train Loss: 0.0909 Train Accuracy: 0.9685 , Val Loss: 0.2205 Val Accuracy: 0.9270 , Current lr:0.001000
Saved model checkpoint at checkpoints/checkpoint_epoch_10_val_acc_0.9270.pt


Epoch 11 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.56it/s, Loss=0.102, Accuracy=0.969]
Epoch 11 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.54it/s, Loss=0.254, Accuracy=0.925]


Epoch 11/30, Train Loss: 0.1021 Train Accuracy: 0.9690 , Val Loss: 0.2541 Val Accuracy: 0.9245 , Current lr:0.001000
EarlyStopping counter: 1/5


Epoch 12 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.57it/s, Loss=0.0972, Accuracy=0.965]
Epoch 12 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.57it/s, Loss=1.36, Accuracy=0.842]


Epoch 12/30, Train Loss: 0.0972 Train Accuracy: 0.9650 , Val Loss: 1.3644 Val Accuracy: 0.8421 , Current lr:0.001000
EarlyStopping counter: 2/5


Epoch 13 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.58it/s, Loss=0.0822, Accuracy=0.972]
Epoch 13 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.59it/s, Loss=0.239, Accuracy=0.924]


Epoch 13/30, Train Loss: 0.0822 Train Accuracy: 0.9720 , Val Loss: 0.2391 Val Accuracy: 0.9240 , Current lr:0.001000
EarlyStopping counter: 3/5


Epoch 14 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.56it/s, Loss=0.0414, Accuracy=0.987]
Epoch 14 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.56it/s, Loss=0.219, Accuracy=0.922]


Epoch 14/30, Train Loss: 0.0414 Train Accuracy: 0.9865 , Val Loss: 0.2193 Val Accuracy: 0.9220 , Current lr:0.001000
EarlyStopping counter: 4/5


Epoch 15 [Training..]: 100%|██████████| 126/126 [00:11<00:00, 10.57it/s, Loss=0.0694, Accuracy=0.979]
Epoch 15 [Validating]: 100%|██████████| 63/63 [00:04<00:00, 14.57it/s, Loss=0.334, Accuracy=0.881]

Epoch 15/30, Train Loss: 0.0694 Train Accuracy: 0.9785 , Val Loss: 0.3344 Val Accuracy: 0.8806 , Current lr:0.001000
EarlyStopping counter: 5/5
Early stopping happens and train stops





In [20]:
# get_trained_model() returns the trainer's model as it is after the last completed epoch;
# Trainer.fit does not reload any saved checkpoint, so these are not necessarily the
# best-val_acc weights.
trained_model = trainer.get_trained_model()

# NOTE(review): Predictor is defined outside this section; evaluate() presumably returns
# test-set accuracy (the progress bar labels the value "Accuracy") - confirm.
predictor = Predictor(model=trained_model, device=device)
eval_metric = predictor.evaluate(test_loader)
print(f'test dataset evaluation:{eval_metric:.4f}')

[Evaluating]: 100%|██████████| 64/64 [00:04<00:00, 14.84it/s, Accuracy=0.893]

test dataset evaluation:0.8932





In [21]:
# Pick the checkpoint path that extract_epoch_num ranks highest.
# NOTE(review): checkpoint_files and extract_epoch_num are defined in an earlier cell and are
# not rebuilt here. The recorded output loads "checkpoint_epoch_5_loss_0.1924.pt", which does
# not match the "checkpoint_epoch_<N>_val_acc_<value>.pt" files saved by the training run above,
# so checkpoint_files looks stale - confirm and re-list the checkpoint directory before this cell.
latest_ckpt = max(checkpoint_files, key=extract_epoch_num)
print("Loading checkpoint:", latest_ckpt)

# Load state_dict from the latest checkpoint
state_dict = torch.load(latest_ckpt, weights_only=True)

# Create a model without pretrained weights
# (every parameter is overwritten by the checkpoint below, so pretrained weights are not needed)
new_model = create_resnet_model('resnet50', num_classes=2, weights=None)

# Load the parameters from checkpoint into the model
new_model.load_state_dict(state_dict)

Loading checkpoint: /kaggle/working/checkpoints/checkpoint_epoch_5_loss_0.1924.pt


<All keys matched successfully>

In [22]:
# Evaluate the model rebuilt from the saved checkpoint (new_model) on the test loader,
# for comparison with the in-memory model evaluated after training.
predictor = Predictor(model=new_model, device=device)
eval_metric = predictor.evaluate(test_loader)
print(f'test dataset evaluation:{eval_metric:.4f}')

[Evaluating]: 100%|██████████| 64/64 [00:04<00:00, 14.76it/s, Accuracy=0.913]

test dataset evaluation:0.9135



