# 0. Download Dataset

In [1]:
import sys
sys.path.append('../..')
from utils.gdrive_downloader import GDriveDownloader
from utils.memory_tracker import MemoryTracker, safe_to_device

root_dir = './data/scenes_cls'

In [2]:
gdrive_downloader = GDriveDownloader(cache_dir='./cache')
success, message = gdrive_downloader.download_and_extract(
    gdrive_url='https://drive.google.com/file/d/1ZUCuYDOe4VVbZvNVZovpquaRQqqJQ639/view?usp=drive_link',
    extract_dir=root_dir,
    keep_zip=False
)
if success:
    print(f'Success: {message}')
else:
    print(f'Error: {message}')

2024-12-17 01:05:35 - INFO - Downloading file to cache/temp_1734372335.zip
Downloading...
From (original): https://drive.google.com/uc?id=1ZUCuYDOe4VVbZvNVZovpquaRQqqJQ639
From (redirected): https://drive.google.com/uc?id=1ZUCuYDOe4VVbZvNVZovpquaRQqqJQ639&confirm=t&uuid=c08d510e-52ee-4780-8ec6-c35ae5c86976
To: /home/jiggle/personal/aio2024/hw/aio-2024-hw/module-6/14_12_2024_M06W02/cache/temp_1734372335.zip
100%|██████████| 255M/255M [00:03<00:00, 83.2MB/s] 
2024-12-17 01:06:37 - INFO - Extracting to data/scenes_cls
Extracting: 100%|██████████| 258M/258M [00:08<00:00, 29.1MB/s] 
2024-12-17 01:06:46 - INFO - Removed archive file: cache/temp_1734372335.zip


Success: Download and extraction completed successfully


In [3]:
import torch
import torch.nn as nn
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

2024-12-17 01:06:47 - INFO - NumExpr defaulting to 12 threads.


In [4]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed = 59
set_seed(seed)

In [5]:
import torch.backends.cudnn as cudnn
cudnn.benchmark = False
cudnn.deterministic = True
cudnn.enabled = False

# 1. Load data

In [6]:
data_dir = os.path.join(root_dir, 'scenes_classification')
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'val')

classes = {
    label_idx: class_name \
    for label_idx, class_name in enumerate(
        sorted(os.listdir(train_dir))
    )
}
classes

{0: 'buildings',
 1: 'forest',
 2: 'glacier',
 3: 'mountain',
 4: 'sea',
 5: 'street'}

In [7]:
X_train, y_train = [], []
X_test, y_test = [], []

for dataset_path in [train_dir, test_dir]:
    for label_idx, class_name in classes.items():
        class_dir = os.path.join(dataset_path, class_name)
        for img_filename in os.listdir(class_dir):
            img_path = os.path.join(class_dir, img_filename)

            if 'train' in dataset_path:
                X_train.append(img_path)
                y_train.append(label_idx)
            else:
                X_test.append(img_path)
                y_test.append(label_idx)


In [8]:
val_size = 0.2
is_shuffle = True

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=val_size,
    random_state=seed,
    shuffle=is_shuffle
)

# 2. Preprocessing

In [9]:
class ScenesDataset(Dataset):
    def __init__(
        self,
        X, y,
        transform=None
    ):
        self.transform = transform
        self.img_paths = X
        self.labels = y
    
    def __len__(self):
        return len(self.img_paths)
    
    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        img = Image.open(img_path).convert('RGB')

        if self.transform:
            img = self.transform(img)
        
        return img, self.labels[idx]

In [10]:
def transform(img, img_size=(224, 224)):
    img = img.resize(img_size)
    img = np.array(img)[..., :3]
    img = torch.tensor(img).permute(2, 0, 1).float()
    normalized_img = img / 255.0

    return normalized_img

In [11]:
# Define datasets object

train_dataset = ScenesDataset(
    X_train, y_train,
    transform=transform
)
val_dataset = ScenesDataset(
    X_val, y_val,
    transform=transform
)
test_dataset = ScenesDataset(
    X_test, y_test,
    transform=transform
)

In [12]:
# Define dataloader
train_batch_size = 16
test_batch_size = 8

train_loader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True
)
val_loader = DataLoader(
    val_dataset,
    batch_size=test_batch_size,
    shuffle=False
)
test_loader = DataLoader(
    test_dataset,
    batch_size=test_batch_size,
    shuffle=False
)

# 3. Build Models

![Densenet layout structure](public/images/densenet_structure_details.png)

![Densenet Layer](public/images//densenet_layer.png)

In [13]:
class BottleneckBlock(nn.Module):
    def __init__(self, in_channels, growth_rate):
        super(BottleneckBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(in_channels, 4 * growth_rate, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(4 * growth_rate)
        self.conv2 = nn.Conv2d(4 * growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)
        self.relu = nn.ReLU()

    def forward(self, x):
        res = x.clone().detach()
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv1(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = torch.cat([res, x], 1)

        return x

In [14]:
class DenseBlock(nn.Module):
    def __init__(self, num_layers, in_channels, growth_rate):
        super(DenseBlock, self).__init__()
        layers = []

        for i in range(num_layers):
            layers.append(BottleneckBlock(in_channels + i * growth_rate, growth_rate))
        
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        return self.block(x)

In [15]:
class DenseNet(nn.Module):
    def __init__(self, num_blocks, growth_rate, num_classes):
        super(DenseNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 2 * growth_rate, kernel_size=7, padding=3, stride=2, bias=False)
        self.bn1 = nn.BatchNorm2d(2 * growth_rate)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.dense_blocks = nn.ModuleList()
        in_channels = 2 * growth_rate
        for i, num_layers in enumerate(num_blocks):
            self.dense_blocks.append(DenseBlock(num_layers, in_channels, growth_rate))
            in_channels += num_layers * growth_rate

            if i != len(num_blocks) - 1:
                out_channels = in_channels // 2
                self.dense_blocks.append(nn.Sequential(
                    nn.BatchNorm2d(in_channels),
                    nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
                    nn.AvgPool2d(kernel_size=2, stride=2)
                ))
                in_channels = out_channels
        self.bn2 = nn.BatchNorm2d(in_channels)
        self.pool2 = nn.AvgPool2d(kernel_size=7)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(in_channels, num_classes)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.pool1(x)

        for block in self.dense_blocks:
            x = block(x)
        
        x = self.bn2(x)
        x = self.relu(x)
        x = self.pool2(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

In [16]:
n_classes = len(list(classes.keys()))
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = DenseNet(
    [6, 12, 24, 16],
    growth_rate=32,
    num_classes=n_classes
).to(device)

# 4. Evaludate and Train Model

In [17]:
import torch
from tqdm.auto import tqdm
from typing import Dict, List, Tuple
import time

save_model = './model'
if not os.path.exists(save_model):
    os.makedirs(save_model)


  from .autonotebook import tqdm as notebook_tqdm


In [18]:
def evaluate(
    model: torch.nn.Module,
    criterion: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    device: torch.device,
    desc: str = "Validating",
    position: int = 1,
    is_leaving=False
) -> Tuple[float, float]:
    model.eval()
    total_acc = total_loss = total_count = 0

    try:
        with torch.no_grad(), tqdm(
            dataloader,
            desc=desc,
            unit="sample",
            unit_scale=dataloader.batch_size,
            position=position,
            leave=is_leaving,
        ) as pbar:
            for batch_idx, (inputs, labels) in enumerate(pbar):
                MemoryTracker.clear_memory(model)

                try:
                    inputs = safe_to_device(inputs, device)
                    labels = safe_to_device(labels, device)

                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    total_loss += float(loss.detach().item() * labels.size(0))  # Weighted by batch size
                    _, predicted = torch.max(outputs.data, 1)
                    total_count += labels.size(0)
                    total_acc += (predicted == labels).sum().item()

                    del outputs, loss, predicted
                except RuntimeError as e:
                    print(f"\nError in validation batch {batch_idx}: {str(e)}")
                    MemoryTracker.clear_memory(model)
                    continue

                allocated, reserved = MemoryTracker.get_memory_stats()
                pbar.set_postfix({
                    'loss': f'{total_loss/max(1, total_count):.4f}',
                    'acc': f'{100.*total_acc/max(1, total_count):.2f}%',
                    'GPU': f'{allocated:.0f}MB',
                })

    except Exception as e:
        print(f"\nEvaluation error: {str(e)}")
        MemoryTracker.clear_memory(model)
        raise

    
    epoch_loss = total_loss / max(1, total_count)
    epoch_acc = total_acc / max(1, total_count)
    
    return epoch_acc, epoch_loss

In [19]:
def train(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    device: torch.device,
    desc: str = "Training",
    position: int = 1,
) -> Tuple[float, float]:
    model.train()
    total_loss = total_acc = total_count = 0

    try:
        with tqdm(
            dataloader, 
            desc=desc,
            unit="batch",
            total=len(dataloader),
            position=position, 
            leave=True,
        ) as pbar:
            for batch_idx, (inputs, labels) in enumerate(pbar):
                MemoryTracker.clear_memory(model)
                
                try:
                    inputs = safe_to_device(inputs, device)
                    labels = safe_to_device(labels, device)

                    predictions = model(inputs)
                    loss = criterion(predictions, labels)
                    loss.backward()
                    optimizer.step()

                    total_loss += float(loss.item() * labels.size(0))
                    total_acc += (predictions.argmax(1) == labels).sum().item()
                    total_count += labels.size(0)

                    del predictions, loss

                except RuntimeError as e:
                    print(f"\nError in training batch {batch_idx}: {str(e)}")
                    MemoryTracker.clear_memory(model)
                    optimizer.zero_grad()
                    continue

                allocated, reserved = MemoryTracker.get_memory_stats()
                pbar.set_postfix({
                    'loss': f'{total_loss/max(1, total_count):.4f}',
                    'acc': f'{100.*total_acc/max(1, total_count):.2f}%',
                    'GPU': f'{allocated:.0f}MB'
                })

    except Exception as e:
        print(f"\nTraining error: {str(e)}")
        MemoryTracker.clear_memory(model)
        raise

    epoch_loss = total_loss / max(1, total_count)
    epoch_acc = total_acc / max(1, total_count)
    
    return epoch_acc, epoch_loss

In [20]:
def fit(
    model: torch.nn.Module,
    criterion: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    train_loader: torch.utils.data.DataLoader,
    val_loader: torch.utils.data.DataLoader,
    device: torch.device,
    epochs: int,
    early_stopping_patience: int = None,
    scheduler = None,
) -> Dict[str, List[float]]:
    history = {
        'train_acc': [], 'train_loss': [],
        'val_acc': [], 'val_loss': [],
        'epoch_times': [], 'GPU': []
    }
    best_val_loss = float('inf')
    patience_counter = 0

    try:
        with tqdm(range(epochs), desc="Epochs", position=0, leave=True) as epoch_pbar:
            for epoch in epoch_pbar:
                MemoryTracker.clear_memory(model)
                
                try:
                    epoch_start = time.time()

                    train_acc, train_loss = train(
                        model, optimizer, criterion, train_loader, device,
                        desc=f"Epoch {epoch+1}/{epochs} [Train]",
                        position=0,
                    )

                    val_acc, val_loss = evaluate(
                        model, criterion, val_loader, device,
                        desc=f"Epoch {epoch+1}/{epochs} [Val]",
                        position=0,
                        is_leaving=True,
                    )

                    if scheduler is not None:
                        scheduler.step()

                    epoch_time = time.time() - epoch_start
                    allocated, _ = MemoryTracker.get_memory_stats()

                    history['train_acc'].append(train_acc)
                    history['train_loss'].append(train_loss)
                    history['val_acc'].append(val_acc)
                    history['val_loss'].append(val_loss)
                    history['epoch_times'].append(epoch_time)
                    history['GPU'].append(allocated)

                    if early_stopping_patience is not None:
                        if val_loss < best_val_loss:
                            best_val_loss = val_loss
                            patience_counter = 0
                            torch.save(model.state_dict(), save_model + '/resnet_weather_cls.pt')
                        else:
                            patience_counter += 1
                            if patience_counter >= early_stopping_patience:
                                print(f'\nEarly stopping triggered after {epoch + 1} epochs')
                                break

                except Exception as e:
                    print(f"\nError in epoch {epoch + 1}: {str(e)}")
                    MemoryTracker.clear_memory(model)
                    continue
    
    except Exception as e:
        print(f"\nTraining loop error: {str(e)}")
        MemoryTracker.clear_memory(model)

    return history

In [21]:
lr = 1e-4
epochs = 5

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=lr
)
history = fit(
    model,
    criterion,
    optimizer,
    train_loader,
    val_loader,
    device,
    epochs,
)

Epoch 1/5 [Train]: 100%|██████████| 702/702 [22:56<00:00,  1.96s/batch, loss=1.4108, acc=47.52%, GPU=77MB]
Epoch 1/5 [Val]: 100%|██████████| 2808/2808 [03:47<00:00, 12.37sample/s, loss=1.2681, acc=58.64%, GPU=74MB]
Epoch 2/5 [Train]: 100%|██████████| 702/702 [22:32<00:00,  1.93s/batch, loss=1.2861, acc=58.56%, GPU=77MB]
Epoch 2/5 [Val]: 100%|██████████| 2808/2808 [03:33<00:00, 13.18sample/s, loss=1.2547, acc=62.66%, GPU=74MB]
Epoch 3/5 [Train]:  40%|███▉      | 279/702 [07:11<10:54,  1.55s/batch, loss=1.3423, acc=58.13%, GPU=80MB]
Epochs:  40%|████      | 2/5 [1:00:01<1:30:02, 1800.76s/it]


KeyboardInterrupt: 

In [None]:
max_memory = max(history['GPU'])
print(f"Peak GPU memory usage: {max_memory:.2f} MB")

In [None]:
test_acc, test_loss = evaluate(
    model,
    criterion,
    test_loader,
    device
)
test_acc, test_loss