# 0. Download Dataset

In [1]:
import sys
sys.path.append('../..')
from utils.gdrive_downloader import GDriveDownloader
from utils.memory_tracker import MemoryTracker, safe_to_device

root_dir = './data/weather_cls'

In [2]:
# gdrive_downloader = GDriveDownloader(cache_dir='./cache')
# success, message = gdrive_downloader.download_and_extract(
#     gdrive_url='https://drive.google.com/file/d/1fnJMMw0LvDgl-GS4FTou5qAgLxOE2KQ0/view?usp=drive_link',
#     extract_dir=root_dir,
#     keep_zip=False
# )
# if success:
#     print(f'Success: {message}')
# else:
#     print(f'Error: {message}')

In [3]:
import torch
import torch.nn as nn
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [4]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed = 59
set_seed(seed)

In [5]:
import torch.backends.cudnn as cudnn
cudnn.benchmark = False
cudnn.deterministic = True
cudnn.enabled = False

# 1. Load data

In [6]:
data_dir = os.path.join(root_dir, 'weather-dataset/dataset')
img_paths = []
labels = []
classes = {
    label_idx: class_name \
    for label_idx, class_name in enumerate(
        sorted(os.listdir(data_dir))
    )
}
classes

{0: 'dew',
 1: 'fogsmog',
 2: 'frost',
 3: 'glaze',
 4: 'hail',
 5: 'lightning',
 6: 'rain',
 7: 'rainbow',
 8: 'rime',
 9: 'sandstorm',
 10: 'snow'}

In [7]:
for label_idx, class_name in classes.items():
    class_dir = os.path.join(data_dir, class_name)
    for img_filename in os.listdir(class_dir):
        img_path = os.path.join(class_dir, img_filename)
        img_paths.append(img_path)
        labels.append(label_idx)

In [8]:
# Split train, val, test
val_size = 0.2
test_size = 0.125
is_shuffle = True

X_train, X_val, y_train, y_val = train_test_split(
    img_paths, labels,
    test_size=val_size,
    random_state=seed,
    shuffle=is_shuffle
)

X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=test_size,
    random_state=seed,
    shuffle=is_shuffle
)

# 2. Preprocessing

In [9]:
class WeatherDataset(Dataset):
    def __init__(
        self,
        X, y,
        transform=None,
    ):
        self.transform = transform
        self.img_paths = X
        self.labels = y

    def __len__(self):
        return len(self.img_paths)
    
    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        img = Image.open(img_path).convert('RGB')

        if self.transform:
            img = self.transform(img)

        return img, self.labels[idx]

In [10]:
def transform(img, img_size=(224, 224)):
    img = img.resize(img_size)
    img = np.array(img)[..., :3]
    img = torch.tensor(img).permute(2, 0, 1).float()
    normalized_img = img / 255.0

    return normalized_img

In [11]:
# Define datasets object

train_dataset = WeatherDataset(
    X_train, y_train,
    transform=transform
)
val_dataset = WeatherDataset(
    X_val, y_val,
    transform=transform
)
test_dataset = WeatherDataset(
    X_test, y_test,
    transform=transform
)

In [12]:
# Define dataloader
train_batch_size = 16
test_batch_size = 8

train_loader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True
)
val_loader = DataLoader(
    val_dataset,
    batch_size=test_batch_size,
    shuffle=False
)
test_loader = DataLoader(
    test_dataset,
    batch_size=test_batch_size,
    shuffle=False
)

# 3. Build Models

![Resnet layer structure](public/images/resnet_structure.png)

![Residual Block](public/images/residual_connection.png)

In [13]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.batch_norm1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.batch_norm2 = nn.BatchNorm2d(out_channels)

        self.downsample = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels)
            )
        self.relu = nn.ReLU()
    
    def forward(self, x):
        shortcut = x.clone()
        x = self.conv1(x)
        x = self.batch_norm1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.batch_norm2(x)
        x += self.downsample(shortcut)
        x = self.relu(x)

        return x

In [14]:
class ResNet(nn.Module):
    def __init__(self, residual_block, n_blocks_lst, n_classes):
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.batch_norm1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.conv2 = self.create_layer(residual_block, 64, 64, n_blocks_lst[0], 1)
        self.conv3 = self.create_layer(residual_block, 64, 128, n_blocks_lst[1], 2)
        self.conv4 = self.create_layer(residual_block, 128, 256, n_blocks_lst[2], 2)
        self.conv5 = self.create_layer(residual_block, 256, 512, n_blocks_lst[3], 2)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(512, n_classes)

    def create_layer(self, residual_block, in_channels, out_channels, n_blocks, stride):
        blocks = []
        first_block = residual_block(in_channels, out_channels, stride)
        blocks.append(first_block)

        for _ in range(1, n_blocks):
            block = residual_block(out_channels, out_channels, stride)
            blocks.append(block)

        block_sequential = nn.Sequential(*blocks)
        return block_sequential

    def forward(self, x):
        x = self.conv1(x)
        x = self.batch_norm1(x)
        x = self.maxpool(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.avgpool(x)
        x = self.flatten(x)
        x = self.fc1(x)

        return x

In [15]:
n_classes = len(list(classes.keys()))
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = ResNet(
    residual_block=ResidualBlock,
    n_blocks_lst=[2, 2, 2, 2],
    n_classes=n_classes
).to(device)

# 4. Evaluate and Train Model

In [16]:
import torch
from tqdm.auto import tqdm
from typing import Dict, List, Tuple
import time

save_model = './model'
if not os.path.exists(save_model):
    os.makedirs(save_model)


  from .autonotebook import tqdm as notebook_tqdm


In [17]:
def evaluate(
    model: torch.nn.Module,
    criterion: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    device: torch.device,
    desc: str = "Validating",
    position: int = 1,
    is_leaving=False
) -> Tuple[float, float]:
    model.eval()
    total_acc = total_loss = total_count = 0

    try:
        with torch.no_grad(), tqdm(
            dataloader,
            desc=desc,
            unit="sample",
            unit_scale=dataloader.batch_size,
            position=position,
            leave=is_leaving,
        ) as pbar:
            for batch_idx, (inputs, labels) in enumerate(pbar):
                MemoryTracker.clear_memory(model)

                try:
                    inputs = safe_to_device(inputs, device)
                    labels = safe_to_device(labels, device)

                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    total_loss += float(loss.detach().item() * labels.size(0))  # Weighted by batch size
                    _, predicted = torch.max(outputs.data, 1)
                    total_count += labels.size(0)
                    total_acc += (predicted == labels).sum().item()

                    del outputs, loss, predicted
                except RuntimeError as e:
                    print(f"\nError in validation batch {batch_idx}: {str(e)}")
                    MemoryTracker.clear_memory(model)
                    continue

                allocated, reserved = MemoryTracker.get_memory_stats()
                pbar.set_postfix({
                    'loss': f'{total_loss/max(1, total_count):.4f}',
                    'acc': f'{100.*total_acc/max(1, total_count):.2f}%',
                    'GPU': f'{allocated:.0f}MB',
                })

    except Exception as e:
        print(f"\nEvaluation error: {str(e)}")
        MemoryTracker.clear_memory(model)
        raise

    
    epoch_loss = total_loss / max(1, total_count)
    epoch_acc = total_acc / max(1, total_count)
    
    return epoch_acc, epoch_loss

In [18]:
def train(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    device: torch.device,
    desc: str = "Training",
    position: int = 1,
) -> Tuple[float, float]:
    model.train()
    total_loss = total_acc = total_count = 0

    try:
        with tqdm(
            dataloader, 
            desc=desc,
            unit="batch",
            total=len(dataloader),
            position=position, 
            leave=True,
        ) as pbar:
            for batch_idx, (inputs, labels) in enumerate(pbar):
                MemoryTracker.clear_memory(model)
                
                try:
                    inputs = safe_to_device(inputs, device)
                    labels = safe_to_device(labels, device)

                    predictions = model(inputs)
                    loss = criterion(predictions, labels)
                    loss.backward()
                    optimizer.step()

                    total_loss += float(loss.item() * labels.size(0))
                    total_acc += (predictions.argmax(1) == labels).sum().item()
                    total_count += labels.size(0)

                    del predictions, loss

                except RuntimeError as e:
                    print(f"\nError in training batch {batch_idx}: {str(e)}")
                    MemoryTracker.clear_memory(model)
                    optimizer.zero_grad()
                    continue

                allocated, reserved = MemoryTracker.get_memory_stats()
                pbar.set_postfix({
                    'loss': f'{total_loss/max(1, total_count):.4f}',
                    'acc': f'{100.*total_acc/max(1, total_count):.2f}%',
                    'GPU': f'{allocated:.0f}MB'
                })

    except Exception as e:
        print(f"\nTraining error: {str(e)}")
        MemoryTracker.clear_memory(model)
        raise

    epoch_loss = total_loss / max(1, total_count)
    epoch_acc = total_acc / max(1, total_count)
    
    return epoch_acc, epoch_loss

In [19]:
def fit(
    model: torch.nn.Module,
    criterion: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    train_loader: torch.utils.data.DataLoader,
    val_loader: torch.utils.data.DataLoader,
    device: torch.device,
    epochs: int,
    early_stopping_patience: int = None,
    scheduler = None,
) -> Dict[str, List[float]]:
    history = {
        'train_acc': [], 'train_loss': [],
        'val_acc': [], 'val_loss': [],
        'epoch_times': [], 'GPU': []
    }
    best_val_loss = float('inf')
    patience_counter = 0

    try:
        with tqdm(range(epochs), desc="Epochs", position=0, leave=True) as epoch_pbar:
            for epoch in epoch_pbar:
                MemoryTracker.clear_memory(model)
                
                try:
                    epoch_start = time.time()

                    train_acc, train_loss = train(
                        model, optimizer, criterion, train_loader, device,
                        desc=f"Epoch {epoch+1}/{epochs} [Train]",
                        position=0,
                    )

                    val_acc, val_loss = evaluate(
                        model, criterion, val_loader, device,
                        desc=f"Epoch {epoch+1}/{epochs} [Val]",
                        position=0,
                        is_leaving=True,
                    )

                    if scheduler is not None:
                        scheduler.step()

                    epoch_time = time.time() - epoch_start
                    allocated, _ = MemoryTracker.get_memory_stats()

                    history['train_acc'].append(train_acc)
                    history['train_loss'].append(train_loss)
                    history['val_acc'].append(val_acc)
                    history['val_loss'].append(val_loss)
                    history['epoch_times'].append(epoch_time)
                    history['GPU'].append(allocated)

                    if early_stopping_patience is not None:
                        if val_loss < best_val_loss:
                            best_val_loss = val_loss
                            patience_counter = 0
                            torch.save(model.state_dict(), save_model + '/resnet_weather_cls.pt')
                        else:
                            patience_counter += 1
                            if patience_counter >= early_stopping_patience:
                                print(f'\nEarly stopping triggered after {epoch + 1} epochs')
                                break

                except Exception as e:
                    print(f"\nError in epoch {epoch + 1}: {str(e)}")
                    MemoryTracker.clear_memory(model)
                    continue
    
    except Exception as e:
        print(f"\nTraining loop error: {str(e)}")
        MemoryTracker.clear_memory(model)

    return history

In [20]:
lr = 1e-4
epochs = 5

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=lr
)
history = fit(
    model,
    criterion,
    optimizer,
    train_loader,
    val_loader,
    device,
    epochs,
)

Epoch 1/5 [Train]: 100%|██████████| 301/301 [06:54<00:00,  1.38s/batch, loss=1.6644, acc=44.92%, GPU=106MB]
Epoch 1/5 [Val]: 100%|██████████| 1376/1376 [03:00<00:00,  7.62sample/s, loss=1.4752, acc=54.99%, GPU=108MB]
Epoch 2/5 [Train]: 100%|██████████| 301/301 [09:37<00:00,  1.92s/batch, loss=1.4491, acc=51.90%, GPU=106MB]
Epoch 2/5 [Val]: 100%|██████████| 1376/1376 [02:40<00:00,  8.56sample/s, loss=1.4757, acc=55.50%, GPU=108MB]
Epoch 3/5 [Train]: 100%|██████████| 301/301 [06:55<00:00,  1.38s/batch, loss=1.4023, acc=53.79%, GPU=106MB]
Epoch 3/5 [Val]: 100%|██████████| 1376/1376 [02:37<00:00,  8.75sample/s, loss=1.3960, acc=55.21%, GPU=108MB]
Epoch 4/5 [Train]: 100%|██████████| 301/301 [09:16<00:00,  1.85s/batch, loss=1.4095, acc=54.21%, GPU=106MB]
Epoch 4/5 [Val]: 100%|██████████| 1376/1376 [02:45<00:00,  8.34sample/s, loss=1.2939, acc=57.90%, GPU=108MB]
Epoch 5/5 [Train]: 100%|██████████| 301/301 [08:29<00:00,  1.69s/batch, loss=1.3018, acc=56.68%, GPU=106MB]
Epoch 5/5 [Val]: 100%|██

In [23]:
max_memory = max(history['GPU'])
print(f"Peak GPU memory usage: {max_memory:.2f} MB")

Peak GPU memory usage: 104.69 MB


In [22]:
test_acc, test_loss = evaluate(
    model,
    criterion,
    test_loader,
    device
)
test_acc, test_loss



(0.5895196506550219, 1.200914534071821)