In [1]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False


In [2]:
if IN_COLAB:
    !rm -rf *
    !git clone --branch multi-thread-optim https://github.com/felixk525/mai_project1_optimization.git

In [3]:
if IN_COLAB:
    !pip3 install -r mai_project1_optimization/requirements.txt

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import models, transforms
from torchvision.models import *
from plotly import express as px
from collections import Counter
import multiprocessing
import numpy as np
import random
import time
import io

if(IN_COLAB):
    from mai_project1_optimization.modules.dataset import IntelImageClassificationDataset
    from mai_project1_optimization.modules.utility import NotebookPlotter, InferenceSession, Evaluator, ISO_time
    from mai_project1_optimization.modules.trainer import Trainer
else:
    from modules.dataset import IntelImageClassificationDataset
    from modules.utility import NotebookPlotter, InferenceSession, Evaluator, ISO_time
    from modules.trainer import Trainer

torch.manual_seed(1)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

def set_seed(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True  # for reproducibility
    torch.backends.cudnn.benchmark = False

# Removed support for Tensor Units
# torch.backends.cudnn.allow_tf32 = True
# torch.backends.cuda.matmul.allow_tf32 = True

set_seed(1)

Matplotlib created a temporary cache directory at C:\Users\krahf\AppData\Local\Temp\matplotlib-jle7cujr because the default path (C:\Users\krahf\.matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


cuda


https://www.kaggle.com/datasets/puneet6060/intel-image-classification

In [5]:
# labels, values = zip(*Counter([item[1] for item in dataset.train_dataset]).items())
# fig = px.bar(x=labels, y=values, labels={'x': 'Categories', 'y': 'Counts'}, title='Distribution of Classes')
# fig.show()

| n | label |
| --- | --- |
| 0 | buildings |
| 1 | forest |
| 2 | glacier |
| 3 | mountain |
| 4 | sea |
| 5 | street |

NotebookPlotter.plot_dataset_item_interactive(dataset.train_dataset)

In [6]:
freezeLayer = False

dataset = IntelImageClassificationDataset(resize=(150,150))
    
# SqueezeNet 1.1
model = models.squeezenet1_1(weights=SqueezeNet1_1_Weights.DEFAULT)
num_features = model.classifier[1].in_channels
kernel_size = model.classifier[1].kernel_size
if(freezeLayer):
    for param in model.parameters():
        param.requires_grad = False
model.classifier[1] = nn.Conv2d(num_features, 6, kernel_size)
    

In [None]:
# model.load_state_dict(torch.load(f"checkpoints/.pt"))
possible_workers = multiprocessing.cpu_count()
print(f"Number of CPU cores: {possible_workers}, using 2") # Due to tests - overhead or I/O / dataset to small for it to matter maybe
max_worker = possible_workers // 2
possible_workers = 2 #possible_workers // 2


profiler_config_1 = {
    "schedule": torch.profiler.schedule(wait=5, warmup=5, active=10, repeat=1),
    "activities": [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    "log_dir": "runs/profilermthread/2process",#torch.profiler.tensorboard_trace_handler("runs/profilermthread/3process"),
    "record_shapes": True,
    "profile_memory": True,
    "with_stack": True
}

profiler_config_2 = {
    "schedule": torch.profiler.schedule(wait=5, warmup=5, active=10, repeat=1),
    "activities": [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    "log_dir": "runs/profilermthread/1process",#torch.profiler.tensorboard_trace_handler("runs/profilermthread/1process"),
    "record_shapes": True,
    "profile_memory": True,
    "with_stack": True
}

profiler_config_3 = {
    "schedule": torch.profiler.schedule(wait=5, warmup=5, active=10, repeat=1),
    "activities": [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    "log_dir": "runs/profilermthread/nprocess",#torch.profiler.tensorboard_trace_handler("runs/profilermthread/1process"),
    "record_shapes": True,
    "profile_memory": True,
    "with_stack": True
}

if __name__ == '__main__': # Not necessary for this anymore
    pmodel = model
    pmodel2 = model
    
    dataloader = DataLoader(dataset.train_dataset, batch_size=24, shuffle=True, num_workers=possible_workers, pin_memory=True, prefetch_factor= 4, persistent_workers=True)
    trainer = Trainer(model=pmodel, lr=0.001)
    start_time = time.time()
    trainer.train(dataloader, epochs=10, profiler_config=profiler_config_1)
    duration = time.time() - start_time
    print(f"Training with {possible_workers} workers took {duration:.2f} seconds.\n")

    dataloader = DataLoader(dataset.train_dataset, batch_size=24, shuffle=True, pin_memory=True)
    trainer = Trainer(model=model, lr=0.001)
    start_time = time.time()
    trainer.train(dataloader, epochs=10, profiler_config=profiler_config_2)
    duration = time.time() - start_time
    print(f"Training with single-threaded loading took {duration:.2f} seconds.")

    if max_worker > 2:
        dataloader = DataLoader(dataset.train_dataset, batch_size=24, shuffle=True, num_workers=max_worker, pin_memory=True, prefetch_factor= 4, persistent_workers=True)
        trainer = Trainer(model=pmodel2, lr=0.001)
        start_time = time.time()
        trainer.train(dataloader, epochs=10, profiler_config=profiler_config_3)
        duration = time.time() - start_time
        print(f"Training with {max_worker} workers took {duration:.2f} seconds.")

Number of CPU cores: 12, using 2


  0%|          | 0/10 [00:00<?, ?it/s]

Training with 2 workers took 75.73 seconds.



  0%|          | 0/10 [00:00<?, ?it/s]

Training with single-threaded loading took 156.77 seconds.


  0%|          | 0/10 [00:00<?, ?it/s]

Training with 6 workers took 88.90 seconds.


In [8]:
session = InferenceSession(model)
output = session(torch.stack(tuple(item[0] for item in dataset.test_dataset)))
Evaluator.acc(output, torch.tensor(tuple(item[1] for item in dataset.test_dataset))).item()


0.8858695030212402

In [9]:
# torch.save(model.state_dict(), f"checkpoints/{model.__class__.__name__}.pt")

## Initial Results for Model Selection

| model | accuracy | size |
| --- | --- | --- |
| ResNet18 | 0.87 | 44.7 MB |
| ResNet34 | 0.88 | 83.3 MB |
| MobileNet V2 | 0.91 | 13.6 MB |
| MobileNet V3 small | 0.90 | 9.8 MB |
| VGG19 | 0.83 | 548.1 MB |
| SqueezeNet 1.0 | 0.89 | 4.8 MB |
| DenseNet | 0.90 | 30.8 MB |
| EfficientNet B0 | 0.92 | 20.5 MB |
| ViT-b/16 | 0.73 | 330.3 MB |