In [1]:
import torch
import torch.utils.data
from torchvision import datasets
from torchvision.transforms import v2
from training_utils.ring_mask_converter import process
from training_utils.utils import eval_map
from training_utils.model_builder import get_model
import json
from tqdm.notebook import tqdm
import gc
from training_utils.utils import get_nvidia_gpu_memory


with open('training_configs.json', 'r') as file:
    configs = json.load(file)

USE_GPU = True
dtype = torch.float32 

device = torch.device('cuda:0' if USE_GPU and torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

transforms = v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])

train_images_path = configs['train_images_path']
train_annotations_path = train_images_path + '/result.json'

val_images_path = configs['val_images_path']
val_annotations_path = val_images_path + '/result.json'

train_dataset = datasets.CocoDetection(train_images_path, train_annotations_path, transforms=transforms)
train_dataset = datasets.wrap_dataset_for_transforms_v2(train_dataset, target_keys=["boxes", "labels", "masks"])

val_dataset = datasets.CocoDetection(val_images_path, val_annotations_path, transforms=transforms)
val_dataset = datasets.wrap_dataset_for_transforms_v2(val_dataset, target_keys=["boxes", "labels", "masks"])

model_path = configs['model_path'] 
num_classes = len(['Seed','Interior','Endosperm','Void']) + 1 # add a class for background
model = get_model(num_classes, model_path=model_path).to(device) if configs['load_from_checkpoint'] else get_model(num_classes, model_path=None).to(device)

optimizer = torch.optim.Adamax(model.parameters(), lr=1e-4)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=configs['train_batch_size'], collate_fn=lambda batch: tuple(zip(*batch)),)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=configs['val_batch_size'], collate_fn=lambda batch: tuple(zip(*batch)),)

print_every = configs['print_every']

Using device: cuda:0
loading annotations into memory...
Done (t=12.81s)
creating index...
index created!
loading annotations into memory...
Done (t=3.92s)
creating index...
index created!


In [11]:
"""

optimizer = torch.optim.Adamax(model.parameters(), lr=1e-4)

images, targets = next(iter(train_loader))
model.train()

images, targets = process(images, targets, device)
loss_dict = model(images, targets)

loss = sum(tuple([loss for loss in loss_dict.values()]))
optimizer.zero_grad()
loss.backward()
"""
optimizer.step()
"""
losses.append(loss.item())


images_val, targets_val = next(iter(val_loader))
images_val, target_val = process(images_val, targets_val, device)
model.eval()
preds_val = model(images_val)
val_mAP = eval_map(preds_val, targets_val, device)

preds_train = model(images)
train_mAP = eval_map(preds_train, targets, device)


"""
pass

In [12]:
!nvidia-smi

Tue Mar 26 14:11:50 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A30                     Off | 00000000:01:00.0 Off |                   On |
| N/A   33C    P0              30W / 165W |   1874MiB / 24576MiB |     N/A      Default |
|                                         |                      |              Enabled |
+-----------------------------------------+----------------------+----------------------+

+------------------------------------------------------------------

 - STEP - TOTAL USAGE
 - Model Only - 316MiB
 - Process images and targets - 316MiB
 - Model Forward Pass - 1652MiB
 - Sum Losses - 1652MiB
 - Zero Optim - 1652MiB
 - Backward Pass - 1816MiB
 - Optim step - 1856MiB
 - Garbage Collect - 1856MiB
 - Loading val data - 1856MiB
 - Process val data - 1856MiB
 - Model eval mode - 1856MiB
 - Val Forward Pass - 1958MiB
 - eval mAP - 1958MiB
 - Train forward pass - 2650MiB
 - Garbage Collection - 2650MiB
 - Empty Cache - 2412MiB

In [2]:
import torch
from peft import LoraConfig, get_peft_model

def get_conv_layer_names(model):
    conv_layer_names = []
    for name, module in model.named_modules():
        if isinstance(module, (torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d)):
            conv_layer_names.append(name)
    return conv_layer_names

# Suppose 'model' is your pre-defined PyTorch model
conv_layers = get_conv_layer_names(model)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total Parameters without LoRA: {total_params}")
print(f"Trainable Parameters without LoRA: {trainable_params}")

# Define LoRA Configuration with convolutional layers as target modules
lora_config = LoraConfig(
    r=3,  # Rank of the adaptation
    lora_alpha=3,  # Scaling factor
    lora_dropout=0.1,  # Optional: Dropout rate for LoRA weights
    task_type="IMAGE_SEGMENTATION",  # Adjust according to your task
    target_modules=conv_layers  # Specify convolutional layers as target modules
)

# Apply LoRA to your model
lora_model = get_peft_model(model, lora_config)

total_params = sum(p.numel() for p in lora_model.parameters())
trainable_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)

print(f"Total Parameters with LoRA: {total_params}")
print(f"Trainable Parameters with LoRA: {trainable_params}")

Total Parameters without LoRA: 43938541
Trainable Parameters without LoRA: 43716141
Total Parameters with LoRA: 44262946
Trainable Parameters with LoRA: 324405


 - STEP - TOTAL USAGE
 - Model Only - 320MiB
 - Process images and targets - 320MiB
 - Model Forward Pass - 2992MiB
 - Backward Pass - 3016MiB
 - Optim step - 3016MiB
 - Forward pass of val - 3104MiB

In [19]:
from training_utils.utils import get_nvidia_gpu_memory

def run_one_pass(model, batch_size):
    printing = True
    training = True
    if printing:
        print_fn = lambda x: print(f"{x} {get_nvidia_gpu_memory()[0]['used']}")
    else:
        print_fn = lambda x: x
    if training:
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, collate_fn=lambda batch: tuple(zip(*batch)),)
        print_fn("Loader. Mem:")

        # Instead of having just one optimizer, we will have a `⁠ dict ⁠` of optimizers
        # for every parameter so we could reference them in our hook.
        optimizer_dict = {p: torch.optim.Adam([p], foreach=False) for p in model.parameters()}
        model.train()
        # Define our hook, which will call the optimizer `⁠ step() ⁠` and `⁠ zero_grad() ⁠`
        def optimizer_hook(parameter) -> None:
            optimizer_dict[parameter].step()
            optimizer_dict[parameter].zero_grad()
        
        # Register the hook onto every parameter
        for p in model.parameters():
            p.register_post_accumulate_grad_hook(optimizer_hook)

        # Now remember our previous `⁠ train() ⁠` function? Since the optimizer has been
        # fused into the backward, we can remove the optimizer step and zero_grad calls.
        def train(model):
            # create our fake image input: tensor shape is batch_size, channels, height, width
            images, targets = next(iter(train_loader))
            images, targets = process(images, targets, device)
            # call our forward and backward
            loss_dict = model.forward(fake_image)
            #loss.sum().backward()
            
            loss = sum(tuple([loss for loss in loss_dict.values()]))
            loss.backward()
            # optimizer update --> no longer needed!
            # optimizer.step()
            # optimizer.zero_grad()
    
        #optimizer = torch.optim.Adamax(model.parameters(), lr=1e-4)
        #print_fn("Optimizer. Mem:")
        
        model.train()
        #print_fn("Load Data. Mem:")
        #images, targets = process(images, targets, device)
        #print_fn("Process Data. Mem:")
        #loss_dict = model(images, targets)
        #print_fn("Forward Pass. Mem:")
        #loss = sum(tuple([loss for loss in loss_dict.values()]))
        #optimizer.zero_grad()
        #loss.backward()
        #print_fn("Backward Pass. Mem:")
        #optimizer.step()
        #print_fn("Step. Mem:")
        train(model)
    else:
        with torch.no_grad():
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, collate_fn=lambda batch: tuple(zip(*batch)),)
            print_fn("Loader. Mem:")
            images, targets = next(iter(train_loader))
            model.eval()
            print_fn("Load Data. Mem:")
            images, targets = process(images, targets, device)
            print_fn("Process Data. Mem:")
            model(images)
            print_fn("Forward Pass. Mem:")


        
mems = []
sizes = []
import gc
import time
for batch_size in range(1, 5):
    gc.collect()
    torch.cuda.empty_cache()
    start_mem = get_nvidia_gpu_memory()[0]['used']
    start_time = time.time()
    run_one_pass(model, batch_size)
    end_time = time.time()
    end_mem = get_nvidia_gpu_memory()[0]['used']
    gc.collect()
    torch.cuda.empty_cache()
    print(f"Batch Size: {batch_size} | Start Memory: {start_mem} | End Memory: {end_mem} | Time: {end_time - start_time}")
    sizes.append(batch_size)
    mems.append((end_mem - start_mem)/1024)
import matplotlib.pyplot as plt
plt.plot(sizes, mems, 'x',label='No Lora')
plt.xlabel('Batch Size - Images')
plt.ylabel('Memory - GiB')


mems = []
sizes = []
for batch_size in range(1, 5):
    gc.collect()
    torch.cuda.empty_cache()
    start_mem = get_nvidia_gpu_memory()[0]['used']
    start_time = time.time()
    run_one_pass(lora_model, batch_size)
    end_time = time.time()
    end_mem = get_nvidia_gpu_memory()[0]['used']
    gc.collect()
    torch.cuda.empty_cache()
    print(f"Batch Size: {batch_size} | Start Memory: {start_mem} | End Memory: {end_mem} | Time: {end_time - start_time}")
    sizes.append(batch_size)
    mems.append((end_mem - start_mem)/1024)
import matplotlib.pyplot as plt
plt.plot(sizes, mems, 'o', label='Lora')
plt.legend()

Loader. Mem: 528


RuntimeError: cannot register a hook on a tensor that doesn't require gradient

In [6]:
import torch

def calculate_memory(obj):
    """
    Recursively calculates the total memory required to store all tensors
    in a nested structure consisting of lists, dicts, or tuples.

    Args:
    - obj: The nested structure containing tensors.

    Returns:
    - The total memory required to store the tensors, in bytes.
    """
    if torch.is_tensor(obj):
        return obj.nelement() * obj.element_size()
    elif isinstance(obj, dict):
        return sum(calculate_memory(v) for v in obj.values())
    elif isinstance(obj, (list, tuple)):
        return sum(calculate_memory(item) for item in obj)
    else:
        return 0  # Non-tensor objects are not counted


Total memory required: 40520 bytes


In [None]:
"""

optimizer = torch.optim.Adamax(model.parameters(), lr=1e-4)

images, targets = next(iter(train_loader))
model.train()

images, targets = process(images, targets, device)
loss_dict = model(images, targets)

loss = sum(tuple([loss for loss in loss_dict.values()]))
optimizer.zero_grad()
loss.backward()
"""
optimizer.step()
"""
losses.append(loss.item())


images_val, targets_val = next(iter(val_loader))
images_val, target_val = process(images_val, targets_val, device)
model.eval()
preds_val = model(images_val)
val_mAP = eval_map(preds_val, targets_val, device)

preds_train = model(images)
train_mAP = eval_map(preds_train, targets, device)


"""
pass