In [1]:
from pathlib import Path
from fastai.vision.all import *
import gc

## Memory and gradient accumulation

In [2]:
path = Path.cwd()
datapath = path/'data'

In [3]:
tst_files = get_image_files(datapath/'test_images').sorted()

In [4]:
# Find the category with the least images for fast testing
df = pd.read_csv(datapath/'train.csv')
df.label.value_counts()

label
normal                      1764
blast                       1738
hispa                       1594
dead_heart                  1442
tungro                      1088
brown_spot                   965
downy_mildew                 620
bacterial_leaf_blight        479
bacterial_leaf_streak        380
bacterial_panicle_blight     337
Name: count, dtype: int64

In [9]:
trn_path = datapath/'train_images'/'bacterial_panicle_blight'

In [5]:
# bs=64//accum : we set the batch size based on 64 int div (//) accum(ulation)
# But the smaller the batch size, the more volatility, and you need different learning rate
# So we use Gradient Accumulation, where we add all the coefficients without zeroing them out
# until the batch size reaches the one we want,
# and then we subtract them using the lr and zeroing them out
# see next cell example

def train(arch, size, item=Resize(480, method='squish'), accum=1, finetune=True, epochs=12):
    dls = ImageDataLoaders.from_folder(trn_path, valid_pct=0.2, item_tfms=item,
        batch_tfms=aug_transforms(size=size, min_scale=0.75), bs=64//accum)
    cbs = GradientAccumulation(64) if accum else [] # This is the call back that the lerner will use
    learn = vision_learner(dls, arch, metrics=error_rate, cbs=cbs).to_fp16() # see cbs=cbs
    if finetune:
        learn.fine_tune(epochs, 0.01)
        return learn.tta(dl=dls.test_dl(tst_files))
    else:
        learn.unfreeze()
        learn.fit_one_cycle(epochs, 0.01)

In [None]:
# Example of how Gradient Accumulation works (do not run here)
# Instead of doing
for x, y in dl:
    calc_loss(coeffs, x, y).backward()
    coeffs.data.sub_(coeffs.grad * lr)
    coeffs.grad.zero_()

# We do
count = 0 # track count of items since last weight update
for x, y in dl: # update count based on this minibatch size
    count += len(x)
    calc_loss(coeffs, x, y).backward()
    if count > 64: # count is > accumulation target, do weight update
        coeffs.data.sub_(coeffs.grad * lr)
        coeffs.grad.zero_()
        count = 0 # reset count

In [19]:
# accum=1 means that the weights will be updated for the whole mini batch
train('convnext_small_in22k', 128, epochs=1, accum=1, finetune=False)

  model = create_fn(


epoch,train_loss,valid_loss,error_rate,time
0,0.0,0.0,0.0,00:03


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


In [21]:
# cuda list_gpu_processes did not work on my PC, so I used nvidia-smi
# Here we are trying to keep GPU memory usage under a specific limit by changing accum
import subprocess

def list_gpu_processes_with_nvidia_smi():
    try:
        # Run the nvidia-smi command to get full output
        result = subprocess.run(
            ['nvidia-smi'], 
            stdout=subprocess.PIPE, 
            stderr=subprocess.PIPE
        )
        
        # Check if the command was successful
        if result.returncode != 0:
            print("Error executing nvidia-smi command:")
            print(result.stderr.decode('utf-8'))
            return

        # Decode the output
        output = result.stdout.decode('utf-8')
        
        # Print the raw output to debug
        print(output)
        
        # Parse the output manually to extract relevant details
        lines = output.split('\n')
        process_section = False
        processes = []
        
        for line in lines:
            if 'Processes:' in line:
                process_section = True
                continue
            
            if process_section and '===== ' in line:
                break  # End of the processes section
            
            if process_section and line.strip():
                processes.append(line.strip())
        
        if processes:
            print("GPU Processes:")
            for process in processes:
                print(process)
        else:
            print("No GPU processes found.")
            
    except Exception as e:
        print(f"An error occurred: {e}")


In [6]:
def list_gpu_memory_usage():
    try:
        # Run the nvidia-smi command to get full output
        result = subprocess.run(
            ['nvidia-smi'], 
            stdout=subprocess.PIPE, 
            stderr=subprocess.PIPE
        )
        
        # Check if the command was successful
        if result.returncode != 0:
            print("Error executing nvidia-smi command:")
            print(result.stderr.decode('utf-8'))
            return

        # Decode the output
        output = result.stdout.decode('utf-8')
        
        # Print the raw output to debug
        # print(output)
        
        # Parse the output manually to extract the memory usage section
        lines = output.split('\n')
        memory_usage_section = False
        memory_usage_lines = []

        for line in lines:
            if 'GPU' in line and 'Memory-Usage' in line:
                memory_usage_section = True

            if memory_usage_section:
                memory_usage_lines.append(line)
                if '+------------------------+' in line:
                    break
        
        if memory_usage_lines:
            for line in memory_usage_lines:
                print(line)
        else:
            print("Memory usage information not found.")
            
    except Exception as e:
        print(f"An error occurred: {e}")

# Execute the function
list_gpu_memory_usage()


An error occurred: name 'subprocess' is not defined


In [7]:
def report_gpu():
    # CUDA returned an error here, used the nvidia-smi version
    # print(torch.cuda.list_gpu_processes())
    list_gpu_memory_usage()
    gc.collect()
    torch.cuda.empty_cache()

In [27]:
report_gpu()

| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070      WDDM  |   00000000:07:00.0  On |                  N/A |
| 51%   30C    P8              8W /  200W |     904MiB /  12282MiB |      4%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+


In [28]:
# accum=2 means that the weights will be updated for every 2 half sized mini batches
train('convnext_small_in22k', 128, epochs=1, accum=2, finetune=False)
report_gpu()

epoch,train_loss,valid_loss,error_rate,time
0,0.0,0.0,0.0,00:03


| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070      WDDM  |   00000000:07:00.0  On |                  N/A |
| 52%   31C    P3             24W /  200W |    2763MiB /  12282MiB |     16%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+


In [48]:
train('convnext_small_in22k', 128, epochs=1, accum=4, finetune=False)
report_gpu()

epoch,train_loss,valid_loss,error_rate,time
0,0.0,0.0,0.0,00:05


| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070      WDDM  |   00000000:07:00.0  On |                  N/A |
| 53%   33C    P2             45W /  200W |    8689MiB /  12282MiB |     62%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+


## Checking memory use

In [30]:
train('convnext_large_in22k', 224, epochs=1, accum=2, finetune=False)
report_gpu()

  model = create_fn(


model.safetensors:   0%|          | 0.00/919M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


epoch,train_loss,valid_loss,error_rate,time
0,0.0,0.0,0.0,00:04


| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070      WDDM  |   00000000:07:00.0  On |                  N/A |
| 54%   36C    P2             93W /  200W |   10648MiB /  12282MiB |     44%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+


In [31]:
train('convnext_large_in22k', (320,240), epochs=1, accum=2, finetune=False)
report_gpu()

epoch,train_loss,valid_loss,error_rate,time
0,0.0,0.0,0.0,00:23


| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070      WDDM  |   00000000:07:00.0  On |                  N/A |
| 55%   37C    P2             59W /  200W |   11957MiB /  12282MiB |     75%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+


In [32]:
train('vit_large_patch16_224', 224, epochs=1, accum=2, finetune=False)
report_gpu()

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


epoch,train_loss,valid_loss,error_rate,time
0,0.0,0.0,0.0,00:07


  x = F.scaled_dot_product_attention(


| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070      WDDM  |   00000000:07:00.0  On |                  N/A |
| 55%   38C    P2             93W /  200W |   11905MiB /  12282MiB |     62%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+


In [51]:
report_gpu()

| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070      WDDM  |   00000000:07:00.0  On |                  N/A |
| 52%   31C    P8              8W /  200W |    8715MiB /  12282MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+


In [None]:
# I could not run this model (Out of Memory)
train('swin_large_patch4_window7_224', 224, epochs=1, accum=4, finetune=False)
report_gpu()

## Running the models

In [None]:
res = 640,480

In [None]:
# The idea here is to run an enseble of all the above models
models = {
    'convnext_large_in22k': {
        (Resize(res), 224),
        (Resize(res), (320,224)),
    }, 'vit_large_patch16_224': {
        (Resize(480, method='squish'), 224),
        (Resize(res), 224),
    }, 'swinv2_large_window12_192_22k': {
        (Resize(480, method='squish'), 192),
        (Resize(res), 192),
    }, 'swin_large_patch4_window7_224': {
        (Resize(480, method='squish'), 224),
        (Resize(res), 224),
    }
}

In [None]:
trn_path = datapath/'train_images'

In [None]:
tta_res = []

for arch,details in models.items():
    for item,size in details:
        print('---',arch)
        print(size)
        print(item.name)
        tta_res.append(train(arch, size, item=item, accum=2)) #, epochs=1))
        gc.collect()
        torch.cuda.empty_cache()

## Ensembling

In [None]:
save_pickle('tta_res.pkl', tta_res)

In [None]:
tta_prs = first(zip(*tta_res))

In [None]:
# Doubling the weights of the vit models (hack), as they were better
tta_prs += tta_prs[2:4]

In [None]:
# Get the mean of all the ensebled results
avg_pr = torch.stack(tta_prs).mean(0)
avg_pr.shape

In [None]:
dls = ImageDataLoaders.from_folder(trn_path, valid_pct=0.2, item_tfms=Resize(480, method='squish'),
    batch_tfms=aug_transforms(size=224, min_scale=0.75))

In [None]:
idxs = avg_pr.argmax(dim=1)
vocab = np.array(dls.vocab)
ss = pd.read_csv(datapath/'sample_submission.csv')
ss['label'] = vocab[idxs]
ss.to_csv('subm_03.csv', index=False)