# Lung Nodule Detection Training - Google Colab


## 1. Setup Environment

In [1]:
# Report whether PyTorch can see a CUDA device on this runtime
import torch

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if not cuda_ok:
    # No GPU attached to the runtime: tell the user how to fix it
    print("WARNING: GPU not available! Please change runtime type to GPU")
else:
    # Device 0 is the one whose name is reported
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

CUDA available: True
GPU: Tesla T4
CUDA version: 12.8


In [2]:
# Install required packages
!pip install -q SimpleITK

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## 2. Mount Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

# List the top level of the mounted Drive to confirm the mount worked
!ls "/content/drive/MyDrive/"

Mounted at /content/drive
 checkpoints_span_model   LUNA16_data		  subset7.zip
'Colab Notebooks'	  pytorch-retinanet	  table_dataset
 dataset		  RetinaNet_Checkpoints
 FireReducedData	  runs


## 3. Upload Code Files

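Upload the following files when prompted by the cell below:

- config.py
- data_loader.py
- loss.py
- train.py
- GCSAM_CND.py
- check_data.py (optional; only needed for the data check in section 5)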


In [5]:
from google.colab import files

print("Please upload all code files (.py)")
uploaded = files.upload()

print("\nUploaded files:")
!ls -lh *.py

Please upload all code files (.py)


Saving train.py to train.py
Saving loss.py to loss.py
Saving GCSAM_CND.py to GCSAM_CND.py
Saving data_loader.py to data_loader.py
Saving config.py to config.py
Saving check_data.py to check_data.py

Uploaded files:
-rw-r--r-- 1 root root 8.3K Feb 12 14:21 check_data.py
-rw-r--r-- 1 root root 2.6K Feb 12 14:21 config.py
-rw-r--r-- 1 root root  13K Feb 12 14:21 data_loader.py
-rw-r--r-- 1 root root 9.9K Feb 12 14:21 GCSAM_CND.py
-rw-r--r-- 1 root root 6.9K Feb 12 14:21 loss.py
-rw-r--r-- 1 root root 9.3K Feb 12 14:21 train.py


## 4. Setup Data Paths

In [None]:
DATA_ROOT = "/content/drive/MyDrive/LUNA16_data" 

!mkdir -p /content/data

!cp /content/drive/MyDrive/LUNA16_data/subset0.zip /content/data/
!cp /content/drive/MyDrive/LUNA16_data/annotations.csv /content/data/
!ls -lh /content/data

!for f in /content/data/*.zip; do unzip -o "$f" -d /content/data/; done


total 619M
-rw------- 1 root root 134K Feb 12 14:22 annotations.csv
-rw------- 1 root root 619M Feb 12 14:22 subset0.zip
Archive:  /content/data/subset0.zip
   creating: /content/data/subset0/
  inflating: /content/data/subset0/1.3.6.1.4.1.14519.5.2.1.6279.6001.105756658031515062000744821260.mhd  
  inflating: /content/data/subset0/1.3.6.1.4.1.14519.5.2.1.6279.6001.105756658031515062000744821260.raw  
  inflating: /content/data/subset0/1.3.6.1.4.1.14519.5.2.1.6279.6001.108197895896446896160048741492.mhd  
  inflating: /content/data/subset0/1.3.6.1.4.1.14519.5.2.1.6279.6001.108197895896446896160048741492.raw  
  inflating: /content/data/subset0/1.3.6.1.4.1.14519.5.2.1.6279.6001.109002525524522225658609808059.mhd  
  inflating: /content/data/subset0/1.3.6.1.4.1.14519.5.2.1.6279.6001.109002525524522225658609808059.raw  
  inflating: /content/data/subset0/1.3.6.1.4.1.14519.5.2.1.6279.6001.111172165674661221381920536987.mhd  
  inflating: /content/data/subset0/1.3.6.1.4.1.14519.5.2.1.6279.6

## 5. Verify Data (Optional)

In [None]:
# Run the uploaded check_data.py script in a separate Python process
!python check_data.py

CHECKING DATA STRUCTURE

✓ Annotations file found
  Total annotations: 1186
  Unique scans: 601

✓ subset0 found
  .mhd files: 8
  .raw files: 8

✓ Found 1 subsets: [0]

CHECKING ANNOTATIONS MATCH

✓ Matched scans: 5/601

First 3 not found:
  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860
  1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793540579077826395208
  1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016233746780170740405

✓ Data structure check passed!

SUBSET STATISTICS

Subset 0:
  CT scans: 8
  Total nodules: 6
  Scans with nodules: 5
  Nodule size range: 4.34 - 13.60 mm
  Average size: 7.03 mm

LOADING AND VISUALIZING SAMPLE

Scan ID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860
ERROR: Could not find 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860.mhd


## 6. Configure Training



In [None]:
# Load the project configuration module and echo the settings used for training
import config as cfg

# (label shown to the user, key in cfg.config)
reported_settings = [
    ("Batch size", "batch_size"),
    ("Epochs", "epoch"),
    ("Train subsets", "train_split"),
    ("Val subsets", "val_split"),
    ("Crop size", "crop_size"),
]

print("Current config:")
for label, key in reported_settings:
    print(f"  {label}: {cfg.config[key]}")

Configuration loaded successfully!
Current config:
  Batch size: 4
  Epochs: 200
  Train subsets: [0, 1, 2, 3, 4, 5, 6, 7]
  Val subsets: [8, 9]
  Crop size: [128, 128, 128]


In [None]:
# Option 1: Quick test
# Short smoke-test run: subset 0 is used for both training and validation.
quick_test_overrides = {
    'train_split': [0],
    'val_split': [0],
    'batch_size': 1,
    'epoch': 5,
    'save_freq': 2,
    'num_worker': 0,
}
for option, value in quick_test_overrides.items():
    cfg.config[option] = value

print("Config updated for QUICK TEST")
for label, option in (("Batch size", 'batch_size'),
                      ("Epochs", 'epoch'),
                      ("Train subsets", 'train_split')):
    print(f"  {label}: {cfg.config[option]}")

Config updated for QUICK TEST
  Batch size: 1
  Epochs: 5
  Train subsets: [0]


In [None]:
# Option 2: Full training
# Disabled by default. To use it, uncomment the assignments below and run this
# cell after (or instead of) the quick-test cell so these values take effect.
# cfg.config['train_split'] = [0, 1, 2, 3, 4, 5, 6, 7]
# cfg.config['val_split'] = [8, 9]
# cfg.config['batch_size'] = 4
# cfg.config['epoch'] = 200
# cfg.config['save_freq'] = 10

## 7. Start Training

In [None]:
# Bring in the training entry point plus the traceback helper used on failure
import traceback

from train import main

# Banner so the start of the run is easy to spot in the cell output
print("Starting training...")
print("=" * 60)

try:
    main()
except KeyboardInterrupt:
    # Manual stop (e.g. interrupting the cell) is reported, not treated as an error
    print("\nTraining interrupted by user")
except Exception as exc:
    # Report the failure with its full traceback; the exception is not re-raised
    print(f"\nError during training: {exc}")
    traceback.print_exc()

Starting training...
Using device: cuda
Creating datasets...
train dataset: Found 8 CT scans
val dataset: Found 8 CT scans
Train dataset: 8 scans
Val dataset: 8 scans
Train batches: 8
Val batches: 8
Creating model...
Number of trainable parameters: 863,117

Starting training...

Epoch 1/5
Learning rate: 0.01


Epoch 0: 100%|██████████| 8/8 [00:43<00:00,  5.46s/it, loss=0.5183, cls=0.5183, reg=0.0000, pos=0]



Training metrics:
  Loss: 0.5976
  Cls Loss: 0.5976
  Reg Loss: 0.0000
  Avg Positive: 0.00

Epoch 2/5
Learning rate: 0.01


Epoch 1: 100%|██████████| 8/8 [00:41<00:00,  5.14s/it, loss=0.2249, cls=0.2249, reg=0.0000, pos=0]



Training metrics:
  Loss: 0.3598
  Cls Loss: 0.3598
  Reg Loss: 0.0000
  Avg Positive: 0.00

Epoch 3/5
Learning rate: 0.01


Epoch 2: 100%|██████████| 8/8 [00:43<00:00,  5.49s/it, loss=0.0664, cls=0.0664, reg=0.0000, pos=0]



Training metrics:
  Loss: 0.1165
  Cls Loss: 0.1165
  Reg Loss: 0.0000
  Avg Positive: 0.00

Epoch 4/5
Learning rate: 0.01


Epoch 3: 100%|██████████| 8/8 [00:40<00:00,  5.10s/it, loss=0.0139, cls=0.0139, reg=0.0000, pos=0]



Training metrics:
  Loss: 0.0261
  Cls Loss: 0.0261
  Reg Loss: 0.0000
  Avg Positive: 0.00

Epoch 5/5
Learning rate: 0.01


Epoch 4: 100%|██████████| 8/8 [00:37<00:00,  4.71s/it, loss=0.0062, cls=0.0062, reg=0.0000, pos=0]



Training metrics:
  Loss: 0.0088
  Cls Loss: 0.0088
  Reg Loss: 0.0000
  Avg Positive: 0.00


Val Epoch 4:  38%|███▊      | 3/8 [00:15<00:25,  5.17s/it, loss=0.0424, cls=0.0424, reg=0.0000]

## 9. Save Checkpoints to Drive

In [None]:

!mkdir -p "/content/drive/MyDrive/LUNA16_checkpoints"
!cp -r /content/checkpoints/* "/content/drive/MyDrive/LUNA16_checkpoints/"

print("Checkpoints saved to Google Drive!")
!ls -lh "/content/drive/MyDrive/LUNA16_checkpoints/"

## 10. Resume Training (after a disconnect)

In [None]:
# Resume training from a checkpoint stored on Google Drive
import os  # needed for the os.path.exists check below

import torch
from GCSAM_CND import MyModel
import config as cfg

# Path to checkpoint
checkpoint_path = '/content/drive/MyDrive/LUNA16_checkpoints/checkpoint_epoch_10.pth'

if os.path.exists(checkpoint_path):
    # Only the stored epoch number is read in this cell, so load onto the CPU
    # instead of placing a throwaway copy of the tensors on the GPU.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    # Update start epoch
    cfg.config['start_epoch'] = checkpoint['epoch'] + 1

    print(f"Loaded checkpoint from epoch {checkpoint['epoch']}")
    print(f"Resuming from epoch {cfg.config['start_epoch']}")

    # NOTE(review): this cell does not pass the loaded weights to main();
    # confirm that train.main() restores model/optimizer state itself.
    # Continue training
    from train import main
    main()
else:
    print(f"Checkpoint not found: {checkpoint_path}")