# False Positive Reduction Network (Stage 2) - Google Colab

Stage 2: Binary classification of candidate nodules into true positives and false positives.

- Input: 32×32×32 patches from candidates.csv
- Task: Binary classification (nodule vs non-nodule)
- Challenge: Severe class imbalance (~500:1)



## 1. Setup Environment

In [None]:
# Report whether this Colab runtime exposes a CUDA device.
import torch

has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")

if not has_cuda:
    # No GPU attached: tell the user which runtime setting to change.
    print("‚ö†Ô∏è WARNING: GPU not available! Please change runtime type to GPU")
else:
    # Device 0 is the GPU reported for this runtime.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

CUDA available: True
GPU: Tesla T4
CUDA version: 12.8


In [None]:
# Install dependencies
!pip install -q SimpleITK scikit-image

## 2. Mount Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Check Drive contents
!ls "/content/drive/MyDrive/" | head -20

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
A5 - group6
Colab Notebooks
LUNA16_data


## 3. Upload Code Files

- config_fpr.py
- data_loader_fpr.py
- loss_fpr.py
- train_fpr.py
- GCSAM_FPR.py
- preprocessing.py (optional)

In [None]:
from google.colab import files

print("üìÅ Please upload all Stage 2 code files (.py)")
print("Required: config_fpr.py, data_loader_fpr.py, loss_fpr.py, train_fpr.py, GCSAM_FPR.py")
print("Optional: preprocessing.py (if use_lung_mask=True)")
print()

uploaded = files.upload()

print("\n‚úÖ Uploaded files:")
!ls -lh *.py

üìÅ Please upload all Stage 2 code files (.py)
Required: config_fpr.py, data_loader_fpr.py, loss_fpr.py, train_fpr.py, GCSAM_FPR.py
Optional: preprocessing.py (if use_lung_mask=True)



Saving train_fpr.py to train_fpr.py
Saving preprocessing.py to preprocessing.py
Saving loss_fpr.py to loss_fpr.py
Saving GCSAM_FPR.py to GCSAM_FPR.py
Saving data_loader_fpr.py to data_loader_fpr.py
Saving config_fpr.py to config_fpr.py

‚úÖ Uploaded files:
-rw-r--r-- 1 root root 2.3K Feb 13 17:22 config_fpr.py
-rw-r--r-- 1 root root 9.5K Feb 13 17:22 data_loader_fpr.py
-rw-r--r-- 1 root root 6.8K Feb 13 17:22 GCSAM_FPR.py
-rw-r--r-- 1 root root 5.7K Feb 13 17:22 loss_fpr.py
-rw-r--r-- 1 root root 8.9K Feb 13 17:22 preprocessing.py
-rw-r--r-- 1 root root 9.4K Feb 13 17:22 train_fpr.py


## 4. Setup Data Paths

```
LUNA16_data/
├── subset0/
│   ├── xxx.mhd
│   ├── xxx.raw
│   └── ...
├── subset1/
│   └── ...
└── candidates.csv
```

In [None]:

DATA_ROOT = "/content/drive/MyDrive/LUNA16_data"


!mkdir -p /content/data

!cp /content/drive/MyDrive/LUNA16_data/subset1.zip /content/data/
!cp /content/drive/MyDrive/LUNA16_data/subset3.zip /content/data/
!cp /content/drive/MyDrive/LUNA16_data/subset5.zip /content/data/
!cp /content/drive/MyDrive/LUNA16_data/subset7.zip /content/data/
!cp /content/drive/MyDrive/LUNA16_data/candidates.csv /content/data/
!ls -lh /content/data

!for f in /content/data/*.zip; do unzip -o "$f" -d /content/data/; done

print(f"üìÇ Data linked from: {DATA_ROOT}")
print("\nChecking data structure:")
!ls -la /content/data

total 25G
-rw------- 1 root root  69M Feb 13 17:28 candidates.csv
drwxr-xr-x 2 root root 4.0K Feb 13 17:21 subset1
-rw------- 1 root root 5.9G Feb 13 17:23 subset1.zip
-rw------- 1 root root 6.5G Feb 13 17:24 subset3.zip
-rw------- 1 root root 6.2G Feb 13 17:26 subset5.zip
-rw------- 1 root root 5.9G Feb 13 17:28 subset7.zip
Archive:  /content/data/subset1.zip
  inflating: /content/data/subset1/1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866.mhd  
  inflating: /content/data/subset1/1.3.6.1.4.1.14519.5.2.1.6279.6001.100684836163890911914061745866.raw  
  inflating: /content/data/subset1/1.3.6.1.4.1.14519.5.2.1.6279.6001.104562737760173137525888934217.mhd  
  inflating: /content/data/subset1/1.3.6.1.4.1.14519.5.2.1.6279.6001.104562737760173137525888934217.raw  
  inflating: /content/data/subset1/1.3.6.1.4.1.14519.5.2.1.6279.6001.106719103982792863757268101375.mhd  
  inflating: /content/data/subset1/1.3.6.1.4.1.14519.5.2.1.6279.6001.106719103982792863757268101375.raw  
 

## 5. Verify Candidates File

**Expected format:**
```
seriesuid,coordX,coordY,coordZ,class
1.3.6.1...,104.08,-211.76,-227.02,1
1.3.6.1...,-128.98,-175.18,-298.51,0
```

In [None]:
import os

import pandas as pd

# Candidate list staged on the Colab local disk.
candidates_path = '/content/data/candidates.csv'

if not os.path.exists(candidates_path):
    # Nothing to inspect: report where the file was expected.
    print("‚ùå ERROR: candidates.csv not found!")
    print(f"   Expected path: {candidates_path}")
    print("\n   Please make sure you have candidates.csv in your data folder")
else:
    print("‚úÖ candidates.csv found!")

    df = pd.read_csv(candidates_path)
    print(f"\nTotal candidates: {len(df):,}")
    print(f"Columns: {list(df.columns)}")

    # Per-class counts of the `class` column.
    print("\nClass distribution:")
    class_counts = df['class'].value_counts()
    print(class_counts)

    print("\nFirst 5 rows:")
    print(df.head())

    # Negative-to-positive ratio; reported as 0 when there are no positives
    # so the division cannot fail.
    n_pos = (df['class'] == 1).sum()
    n_neg = (df['class'] == 0).sum()
    if n_pos > 0:
        ratio = n_neg / n_pos
    else:
        ratio = 0
    print(f"\n‚ö†Ô∏è Class imbalance ratio: {ratio:.1f}:1 (negative:positive)")
    print("   This is why we need weighted loss and class balancing!")

‚úÖ candidates.csv found!

Total candidates: 754,975
Columns: ['seriesuid', 'coordX', 'coordY', 'coordZ', 'class']

Class distribution:
class
0    753418
1      1557
Name: count, dtype: int64

First 5 rows:
                                           seriesuid     coordX      coordY  \
0  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  68.420000  -74.480000   
1  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... -95.209361  -91.809406   
2  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... -24.766755 -120.379294   
3  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... -63.080000  -65.740000   
4  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  52.946688  -92.688873   

       coordZ  class  
0 -288.700000      0  
1 -377.426350      0  
2 -273.361539      0  
3 -344.240000      0  
4 -241.067872      0  

‚ö†Ô∏è Class imbalance ratio: 483.9:1 (negative:positive)
   This is why we need weighted loss and class balancing!


## 6. Configure Training

Review and adjust the important parameters.

In [None]:
# Load the Stage 2 configuration module and print the values of interest.
import config_fpr as cfg

settings = cfg.config

# Show the current config
print("üìã Current configuration:")
for label, key in (
    ("Batch size", 'batch_size'),
    ("Epochs", 'epoch'),
    ("Learning rate", 'lr_stage1'),
    ("Train subsets", 'train_split'),
    ("Val subsets", 'val_split'),
    ("Crop size", 'crop_size'),
):
    print(f"  {label}: {settings[key]}")

print("\nüéØ Class imbalance handling:")
pos_factor = settings['pos_augmentation_factor']
neg_ratio_percent = settings['neg_sample_ratio']*100  # stored as a fraction, shown as a percentage
print(f"  Pos augmentation factor: {pos_factor}x")
print(f"  Neg sample ratio: {neg_ratio_percent}%")
print(f"  Pos weight: {settings['pos_weight']}")

print("\nüìä Loss function:")
for label, key in (
    ("Use weighted loss", 'use_weighted_loss'),
    ("Use focal loss", 'use_focal_loss'),
):
    print(f"  {label}: {settings[key]}")

FPR Configuration loaded successfully!
üìã Current configuration:
  Batch size: 8
  Epochs: 50
  Learning rate: 0.01
  Train subsets: [0, 1, 2, 3, 4, 5, 6, 7]
  Val subsets: [8, 9]
  Crop size: [32, 32, 32]

üéØ Class imbalance handling:
  Pos augmentation factor: 20x
  Neg sample ratio: 3.0%
  Pos weight: 10.0

üìä Loss function:
  Use weighted loss: True
  Use focal loss: False


### 6.1 Quick Test Config

In [None]:
# Quick smoke-test configuration: one subset, few epochs.
# Subset 1 is used because Section 4 stages only subsets 1, 3, 5 and 7 under
# /content/data; subset 0 is never copied there, so selecting it would leave
# the dataset without any scans to load.
# Validating on the training subset is acceptable only for a smoke test.
cfg.config.update({
    'train_split': [1],
    'val_split': [1],
    'batch_size': 4,
    'epoch': 5,
    'save_freq': 2,
    # Lower positive augmentation factor for a faster test
    'pos_augmentation_factor': 5,
    'neg_sample_ratio': 0.05,
    # Preprocessing
    'use_lung_mask': False,  # False = faster
})

print("‚úÖ Config updated for QUICK TEST")
print(f"  Epochs: {cfg.config['epoch']}")
print(f"  Batch size: {cfg.config['batch_size']}")
print(f"  Train/Val subsets: {cfg.config['train_split']}")

‚úÖ Config updated for QUICK TEST
  Epochs: 5
  Batch size: 4
  Train/Val subsets: [0]


### 6.2 Full Training Config

In [None]:
# Full training configuration: train on subsets 1, 3, 5 and validate on subset 7.
# NOTE(review): the recorded run in Section 9 shows about 5.3 s per iteration
# over 5053 training batches with these values, i.e. several hours per epoch.
# Confirm this is acceptable before launching all 20 epochs.
cfg.config.update({
    'train_split': [1, 3, 5],
    'val_split': [7],
    'batch_size': 4,
    'epoch': 20,
    'save_freq': 2,
    # Class balancing
    'pos_augmentation_factor': 20,
    'neg_sample_ratio': 0.05,
    'pos_weight': 10.0,
})

## 7. Test Data Loading


In [None]:
from data_loader_fpr import CandidateDataset

# Build a dataset over the configured training subsets and inspect one sample.
print("üì¶ Creating test dataset...")
dataset_kwargs = dict(
    data_dir='/content/data',
    candidates_file='/content/data/candidates.csv',
    subset_ids=cfg.config['train_split'],
    config=cfg.config,
    phase='train',
)
test_dataset = CandidateDataset(**dataset_kwargs)

n_samples = len(test_dataset)
print("\n‚úÖ Dataset created successfully!")
print(f"   Total samples: {n_samples}")

if n_samples == 0:
    print("‚ö†Ô∏è Dataset is empty! Check your data path and subset IDs.")
else:
    # Test loading one sample
    print("\nüîç Loading sample 0...")
    sample = test_dataset[0]
    label_name = 'Nodule' if sample['label']==1 else 'Non-nodule'
    print(f"   Image shape: {sample['image'].shape}")
    print(f"   Label: {sample['label']} ({label_name})")
    # Series UIDs are long; show only the first 50 characters.
    print(f"   SeriesUID: {sample['seriesuid'][:50]}...")
    print(f"   Coordinates: {sample['coord']}")

üì¶ Creating test dataset...
Total candidates in file: 754975
Candidates after filtering by available scans: 222653
Positive samples: 455
Negative samples: 222198

After balancing (training):
  Positive samples: 9100
  Negative samples: 11109
train dataset: 20209 samples

‚úÖ Dataset created successfully!
   Total samples: 20209

üîç Loading sample 0...
   Image shape: torch.Size([1, 32, 32, 32])
   Label: 1 (Nodule)
   SeriesUID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.1288818003997025...
   Coordinates: [ -78.59614041   67.02944433 -168.5698643 ]


## 8. Test Model Architecture

In [None]:
# Imported here so the cell does not depend on the GPU-check cell having run.
import torch

from GCSAM_FPR import MyModel

# Create model
model = MyModel(num_classes=2)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print("üîß Model Architecture:")
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable parameters: {trainable_params:,}")

# Test forward pass on random data shaped like a batch of two 32x32x32 patches.
dummy_input = torch.randn(2, 1, 32, 32, 32)  # Batch of 2
# This is a shape check only, so no autograd graph is needed.
with torch.no_grad():
    output = model(dummy_input)

# Fail loudly if the head does not produce one score per class per sample,
# instead of relying on the reader to compare the printed shape by eye.
expected_shape = (dummy_input.shape[0], 2)
assert tuple(output.shape) == expected_shape, (
    f"Unexpected output shape {tuple(output.shape)}; expected {expected_shape}"
)

print(f"\n‚úÖ Forward pass successful!")
print(f"   Input shape: {dummy_input.shape}")
print(f"   Output shape: {output.shape} (should be [2, 2] for 2 classes)")

üîß Model Architecture:
   Total parameters: 2,363,935
   Trainable parameters: 2,363,935

‚úÖ Forward pass successful!
   Input shape: torch.Size([2, 1, 32, 32, 32])
   Output shape: torch.Size([2, 2]) (should be [2, 2] for 2 classes)


## 9. Start Training

In [None]:
import os
import shutil
import traceback

from train_fpr import main

# Paths match the ones used by the checkpoint backup and evaluation cells.
CHECKPOINT_DIR = "/content/checkpoints_fpr"
DRIVE_ROOT = "/content/drive/MyDrive"
DRIVE_BACKUP_DIR = os.path.join(DRIVE_ROOT, "LUNA16_checkpoints_FPR")

print("üöÄ Starting training...")
print("="*60)

try:
    main()
    print("\n‚úÖ Training completed successfully!")
except KeyboardInterrupt:
    print("\n‚ö†Ô∏è Training interrupted by user")
except Exception as e:
    # Report the failure but continue, so the backup below still runs.
    print(f"\n‚ùå Error during training: {e}")
    traceback.print_exc()
finally:
    # The kernel is killed at the end of this cell, so a "Run All" never reaches
    # the backup cell. Copy the checkpoints to Drive here instead.
    # Skip the copy when Drive is not mounted, to avoid creating a plain local
    # folder under the mount point.
    if os.path.isdir(CHECKPOINT_DIR) and os.path.isdir(DRIVE_ROOT):
        try:
            shutil.copytree(CHECKPOINT_DIR, DRIVE_BACKUP_DIR, dirs_exist_ok=True)
            print(f"Checkpoints backed up to {DRIVE_BACKUP_DIR}")
        except OSError as copy_error:
            print(f"WARNING: checkpoint backup failed: {copy_error}")

# Hard-kill the kernel process (signal 9), as the original cell did.
# NOTE(review): this stops "Run All" and clears every Python variable and import;
# later cells must be run manually and must import what they use.
os.kill(os.getpid(), 9)

üöÄ Starting training...
Using device: cuda

Creating dataloaders...
Total candidates in file: 754975
Candidates after filtering by available scans: 222653
Positive samples: 455
Negative samples: 222198

After balancing (training):
  Positive samples: 9100
  Negative samples: 11109
train dataset: 20209 samples
Total candidates in file: 754975
Candidates after filtering by available scans: 75063
Positive samples: 120
Negative samples: 74943
val dataset: 75063 samples
Train batches: 5053
Val batches: 18766

Creating model...
Number of trainable parameters: 2,363,935

Setting up loss function...
Using Weighted Cross Entropy Loss
  Class weights: [1.0, 10.0]

Starting training...

Epoch 1/20
Learning rate: 0.01


Epoch 1:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 3777/5053 [4:29:47<1:53:03,  5.32s/it, loss=0.5406]

## 11. Save Checkpoints to Drive

In [None]:
# Create backup directory in Drive
!mkdir -p "/content/drive/MyDrive/LUNA16_checkpoints_FPR"

# Copy checkpoints
!cp -r /content/checkpoints_fpr/* "/content/drive/MyDrive/LUNA16_checkpoints_FPR/"

print("‚úÖ Checkpoints saved to Google Drive!")
!ls -lh "/content/drive/MyDrive/LUNA16_checkpoints_FPR/"

## 12. Load and Evaluate Best Model

In [None]:
# All imports are local to this cell: the training cell kills the kernel, so
# nothing imported earlier (including `os`) survives.
import os

import torch

from GCSAM_FPR import MyModel

# Load best model
checkpoint_path = '/content/checkpoints_fpr/best_model.pth'

if os.path.exists(checkpoint_path):
    # Training ran on CUDA while the model below is built on the CPU.
    # map_location='cpu' lets the checkpoint load on a runtime without a GPU.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    model = MyModel(num_classes=2)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()  # switch to inference mode

    print("‚úÖ Best model loaded!")
    # The stored epoch is shown one higher, i.e. as a 1-based epoch number.
    print(f"   Epoch: {checkpoint['epoch']+1}")

    # Metrics are optional in the checkpoint; print them when present.
    if 'metrics' in checkpoint:
        metrics = checkpoint['metrics']
        print("\nüìä Best Model Metrics:")
        print(f"   Loss: {metrics['loss']:.4f}")
        print(f"   Accuracy: {metrics['accuracy']:.4f}")
        print(f"   Precision: {metrics['precision']:.4f}")
        print(f"   Recall: {metrics['recall']:.4f}")
        print(f"   F1 Score: {metrics['f1']:.4f}")
        print(f"   Specificity: {metrics['specificity']:.4f}")
else:
    print(f"‚ùå Checkpoint not found: {checkpoint_path}")

## 13. Resume Training (if disconnected)

In [None]:
# All imports are local to this cell: after a disconnect or kernel restart,
# nothing imported by earlier cells is available.
import os

import torch

import config_fpr as cfg
from train_fpr import main

# Resume from checkpoint
checkpoint_path = '/content/checkpoints_fpr/checkpoint_epoch_10.pth'

if os.path.exists(checkpoint_path):
    # Only the epoch counter is read here, so load onto the CPU and release the
    # checkpoint before training starts.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    # Update start epoch
    cfg.config['start_epoch'] = checkpoint['epoch'] + 1
    del checkpoint

    print(f"‚úÖ Resuming from epoch {cfg.config['start_epoch']}")

    # NOTE(review): this cell only sets `start_epoch`; restoring the model and
    # optimizer state is presumably handled inside train_fpr.main() - confirm.
    # NOTE(review): after a kernel restart the overrides from Section 6 are gone;
    # re-apply them before running this cell if they are still wanted.
    # Continue training
    main()
else:
    print(f"‚ùå Checkpoint not found: {checkpoint_path}")