# ReidNet V3 Training - InsightFace ArcFace on Custom Dataset

PyTorch-based ArcFace training with RecordIO format dataset.

**Environment:**
- GPU runtime (A100/V100/T4) with CUDA 12.x
- ~50GB disk space for dataset
- Python 3.8 or newer
- Root directory: `/home/ubuntu/`

**Workflow:**
1. Run Step 1 (environment check)
2. Run Step 2 (install dependencies) → **Restart kernel**
3. Run Step 2b and continue sequentially

In [None]:
# üîç STEP 1: ENVIRONMENT CHECK
import sys
from pathlib import Path

print("üîç ENVIRONMENT CHECK")
print("=" * 60)
print(f"Python: {sys.version.split()[0]}")
print(f"Platform: {sys.platform}")

# Setup directories
WORKDIR = Path("/home/ubuntu/insightface_training")
CHECKPOINT_DIR = Path("/home/ubuntu/checkpoints/reidnet_v3")

WORKDIR.mkdir(parents=True, exist_ok=True)
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

print(f"\nüìÇ Working directory: {WORKDIR}")
print(f"üíæ Checkpoint directory: {CHECKPOINT_DIR}")

# Check GPU
print("\nüéÆ GPU CHECK")
!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv

In [None]:
# üì¶ STEP 2: INSTALL DEPENDENCIES
import sys
import subprocess

print("üì¶ INSTALLING DEPENDENCIES")
print("=" * 60)

# Fix PATH for local binaries
import os
if '/home/ubuntu/.local/bin' not in os.environ['PATH']:
    os.environ['PATH'] = f"/home/ubuntu/.local/bin:{os.environ['PATH']}"

# Ensure pip is available in current Python environment
print("üîß Ensuring pip is available...")
result = subprocess.run([sys.executable, "-m", "pip", "--version"], capture_output=True)
if result.returncode != 0:
    print("‚ö†Ô∏è  pip not found, bootstrapping...")
    subprocess.run([sys.executable, "-m", "ensurepip", "--default-pip"], check=True)
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"], check=True)
print("‚úÖ pip ready")

# Core dependencies
!{sys.executable} -m pip install -q numpy boto3 awscli

# PyTorch ecosystem (CUDA 12.1 for CUDA 12.x systems)
!{sys.executable} -m pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Training dependencies (NO mxnet - PyTorch has native RecordIO reader)
!{sys.executable} -m pip install -q tensorboard onnx onnxruntime-gpu easydict opencv-python scikit-image tqdm

print("\n‚úÖ Dependencies installed")
print("\n‚ö†Ô∏è  IMPORTANT: Restart kernel now (Kernel ‚Üí Restart)")
print("    Then skip this cell and continue from Step 3")

In [None]:
# STEP 2b: VERIFY PACKAGES (AFTER KERNEL RESTART)
# Re-applies the PATH fix, reports the installed NumPy/PyTorch versions and
# the CUDA device PyTorch sees, and re-creates the path constants that the
# kernel restart wiped out.
import sys
import os
from pathlib import Path

# Fix PATH (prepend the local bin directory when it is not mentioned yet)
local_bin = '/home/ubuntu/.local/bin'
if local_bin not in os.environ['PATH']:
    os.environ['PATH'] = f"{local_bin}:{os.environ['PATH']}"

print("üìã PACKAGE VERSIONS")
print("=" * 60)

import numpy as np
import torch

python_version = sys.version.split()[0]
print(f"Python: {python_version}")
print(f"NumPy: {np.__version__}")
print(f"PyTorch: {torch.__version__}")
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")

if not cuda_ok:
    print("‚ö†Ô∏è  CUDA not available - training will be very slow!")
else:
    # Details of device 0 only
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU memory: {total_gb:.1f} GB")

# Re-establish paths from Step 1
WORKDIR = Path("/home/ubuntu/insightface_training")
CHECKPOINT_DIR = Path("/home/ubuntu/checkpoints/reidnet_v3")
print("\n‚úÖ Environment ready")

In [None]:
# üì• STEP 3: CLONE INSIGHTFACE REPOSITORY
import os

repo_path = WORKDIR / "insightface"
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', '')
GITHUB_USER = os.getenv('GITHUB_USER', 'deanofthewebb')
BRANCH = "main"

if GITHUB_TOKEN:
    REPO_URL = f"https://{GITHUB_USER}:{GITHUB_TOKEN}@github.com/{GITHUB_USER}/insightface.git"
else:
    REPO_URL = f"https://github.com/{GITHUB_USER}/insightface.git"

if not repo_path.exists():
    print(f"üì• Cloning InsightFace repository...")
    !git clone --depth 1 --branch {BRANCH} {REPO_URL} {repo_path}
    print("‚úÖ Repository cloned")
else:
    print("üìÅ Repository exists, pulling latest changes...")
    !cd {repo_path} && git pull origin {BRANCH}
    print("‚úÖ Repository updated")

# Navigate to training directory
training_dir = repo_path / "recognition" / "arcface_torch"
os.chdir(training_dir)
print(f"\nüìç Working in: {os.getcwd()}")

In [None]:
# üîë STEP 4: CONFIGURE AWS CREDENTIALS
import os
import pathlib
import boto3

print("üîë CONFIGURING AWS CREDENTIALS")
print("=" * 60)

# Read credentials from environment
AK = os.getenv("AWS_ACCESS_KEY_ID", "")
SK = os.getenv("AWS_SECRET_ACCESS_KEY", "")
REGION = os.getenv("AWS_DEFAULT_REGION", "us-east-2")

if not AK or not SK:
    raise ValueError("AWS credentials not found. Set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables.")

# Clear stale session tokens (keep credentials)
for k in ["AWS_SESSION_TOKEN", "AWS_SECURITY_TOKEN", "AWS_PROFILE"]:
    os.environ.pop(k, None)

# Write AWS config
aws_dir = pathlib.Path.home() / ".aws"
aws_dir.mkdir(parents=True, exist_ok=True)
(aws_dir / "credentials").write_text(
    f"[default]\naws_access_key_id={AK}\naws_secret_access_key={SK}\n"
)
(aws_dir / "config").write_text(
    f"[default]\nregion={REGION}\noutput=json\n"
)

# Set environment variables
os.environ["AWS_ACCESS_KEY_ID"] = AK
os.environ["AWS_SECRET_ACCESS_KEY"] = SK
os.environ["AWS_DEFAULT_REGION"] = REGION

# Verify credentials
try:
    sts = boto3.client("sts", region_name=REGION)
    identity = sts.get_caller_identity()
    print(f"\n‚úÖ AWS credentials verified")
    print(f"   Account: {identity['Account']}")
    print(f"   User: {identity['Arn']}")
    
    # Check disk space
    print("\nüìä Disk Space:")
    !df -h /home/ubuntu | head -2
except Exception as e:
    print(f"\n‚ùå AWS verification failed: {e}")
    raise

In [None]:
# STEP 5: DOWNLOAD DATASET FROM S3
# Fetches the RecordIO dataset files into WORKDIR/datasets/reidnet_v3_rec
# (files already present locally are skipped) and prints the identity and
# image counts read from the `property` file.
# Needs WORKDIR (Step 1 / 2b) and REGION (Step 4).
import subprocess
import boto3

print("üì• DOWNLOADING REIDNET V3 DATASET")
print("=" * 60)

# Dataset configuration
BUCKET = "data-labeling.livereachmedia.com"
PREFIX = "datasets/reidnet_v3/rekognition_set/"
DATASET_DIR = WORKDIR / "datasets" / "reidnet_v3_rec"
DATASET_DIR.mkdir(parents=True, exist_ok=True)

# Required files for RecordIO format (PyTorch reads natively)
required_files = ["train.rec", "train.idx", "property"]

print(f"\nSource: s3://{BUCKET}/{PREFIX}")
print(f"Destination: {DATASET_DIR}\n")

s3 = boto3.client("s3", region_name=REGION)

for filename in required_files:
    local_path = DATASET_DIR / filename
    s3_key = PREFIX + filename

    if not local_path.exists():
        try:
            # Object size is looked up first, only for the progress message
            meta = s3.head_object(Bucket=BUCKET, Key=s3_key)
            size_mb = meta["ContentLength"] / 1e6
            print(f"üì¶ Downloading {filename} ({size_mb:.1f} MB)...")

            s3.download_file(BUCKET, s3_key, str(local_path))
            print(f"‚úÖ {filename} downloaded\n")
        except Exception as err:
            # Report which file failed, then let the error stop the cell
            print(f"‚ùå Failed to download {filename}: {err}\n")
            raise
    else:
        print(f"‚è≠Ô∏è  {filename} already exists")

# Read dataset statistics from property file
property_file = DATASET_DIR / "property"
if not property_file.exists():
    raise FileNotFoundError("Dataset property file not found")

# First line: comma-separated, first field is the class count;
# second line: the image count.
lines = property_file.read_text().strip().split("\n")
first_line_fields = lines[0].split(",")
num_classes = int(first_line_fields[0])
num_images = int(lines[1])

separator = "=" * 60
print(separator)
print("üìä DATASET STATISTICS")
print(separator)
print(f"Identities: {num_classes:,}")
print(f"Images: {num_images:,}")
print(f"Avg images/identity: {num_images/num_classes:.1f}")

In [None]:
# üì• STEP 6: DOWNLOAD PRETRAINED MODEL
print("üì• DOWNLOADING PRETRAINED ONNX MODEL")
print("=" * 60)

pretrained_dir = WORKDIR / "pretrained_models"
pretrained_dir.mkdir(parents=True, exist_ok=True)

# NVR production ONNX backbone (LResNet100E-IR ArcFace)
S3_MODEL_PATH = "s3://data-labeling.livereachmedia.com/datasets/face_rec/nvr.prod.v7.facerec.backbone.onnx"
model_name = "nvr.prod.v7.facerec.backbone.onnx"
local_model = pretrained_dir / model_name

if local_model.exists():
    print(f"\n‚è≠Ô∏è  Model already exists: {model_name}")
    ONNX_BACKBONE = str(local_model)
else:
    try:
        print(f"\nüì¶ Downloading {model_name}...")
        !aws s3 cp {S3_MODEL_PATH} {local_model} --only-show-errors
        
        size_mb = local_model.stat().st_size / 1e6
        print(f"‚úÖ Downloaded ({size_mb:.1f} MB)")
        ONNX_BACKBONE = str(local_model)
    except Exception as e:
        print(f"\n‚ö†Ô∏è  Failed to download ONNX model: {e}")
        print("   Training will start with random initialization")
        ONNX_BACKBONE = None

print(f"\nüìç ONNX backbone: {ONNX_BACKBONE or 'None (random init)'}")

In [None]:
# STEP 7: CREATE TRAINING CONFIG
# Renders the arcface_torch config module from the dataset statistics and
# writes it to <training_dir>/configs/reidnet_v3_finetune.py.
# Needs DATASET_DIR (Step 5), CHECKPOINT_DIR (Step 1 / 2b), training_dir
# (Step 3) and ONNX_BACKBONE (Step 6).
print("‚öôÔ∏è CREATING TRAINING CONFIG")
print("=" * 60)

# Read dataset statistics
property_file = DATASET_DIR / "property"
lines = property_file.read_text().strip().split("\n")
num_classes = int(lines[0].split(",")[0])
num_images = int(lines[1])

print(f"\nDataset: {num_classes:,} identities, {num_images:,} images")

# Create config
# The template is an f-string: only the output dir, the dataset dir and the
# two dataset counts are substituted; everything else is written verbatim.
# NOTE(review): the "(m, s, a)" label on margin_list below may not match how
# the training script interprets the tuple - confirm against train_v3.py.
config_content = f'''# ReidNet V3 Fine-tuning Configuration
from easydict import EasyDict as edict

config = edict()

# Network architecture
config.network = "r100"  # ResNet100 backbone
config.embedding_size = 512
config.margin_list = (1.0, 0.5, 0.0)  # ArcFace (m, s, a)
config.interclass_filtering_threshold = 0.0

# Output directory
config.output = "{CHECKPOINT_DIR / 'work_dirs'}"
config.resume = False

# Dataset (RecordIO format)
config.rec = "{DATASET_DIR}"
config.num_classes = {num_classes}
config.num_image = {num_images}
config.num_workers = 8
config.dali = False  # Set True if NVIDIA DALI available

# Training hyperparameters
config.batch_size = 128  # Adjust based on GPU: A100=512, V100=128, T4=64
config.lr = 0.01  # Conservative for fine-tuning
config.optimizer = "sgd"
config.momentum = 0.9
config.weight_decay = 5e-4
config.sample_rate = 1.0
config.fp16 = True  # Mixed precision training

# Training schedule
config.num_epoch = 24
config.warmup_epoch = 0  # Skip warmup for fine-tuning

# Logging and checkpointing
config.verbose = 2000
config.frequent = 20
config.save_all_states = True
config.save_interval = 20000
config.gradient_acc = 1

# Evaluation
config.val_targets = []  # Add validation datasets if available

config.seed = 2048
'''

# Write config
config_dir = training_dir / "configs"
config_dir.mkdir(exist_ok=True)
config_file = config_dir / "reidnet_v3_finetune.py"
config_file.write_text(config_content)

print(f"\n‚úÖ Config saved: {config_file}")
print("\nüìã Training Configuration:")
print(f"   Identities: {num_classes:,}")
print(f"   Images: {num_images:,}")
print(f"   Batch size: 128 per GPU")
print(f"   Epochs: 24")
print(f"   Mixed precision: Enabled")
# ONNX_BACKBONE is the variable Step 6 sets (a path string, or None); the
# previously referenced PRETRAINED_MODEL is never defined in this notebook.
print(f"   Pretrained: {'Yes' if ONNX_BACKBONE else 'No (from scratch)'}")

In [None]:
# üöÄ STEP 8: START TRAINING
import os

print("üöÄ STARTING TRAINING")
print("=" * 60)

os.chdir(training_dir)

# Build command
cmd = ["python", "train_v3.py", "configs/reidnet_v3_finetune.py"]

# Add ONNX backbone if available
if ONNX_BACKBONE and Path(ONNX_BACKBONE).exists():
    cmd.extend(["--onnx-backbone", ONNX_BACKBONE])
    print(f"\nüì¶ Using ONNX backbone:\n   {ONNX_BACKBONE}")
    print(f"   Architecture: LResNet100E-IR (512-D embeddings)")
else:
    print("\nüî® Training from random initialization (no pretrained backbone)")

print(f"\nüíª Command: {' '.join(cmd)}")
print("\n" + "=" * 60)
print("üéØ TRAINING STARTED")
print("=" * 60)
print(f"\nüìÇ Checkpoints: {CHECKPOINT_DIR / 'work_dirs'}")
print("\nTo monitor: Open TensorBoard in next cell")
print("To stop: Runtime ‚Üí Interrupt\n")
print("=" * 60 + "\n")

# Execute training
!{' '.join(cmd)}

In [None]:
# üìä STEP 9: MONITOR TRAINING (RUN IN PARALLEL)
print("üìä LAUNCHING TENSORBOARD")
print("=" * 60)

%load_ext tensorboard
tensorboard_dir = CHECKPOINT_DIR / "work_dirs" / "logs"
print(f"\nüìà Log directory: {tensorboard_dir}\n")

%tensorboard --logdir {tensorboard_dir}

In [None]:
# üì§ STEP 10: EXPORT CHECKPOINTS TO S3 (AFTER TRAINING)
print("üì§ EXPORTING CHECKPOINTS TO S3")
print("=" * 60)

work_dir = CHECKPOINT_DIR / "work_dirs"
checkpoints = sorted(work_dir.glob("**/backbone.pth"), key=lambda p: p.stat().st_mtime)

if checkpoints:
    print(f"\n‚úÖ Found {len(checkpoints)} checkpoint(s)\n")
    
    # Upload all checkpoints to S3
    s3_base = "s3://data-labeling.livereachmedia.com/models/reidnet_v3/checkpoints/"
    
    for ckpt in checkpoints:
        size_mb = ckpt.stat().st_size / 1e6
        s3_path = f"{s3_base}{ckpt.parent.name}_{ckpt.name}"
        
        print(f"üì¶ Uploading {ckpt.name} ({size_mb:.1f} MB)...")
        !aws s3 cp {ckpt} {s3_path} --only-show-errors
        print(f"‚úÖ Uploaded to {s3_path}\n")
    
    # Upload latest checkpoint with special name
    latest = checkpoints[-1]
    s3_latest = f"{s3_base}reidnet_v3_latest.pth"
    print(f"üì¶ Uploading latest checkpoint as reidnet_v3_latest.pth...")
    !aws s3 cp {latest} {s3_latest} --only-show-errors
    print(f"‚úÖ Latest checkpoint: {s3_latest}")
    
    print("\n" + "=" * 60)
    print("‚úÖ ALL CHECKPOINTS UPLOADED TO S3")
    print("=" * 60)
else:
    print("\n‚ö†Ô∏è No checkpoints found. Training may still be in progress.")
    print(f"   Check: {work_dir}")

## 🔧 Troubleshooting

### Out of Memory (OOM)
Reduce `batch_size` in config:
- A100 (80GB): 512
- A100 (40GB): 256
- V100 (32GB): 128
- V100 (16GB): 64
- T4 (16GB): 32-64

Or disable mixed precision: `config.fp16 = False`

### Slow Training
- Increase `num_workers` (try 8-16)
- Enable DALI if available: `config.dali = True`
- Check if dataset is on SSD

### Dataset Format
RecordIO format requires:
- `train.rec` - binary record file
- `train.idx` - index file
- `property` - metadata, two lines: `num_classes,112,112` on the first line and `num_images` on the second

**Note**: PyTorch ArcFace has native RecordIO reader - no MXNet required!

### CUDA Errors
```bash
# Check CUDA installation
!nvidia-smi
!nvcc --version

# Reinstall PyTorch with correct CUDA version
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
```

## 📖 References

- [InsightFace GitHub](https://github.com/deepinsight/insightface)
- [ArcFace Paper](https://arxiv.org/abs/1801.07698)
- [PyTorch ArcFace Docs](https://github.com/deepinsight/insightface/tree/master/recognition/arcface_torch)