In [1]:
import os
import boto3
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.datasets import CocoDetection
from torch.utils.data import DataLoader, random_split
from PIL import Image
import time
import json



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Build the boto3 session first, then wrap it in a SageMaker session.
session = boto3.session.Session()
sm_session = sagemaker.Session(boto_session=session)
role = get_execution_role()

# All dataset objects live under one prefix of the session's default bucket.
bucket = sm_session.default_bucket()
prefix = "mscoco"
s3_data_path = f"s3://{bucket}/{prefix}"

# Echo the resolved configuration, one item per line.
print(
    f"SageMaker Role: {role}",
    f"S3 Bucket: {bucket}",
    f"S3 Prefix: {prefix}",
    sep="\n",
)

SageMaker Role: arn:aws:iam::438465157691:role/service-role/AmazonSageMaker-ExecutionRole-20250306T154221
S3 Bucket: sagemaker-ap-southeast-1-438465157691
S3 Prefix: mscoco


In [3]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


In [4]:
class CocoClassificationDataset(CocoDetection):
    """COCO detection data exposed as a single-label classification dataset.

    Each item is ``(image, target)`` where ``target`` is the contiguous class
    index of the image's *first* annotation, or ``-1`` when the image has no
    annotations at all.

    Args:
        root: Directory that contains the image files.
        annFile: Path to the COCO-format annotation JSON.
        transform: Optional transform forwarded to ``CocoDetection``.
        max_samples: If given and smaller than the number of images, only the
            first ``max_samples`` image ids are kept (handy for quick tests).

    Attributes:
        category_to_idx: Maps a COCO ``category_id`` to a class index ``0..N-1``.
        idx_to_category: Maps a class index back to its full category record.
        num_classes: Number of categories found in the annotation file.
    """

    def __init__(self, root, annFile, transform=None, max_samples=None):
        super(CocoClassificationDataset, self).__init__(root, annFile, transform)
        # COCO category ids need not be contiguous, so build an explicit mapping
        # in both directions.
        categories = self.coco.loadCats(self.coco.getCatIds())
        self.category_to_idx = {cat['id']: i for i, cat in enumerate(categories)}
        self.idx_to_category = {i: cat for i, cat in enumerate(categories)}
        self.num_classes = len(categories)

        # Limit number of samples if specified (for quick testing)
        if max_samples is not None and max_samples < len(self.ids):
            self.ids = self.ids[:max_samples]

    def __getitem__(self, index):
        """Return ``(image, class_index)`` for the image at ``index``."""
        img, anns = super(CocoClassificationDataset, self).__getitem__(index)
        if len(anns) > 0:
            # Use the first annotation's category as the target
            target = self.category_to_idx[anns[0]['category_id']]
        else:
            # Sentinel for "no label": -1 is not a class index, so consumers
            # must filter these samples out before computing a loss.
            target = -1
        return img, target

    def get_category_name(self, idx):
        """Return the category name for class index ``idx``, or ``"Unknown"``.

        The lookup goes through ``idx_to_category`` directly, so a category
        whose COCO id is ``0`` is resolved correctly (a truthiness test on the
        id would misreport it as unknown). ``idx`` may be a plain int or
        anything ``int()`` accepts, such as a 0-dim tensor.
        """
        try:
            key = int(idx)
        except (TypeError, ValueError):
            return "Unknown"
        category = self.idx_to_category.get(key)
        if category is None:
            return "Unknown"
        return category['name']

In [6]:
!pip install pycocotools
!mkdir -p coco_mini/val2017
!mkdir -p coco_mini/annotations

Collecting pycocotools
  Downloading pycocotools-2.0.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Downloading pycocotools-2.0.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (458 kB)
Installing collected packages: pycocotools
Successfully installed pycocotools-2.0.8


In [8]:
from torchvision.datasets import CocoDetection
import torchvision.transforms as transforms
import json

# Per-channel statistics used to normalize the image tensors.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Resize to 224x224, convert to a tensor, then normalize each channel.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
transform = transforms.Compose(preprocessing_steps)

In [9]:
# File names of the four sample images described by the toy annotation set.
mini_images = ['000000000139.jpg', '000000000285.jpg', '000000000632.jpg', '000000000724.jpg']

# Image records: the numeric id is the file name's stem parsed as an integer.
image_records = [
    {"id": int(os.path.splitext(file_name)[0]), "file_name": file_name, "height": 480, "width": 640}
    for file_name in mini_images
]

# One placeholder box per image; annotation n points at category n.
annotation_records = [
    {"id": n, "image_id": image["id"], "category_id": n, "bbox": [0, 0, 100, 100]}
    for n, image in enumerate(image_records, start=1)
]

# Four categories with ids 1..4.
category_specs = [
    ("person", "person"),
    ("dog", "animal"),
    ("cat", "animal"),
    ("car", "vehicle"),
]
category_records = [
    {"id": n, "name": name, "supercategory": supercategory}
    for n, (name, supercategory) in enumerate(category_specs, start=1)
]

mini_annotations = {
    "images": image_records,
    "annotations": annotation_records,
    "categories": category_records,
}

# Persist the annotation set where the dataset class will look for it.
with open('coco_mini/annotations/instances_val2017.json', 'w') as f:
    f.write(json.dumps(mini_annotations))

In [None]:
# Real dataset
# !aws s3 cp s3://{bucket}/{prefix}/val2017/ coco_mini/val2017/ --recursive
# !aws s3 cp s3://{bucket}/{prefix}/annotations/instances_val2017.json coco_mini/annotations/


In [10]:
# Locations of the local image folder and its COCO-format annotation file.
coco_path = 'coco_mini/val2017/'
annotation_path = 'coco_mini/annotations/instances_val2017.json'

# Exploration view of the data: no transform is applied and at most 100
# images are kept.
exploration_kwargs = dict(
    root=coco_path,
    annFile=annotation_path,
    transform=None,
    max_samples=100,
)
coco_dataset = CocoClassificationDataset(**exploration_kwargs)

print(
    f"Number of images: {len(coco_dataset)}",
    f"Number of classes: {coco_dataset.num_classes}",
    sep="\n",
)

# Display a few random samples
def show_samples(dataset, num_samples=4):
    """Plot up to ``num_samples`` distinct, labelled images side by side.

    Indices are visited in random order *without replacement*, so no image is
    shown twice. Samples whose target is ``-1`` (no annotation) are skipped
    before the figure is laid out, so the row of subplots has no empty slots.
    If the dataset holds fewer labelled images than requested, only those are
    shown; if it holds none, a message is printed and no figure is created.

    Args:
        dataset: Indexable dataset returning ``(image, target)`` and offering
            ``get_category_name(target)``.
        num_samples: Maximum number of images to display.
    """
    picked = []
    for idx in np.random.permutation(len(dataset)):
        if len(picked) >= num_samples:
            break
        img, target = dataset[int(idx)]

        # Skip images without valid targets
        if target == -1:
            continue

        picked.append((img, dataset.get_category_name(target)))

    if not picked:
        print("No labelled samples to display.")
        return

    plt.figure(figsize=(15, 4))
    for slot, (img, category) in enumerate(picked, start=1):
        # Size the grid by what was actually collected, not by the request.
        plt.subplot(1, len(picked), slot)
        plt.imshow(img)
        plt.title(f"Class: {category}")
        plt.axis('off')
    plt.tight_layout()
    plt.show()

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Number of images: 4
Number of classes: 4


In [11]:
# Load ResNet-50 with pretrained weights and print its layer structure.
model = models.resnet50(pretrained=True)
print("ResNet-50 Architecture:", model, sep="\n")

# Show the classification head before it is replaced.
print("\nOriginal final layer:", model.fc, sep="\n")

# Swap the head for a linear layer with one output per dataset category.
num_classes = coco_dataset.num_classes
head_in_features = model.fc.in_features
model.fc = nn.Linear(head_in_features, num_classes)
print("\nModified final layer:", model.fc, sep="\n")

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /home/sagemaker-user/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 102MB/s] 

ResNet-50 Architecture:
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_




In [12]:
def _run_phase_mini(model, loader, criterion, optimizer=None, max_batches=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    Trains when ``optimizer`` is given, otherwise evaluates with gradients
    disabled. Samples labelled ``-1`` are removed from each batch; a batch is
    skipped only when none of its samples has a label. Returns ``None`` when
    no labelled sample was processed.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    correct = 0
    seen = 0

    for batch_idx, (inputs, labels) in enumerate(loader):
        # Cap the number of batches so the smoke test stays quick.
        if max_batches is not None and batch_idx >= max_batches:
            break

        # Keep only the samples that carry a real class index.
        labelled = labels != -1
        if not labelled.any():
            continue
        inputs = inputs[labelled].to(device)
        labels = labels[labelled].to(device)

        if training:
            optimizer.zero_grad()

        with torch.set_grad_enabled(training):
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

        # Weight the batch loss by its size (assumes the criterion averages
        # over the batch) so short batches do not skew the epoch mean.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        correct += (preds == labels).sum().item()
        seen += batch_size

    if seen == 0:
        return None
    return loss_sum / seen, correct / seen


def train_model_mini(model, train_loader, val_loader, criterion, optimizer, num_epochs=2):
    """A mini training loop to test in the notebook.

    Each epoch processes at most 4 training batches and 3 validation batches
    and prints the mean loss and accuracy over the samples actually used.
    Metrics are reported even when only a single batch ran, and an empty
    loader produces a notice instead of an error.

    Args:
        model: Network to train; it is moved to the notebook-level ``device``.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss taking ``(outputs, labels)``.
        optimizer: Optimizer stepping the model's parameters.
        num_epochs: Number of epochs to run.

    Returns:
        The same ``model`` object after training.
    """
    model.to(device)

    phases = (
        ("Train", train_loader, optimizer, 4),
        ("Val", val_loader, None, 3),
    )

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        print("-" * 10)

        for phase_name, loader, phase_optimizer, max_batches in phases:
            stats = _run_phase_mini(model, loader, criterion, phase_optimizer, max_batches)
            if stats is None:
                print(f"{phase_name}: no labelled samples were processed")
                continue
            epoch_loss, epoch_acc = stats
            print(f"{phase_name} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}")

    return model

In [13]:
# Preprocessing applied to every image: resize, convert to tensor, normalize.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

full_dataset = CocoClassificationDataset(
    root=coco_path,
    annFile=annotation_path,
    transform=transform,
    max_samples=100  # Limit samples for testing
)

# Split into train/val (80/20). A seeded generator keeps the split identical
# across kernel restarts so results can be reproduced.
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    full_dataset, [train_size, val_size], generator=split_generator
)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=2)

# Initialize model with pre-trained weights and a head sized for our classes
model = models.resnet50(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, full_dataset.num_classes)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!




In [14]:
# Smoke-test the training loop for two epochs on the small local dataset.
trained_model = train_model_mini(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=2,
)


Epoch 1/2
----------


In [15]:
# Create the folder for the training script; do nothing if it already exists.
os.makedirs('code', exist_ok=True)

In [16]:
# Write the full training script
with open('code/train.py', 'w') as f:
    f.write('''
import os
import argparse
import json
import logging
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.datasets import CocoDetection
from torch.utils.data import DataLoader, random_split

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

def parse_args():
    parser = argparse.ArgumentParser()
    
    # Data, model, and output directories
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--val', type=str, default=os.environ.get('SM_CHANNEL_VAL'))
    parser.add_argument('--annotations', type=str, default=os.environ.get('SM_CHANNEL_ANNOTATIONS'))
    
    # Training hyperparameters
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--learning-rate', type=float, default=0.001)
    parser.add_argument('--use-cuda', type=bool, default=True)
    
    return parser.parse_args()

class CocoClassificationDataset(CocoDetection):
    def __init__(self, root, annFile, transform=None):
        super(CocoClassificationDataset, self).__init__(root, annFile, transform)
        # Map category_id to an index for classification
        categories = self.coco.loadCats(self.coco.getCatIds())
        self.category_to_idx = {cat['id']: i for i, cat in enumerate(categories)}
        self.num_classes = len(categories)
        
    def __getitem__(self, index):
        img, anns = super(CocoClassificationDataset, self).__getitem__(index)
        # Use the first annotation's category as the target
        if len(anns) > 0:
            target = self.category_to_idx[anns[0]['category_id']]
        else:
            # Default to background class if no annotations
            target = -1
        return img, target

def create_data_loaders(data_dir, annotations_dir, batch_size):
    # Define data transforms
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    # Load the COCO dataset
    train_annotation_file = os.path.join(annotations_dir, 'instances_train2017.json')
    val_annotation_file = os.path.join(annotations_dir, 'instances_val2017.json')
    
    train_dir = os.path.join(data_dir, 'train2017')
    val_dir = os.path.join(data_dir, 'val2017')
    
    train_dataset = CocoClassificationDataset(root=train_dir, 
                                             annFile=train_annotation_file, 
                                             transform=transform)
    
    val_dataset = CocoClassificationDataset(root=val_dir, 
                                           annFile=val_annotation_file, 
                                           transform=transform)
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
    
    return train_loader, val_loader, train_dataset.num_classes

def train(model, train_loader, val_loader, optimizer, criterion, device, epochs, model_dir):
    best_val_acc = 0.0
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        
        for i, (inputs, targets) in enumerate(train_loader):
            # Skip batches with no valid targets
            if -1 in targets:
                continue
                
            inputs, targets = inputs.to(device), targets.to(device)
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            
            # Statistics
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
            if i % 100 == 99:  # Print every 100 mini-batches
                logger.info(f'Epoch: {epoch+1}, Batch: {i+1}, Loss: {running_loss/100:.3f}, Acc: {100.*correct/total:.3f}%')
                running_loss = 0.0
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for inputs, targets in val_loader:
                # Skip batches with no valid targets
                if -1 in targets:
                    continue
                    
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                
                val_loss += loss.item()
                _, predicted = outputs.max(1)
                val_total += targets.size(0)
                val_correct += predicted.eq(targets).sum().item()
        
        val_acc = 100. * val_correct / val_total if val_total > 0 else 0
        logger.info(f'Epoch: {epoch+1}, Val Loss: {val_loss/len(val_loader):.3f}, Val Acc: {val_acc:.3f}%')
        
        # Save the model if validation accuracy improved
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), os.path.join(model_dir, 'best_model.pth'))
            logger.info(f'New best model saved with validation accuracy: {val_acc:.3f}%')
    
    # Save the final model
    torch.save(model.state_dict(), os.path.join(model_dir, 'model.pth'))
    logger.info(f'Training completed. Best validation accuracy: {best_val_acc:.3f}%')

def main():
    args = parse_args()
    
    # Set device (GPU or CPU)
    device = torch.device("cuda" if torch.cuda.is_available() and args.use_cuda else "cpu")
    logger.info(f"Using device: {device}")
    
    # Create data loaders
    train_loader, val_loader, num_classes = create_data_loaders(
        args.train, args.annotations, args.batch_size)
    
    logger.info(f"Number of classes: {num_classes}")
    
    # Load pre-trained ResNet-50 model
    model = models.resnet50(pretrained=True)
    
    # Modify the final layer for our number of classes
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, num_classes)
    model = model.to(device)
    
    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9)
    
    # Train the model
    train(model, train_loader, val_loader, optimizer, criterion, device, args.epochs, args.model_dir)
    
    logger.info("Training completed!")

if __name__ == '__main__':
    main()
''')

In [None]:
# Configure the training job. `PyTorch` is the estimator class imported at the
# top of the notebook; `source_dir` is the folder that holds train.py.
estimator = PyTorch(entry_point='train.py',
                    source_dir='code/',
                    role=role,
                    framework_version='1.8.1',
                    py_version='py3',
                    instance_count=1,
                    instance_type='ml.p3.2xlarge',
                    hyperparameters={'batch-size': 32, 'epochs': 10, 'learning-rate': 0.001})

# Define data channels for the full MS COCO dataset in S3
# Note: You should have uploaded the COCO dataset to S3 already
data_channels = {
    'train': f'{s3_data_path}/train2017',
    'val': f'{s3_data_path}/val2017',
    'annotations': f'{s3_data_path}/annotations'
}

# Start the job without blocking the notebook, then report its name.
estimator.fit(data_channels, wait=False)
print(f"Training job started: {estimator.latest_training_job.name}")

In [None]:
# Evaluate

import sagemaker
from sagemaker.model import Model
from sagemaker.pytorch import PyTorchModel

# Assuming your model is already trained and stored in S3
# (placeholder URI: replace with the real model artifact location)
model_data = 's3://your-bucket/path/to/model.tar.gz'


# Use the same framework version as the training job (1.8.1) so the model is
# served by the PyTorch release it was trained with.
sagemaker_model =  PyTorchModel(
    model_data=model_data,
    role=sagemaker.get_execution_role(),
    framework_version='1.8.1',
    py_version='py3',
    entry_point='inference.py' # Your inference script
)

# Batch transform job that writes predictions to the given S3 prefix.
transformer = sagemaker_model.transformer(
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    output_path='s3://your-bucket/evaluation-output/'
)

# Placeholder input prefix: replace with the real validation data location.
transformer.transform('s3://your-bucket/coco-validation-data/')

In [None]:
from sagemaker.model_monitor import ModelQualityMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

# Baseline inputs (placeholder URIs: replace with real locations).
dataset_format = DatasetFormat.csv()
validation_dataset = 's3://your-bucket/path/to/coco-validation-data/'
validation_output = 's3://your-bucket/evaluation-results/'

# Create the model quality monitor
model_quality_monitor = ModelQualityMonitor(
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.xlarge',
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600
)

# The model trained above is a single-label image classifier, so the baseline
# uses the multiclass problem type ("ObjectDetection" is not a value the model
# quality monitor accepts).
# NOTE(review): `publish_cloudwatch_metrics` was removed from this call because
# it does not appear to be a `suggest_baseline` parameter; confirm against the
# installed SDK version.
model_quality_monitor.suggest_baseline(
    job_name='resnet50-coco-evaluation',
    baseline_dataset=validation_dataset,
    dataset_format=dataset_format,
    output_s3_uri=validation_output,
    problem_type="MulticlassClassification",
    inference_attribute='predictions',
    ground_truth_attribute='annotations',
)

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput 
# NOTE(review): `processor` is not defined in the visible notebook cells;
# confirm it is created before this cell runs.

# Inputs: the batch-transform predictions and the ground-truth annotations.
evaluation_inputs = [
    ProcessingInput(
        source=transformer.output_path,
        destination='/opt/ml/processing/model-output',
    ),
    ProcessingInput(
        source='s3://your-bucket/coco-ground-truth/',
        destination='/opt/ml/processing/ground-truth',
    ),
]

# Output: whatever the script writes to the evaluation folder is uploaded to S3.
evaluation_outputs = [
    ProcessingOutput(
        output_name='evaluation',
        source='/opt/ml/processing/evaluation',
        destination='s3://your-bucket/coco-evaluation-results/',
    ),
]

processor.run(
    code='evaluate_coco.py',
    inputs=evaluation_inputs,
    outputs=evaluation_outputs,
)