In [1]:
!ls

Untitled.ipynb	 __MACOSX	      onstart.sh
Untitled1.ipynb  msmt17_combined      ports.log
Untitled2.ipynb  msmt17_combined.zip  tensorflow-tutorials


In [2]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR, CosineAnnealingWarmRestarts
from torch.utils.data import DataLoader, Dataset
from transformers import ViTModel, ViTFeatureExtractor
from torchvision import transforms
from PIL import Image
import random
import numpy as np
from torch.nn import TripletMarginLoss
from torch.cuda.amp import autocast, GradScaler  # For mixed-precision training
import time
from tqdm import tqdm

# Parameters
IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 30
EMBEDDING_DIM = 512
INITIAL_LR = 1e-4
MARGIN = 0.3  # Triplet loss margin
DROPOUT_PROB = 0.3
WEIGHT_DECAY = 1e-5

# Ensure reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Check for CUDA availability
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using CUDA device: {torch.cuda.get_device_name(0)}")
else:
    raise RuntimeError("This script requires a CUDA-enabled GPU to run.")

# Preprocessing
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

class TripletDataset(Dataset):
    def __init__(self, image_paths, labels):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transforms.Compose([
            transforms.Resize((IMG_SIZE, IMG_SIZE)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomApply([transforms.ColorJitter(0.3, 0.3, 0.3, 0.1)], p=0.5),
            transforms.RandomApply([transforms.GaussianBlur(3)], p=0.2),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        ])
        self.label_to_indices = {label: np.where(np.array(labels) == label)[0].tolist() for label in set(labels)}

    def __getitem__(self, index):
        anchor_path = self.image_paths[index]
        anchor_label = self.labels[index]

        # Positive and negative mining logic
        positive_index = random.choice(self.label_to_indices[anchor_label])
        while positive_index == index:
            positive_index = random.choice(self.label_to_indices[anchor_label])
        positive_path = self.image_paths[positive_index]

        negative_label = random.choice(list(set(self.labels) - {anchor_label}))
        negative_index = random.choice(self.label_to_indices[negative_label])
        negative_path = self.image_paths[negative_index]

        anchor = self.transform(Image.open(anchor_path).convert('RGB'))
        positive = self.transform(Image.open(positive_path).convert('RGB'))
        negative = self.transform(Image.open(negative_path).convert('RGB'))

        return anchor, positive, negative, anchor_label

    def __len__(self):
        return len(self.image_paths)

# Model definition with dropout
class PersonReIDTransformer(nn.Module):
    def __init__(self, embedding_dim=512, dropout_prob=0.3):
        super(PersonReIDTransformer, self).__init__()
        self.backbone = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.embedding = nn.Sequential(
            nn.Linear(self.backbone.config.hidden_size, embedding_dim),
            nn.Dropout(p=dropout_prob)
        )

    def forward(self, x):
        outputs = self.backbone(x).last_hidden_state[:, 0]
        embeddings = self.embedding(outputs)
        return embeddings

# Initialize model
model = PersonReIDTransformer(dropout_prob=DROPOUT_PROB).to(device)

# Unfreeze layers for fine-tuning
def unfreeze_layers(model, percentage_unfrozen):
    total_layers = len(list(model.backbone.parameters()))
    unfreeze_count = int(total_layers * percentage_unfrozen)
    for i, param in enumerate(model.backbone.parameters()):
        if i < unfreeze_count:
            param.requires_grad = True
        else:
            param.requires_grad = False
    print(f"Unfroze {unfreeze_count}/{total_layers} layers.")

unfreeze_layers(model, 0.15)

# Mixed precision
scaler = GradScaler()

# Triplet Loss function
criterion = TripletMarginLoss(margin=MARGIN)

# Optimizer and learning rate scheduler
optimizer = optim.Adam(model.parameters(), lr=INITIAL_LR, weight_decay=WEIGHT_DECAY)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2, eta_min=1e-6)

# Load the dataset paths and labels
def get_image_paths_and_labels(root_dir):
    image_paths = []
    labels = []
    for person_id in os.listdir(root_dir):
        person_dir = os.path.join(root_dir, person_id)
        if os.path.isdir(person_dir):
            for image_name in os.listdir(person_dir):
                if image_name.endswith('.jpg'):
                    image_paths.append(os.path.join(person_dir, image_name))
                    labels.append(int(person_id))
    return image_paths, labels

# Load data
train_paths, train_labels = get_image_paths_and_labels("./msmt17_combined/train")
dataset = TripletDataset(train_paths, train_labels)
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

# Training loop with mixed precision
model.train()
total_batches = len(data_loader)
for epoch in range(EPOCHS):
    epoch_loss = 0
    start_time = time.time()
    
    progress_bar = tqdm(data_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False)
    
    for batch_idx, batch in enumerate(progress_bar):
        anchor, positive, negative, labels = batch
        anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)

        optimizer.zero_grad()

        # Mixed precision training
        with autocast():
            anchor_embedding = model(anchor)
            positive_embedding = model(positive)
            negative_embedding = model(negative)

            loss = criterion(anchor_embedding, positive_embedding, negative_embedding)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()

        # Update progress bar
        progress_bar.set_postfix({'Loss': f"{loss.item():.4f}"})

    scheduler.step()
    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1}/{EPOCHS} completed in {epoch_time:.2f} seconds")
    print(f"Epoch {epoch+1}/{EPOCHS}, Average Loss: {epoch_loss / total_batches:.4f}")

    # Optional: Save model checkpoint after each epoch
    torch.save(model.state_dict(), f'person_reid_model_vit_msmt17_epoch_{epoch+1}.pth')

print("Training completed!")
torch.save(model.state_dict(), 'person_reid_model_vit_msmt17_final.pth')


2024-10-14 00:32:26.241215: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-14 00:32:26.255235: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-14 00:32:26.270369: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-14 00:32:26.275295: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-14 00:32:26.287738: I tensorflow/core/platform/cpu_feature_guar

Using CUDA device: NVIDIA H100 NVL




Unfroze 30/200 layers.


  scaler = GradScaler()
  with autocast():
                                                                            

Epoch 1/30 completed in 948.80 seconds
Epoch 1/30, Average Loss: 0.0681


                                                                            

Epoch 2/30 completed in 974.90 seconds
Epoch 2/30, Average Loss: 0.0334


                                                                            

Epoch 3/30 completed in 981.13 seconds
Epoch 3/30, Average Loss: 0.0271


                                                                            

Epoch 4/30 completed in 979.68 seconds
Epoch 4/30, Average Loss: 0.0230


                                                                            

Epoch 5/30 completed in 968.41 seconds
Epoch 5/30, Average Loss: 0.0202


                                                                            

Epoch 6/30 completed in 956.23 seconds
Epoch 6/30, Average Loss: 0.0273


                                                                            

Epoch 7/30 completed in 989.92 seconds
Epoch 7/30, Average Loss: 0.0259


                                                                            

Epoch 8/30 completed in 980.24 seconds
Epoch 8/30, Average Loss: 0.0238


                                                                            

Epoch 9/30 completed in 985.79 seconds
Epoch 9/30, Average Loss: 0.0223


                                                                             

Epoch 10/30 completed in 979.88 seconds
Epoch 10/30, Average Loss: 0.0204


                                                                             

Epoch 11/30 completed in 965.45 seconds
Epoch 11/30, Average Loss: 0.0190


                                                                             

Epoch 12/30 completed in 970.64 seconds
Epoch 12/30, Average Loss: 0.0169


                                                                             

Epoch 13/30 completed in 950.57 seconds
Epoch 13/30, Average Loss: 0.0158


                                                                             

Epoch 14/30 completed in 982.05 seconds
Epoch 14/30, Average Loss: 0.0147


                                                                             

Epoch 15/30 completed in 970.20 seconds
Epoch 15/30, Average Loss: 0.0137


                                                                             

Epoch 16/30 completed in 921.61 seconds
Epoch 16/30, Average Loss: 0.0225


                                                                             

Epoch 17/30 completed in 926.76 seconds
Epoch 17/30, Average Loss: 0.0225


                                                                             

Epoch 18/30 completed in 966.09 seconds
Epoch 18/30, Average Loss: 0.0222


                                                                             

Epoch 19/30 completed in 969.09 seconds
Epoch 19/30, Average Loss: 0.0222


                                                                             

Epoch 20/30 completed in 956.18 seconds
Epoch 20/30, Average Loss: 0.0216


                                                                             

Epoch 21/30 completed in 991.70 seconds
Epoch 21/30, Average Loss: 0.0206


                                                                             

Epoch 22/30 completed in 999.11 seconds
Epoch 22/30, Average Loss: 0.0206


                                                                             

Epoch 23/30 completed in 974.53 seconds
Epoch 23/30, Average Loss: 0.0193


                                                                             

Epoch 24/30 completed in 942.98 seconds
Epoch 24/30, Average Loss: 0.0191


                                                                             

Epoch 25/30 completed in 991.31 seconds
Epoch 25/30, Average Loss: 0.0177


                                                                             

Epoch 26/30 completed in 980.22 seconds
Epoch 26/30, Average Loss: 0.0170


                                                                             

Epoch 27/30 completed in 977.25 seconds
Epoch 27/30, Average Loss: 0.0159


                                                                             

Epoch 28/30 completed in 977.12 seconds
Epoch 28/30, Average Loss: 0.0155


                                                                             

Epoch 29/30 completed in 971.22 seconds
Epoch 29/30, Average Loss: 0.0135


                                                                             

Epoch 30/30 completed in 774.02 seconds
Epoch 30/30, Average Loss: 0.0135
Training completed!
