In [1]:
import torch
import torchvision
import torchvision.transforms as transforms

# Define transforms for data augmentation and normalization
transform = transforms.Compose([transforms.Resize((224, 224)), 
                                transforms.ToTensor(), 
                                transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
                               ])

# Load CIFAR-10 dataset
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
num_clusters=10

# Create DataLoader for batch processing
trainloader = torch.utils.data.DataLoader(trainset, batch_size=1000, shuffle=False, num_workers=2)
testloader = torch.utils.data.DataLoader(testset, batch_size=1000, shuffle=False, num_workers=2)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 48839288.92it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [2]:
import torch.nn as nn
import torchvision.models as models
import numpy as np
from tqdm import tqdm

# Load pretrained ResNet model and modify it to act as a feature extractor
resnet50 = models.resnet50(pretrained=True)
resnet50 = nn.Sequential(*list(resnet50.children())[:-1])  # Remove the final classification layer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
resnet50 = resnet50.to(device)

# Function to extract features
def extract_features(dataloader, dataset_name='Dataset'):
    resnet50.eval()  # Set model to evaluation mode
    features = []
    labels = []
    with torch.no_grad():
        for inputs, targets in tqdm(dataloader, desc=f'Extracting features from {dataset_name}', unit='batch', total=len(dataloader)):
            outputs = resnet50(inputs.to(device)).squeeze()
            features.append(outputs.cpu().numpy())
            labels.append(targets.numpy())
    return np.vstack(features), np.hstack(labels)



# Extract features from train and test set
train_features, train_labels = extract_features(trainloader, 'Train Set')
test_features, test_labels = extract_features(testloader, 'Test Set')

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 202MB/s]
Extracting features from Train Set: 100%|██████████| 50/50 [02:24<00:00,  2.88s/batch]
Extracting features from Test Set: 100%|██████████| 10/10 [00:31<00:00,  3.20s/batch]


In [3]:
from sklearn.cluster import KMeans
import numpy as np

# Apply K-means clustering to training set
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

train_clusters = kmeans.fit_predict(train_features)

# Assign each test image to its nearest cluster
test_clusters = kmeans.predict(test_features)



In [4]:
from sklearn.metrics.pairwise import cosine_similarity

def get_top_k_matches(train_features, test_feature, cluster_indices, k=50):
    # Get the features of images belonging to the same cluster
    cluster_features = train_features[cluster_indices]
    
    # Compute cosine similarity between the test image and cluster images
    similarities = cosine_similarity(test_feature.reshape(1, -1), cluster_features).flatten()
    
    # Get the top k most similar images
    top_k_indices = np.argsort(similarities)[::-1][:k]
    return cluster_indices[top_k_indices]

In [5]:
# For each test image, find the top 50 matches
top_k_matches = []
for i, test_feature in tqdm(enumerate(test_features), desc='Finding Top-K Matches', unit='image', total=len(test_features)):
    # Find the training images belonging to the same cluster
    cluster_indices = np.where(train_clusters == test_clusters[i])[0]
    
    # Get the top 50 matches based on cosine similarity
    top_k_matches.append(get_top_k_matches(train_features, test_feature, cluster_indices))

Finding Top-K Matches: 100%|██████████| 10000/10000 [09:54<00:00, 16.83image/s]


In [6]:
def precision_at_k(true_label, top_k_labels, k):
    top_k = top_k_labels[:k]
    correct = np.sum(top_k == true_label)
    return correct / k

def mean_average_precision(true_label, top_k_labels):
    # Calculate precision at each rank and then compute average precision
    precisions = []
    correct = 0
    for i, label in enumerate(top_k_labels):
        if label == true_label:
            correct += 1
            precisions.append(correct / (i + 1))
    return np.mean(precisions) if precisions else 0


In [7]:
# Evaluate for all test images
precision_10 = []
precision_50 = []
mean_ap = []
for i, matches in tqdm(enumerate(top_k_matches), desc='Evaluating Metrics', unit='image', total=len(top_k_matches)):
    true_label = test_labels[i]
    matched_labels = train_labels[matches]
    
    precision_10.append(precision_at_k(true_label, matched_labels, 10))
    precision_50.append(precision_at_k(true_label, matched_labels, 50))
    mean_ap.append(mean_average_precision(true_label, matched_labels))

# Report final metrics
print(f'Mean Precision@10: {np.mean(precision_10):.4f}')
print(f'Mean Precision@50: {np.mean(precision_50):.4f}')
print(f'Mean Average Precision: {np.mean(mean_ap):.4f}')

Evaluating Metrics: 100%|██████████| 10000/10000 [00:00<00:00, 15817.77image/s]

Mean Precision@10: 0.7966
Mean Precision@50: 0.7629
Mean Average Precision: 0.8051





In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=50)

train_features_pca = pca.fit_transform(train_features)
test_features_pca = pca.transform(test_features)

In [9]:
kmeans_pca = KMeans(n_clusters=num_clusters, random_state=42)

train_clusters_pca = kmeans_pca.fit_predict(train_features_pca)

# Assign each test image to its nearest cluster
test_clusters_pca = kmeans_pca.predict(test_features_pca)



In [10]:
# For each test image, find the top 50 matches
top_k_matches_pca = []
for i, test_feature_pca in tqdm(enumerate(test_features_pca), desc='Finding Top-K Matches (PCA)', unit='image', total=len(test_features_pca)):
    # Find the training images belonging to the same cluster
    cluster_indices = np.where(train_clusters_pca == test_clusters_pca[i])[0]
    
    # Get the top 50 matches based on cosine similarity
    top_k_matches_pca.append(get_top_k_matches(train_features_pca, test_feature_pca, cluster_indices))

Finding Top-K Matches (PCA): 100%|██████████| 10000/10000 [00:30<00:00, 323.01image/s]


In [11]:
# Evaluate for all test images
precision_10_pca = []
precision_50_pca = []
mean_ap_pca = []
for i, matches in tqdm(enumerate(top_k_matches_pca), desc='Evaluating Metrics (PCA)', unit='image', total=len(top_k_matches_pca)):
    true_label = test_labels[i]
    matched_labels = train_labels[matches]
    
    precision_10.append(precision_at_k(true_label, matched_labels, 10))
    precision_50.append(precision_at_k(true_label, matched_labels, 50))
    mean_ap.append(mean_average_precision(true_label, matched_labels))

# Report final metrics
print(f'Mean Precision@10: {np.mean(precision_10):.4f}')
print(f'Mean Precision@50: {np.mean(precision_50):.4f}')
print(f'Mean Average Precision: {np.mean(mean_ap):.4f}')

Evaluating Metrics (PCA): 100%|██████████| 10000/10000 [00:00<00:00, 15804.93image/s]

Mean Precision@10: 0.7926
Mean Precision@50: 0.7608
Mean Average Precision: 0.8020



