# **ResNet-18 CIFAR-10 Classifier: Coreset Selection and Knowledge Distillation**
---

In [None]:
# Mount Google Drive at /content/drive (Colab-only); later cells read the dataset from a folder under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/itü/projects/pattern

/content/drive/MyDrive/itü/projects/pattern


In [None]:
!ls cifar-10-batches-py/

batches.meta  data_batch_2  data_batch_4  readme.html
data_batch_1  data_batch_3  data_batch_5  test_batch


In [None]:
import pickle

def unpickle(file):
    """Load and return the object stored in a pickle file.

    Args:
        file: Path of the pickle file to read.

    Returns:
        The unpickled object. Because ``encoding='bytes'`` is passed to
        ``pickle.load``, 8-bit strings written by Python 2 are returned as
        ``bytes`` (e.g. dictionary keys such as ``b'data'``).
    """
    # NOTE: pickle can execute arbitrary code while loading -- only use this on
    # files obtained from a trusted source.
    with open(file, 'rb') as handle:
        # Return directly instead of binding to a local named `dict`, which
        # shadowed the builtin type.
        return pickle.load(handle, encoding='bytes')

# Read each training batch once so its structure can be inspected below.
data_batch_1, data_batch_2, data_batch_3, data_batch_4, data_batch_5 = map(
    unpickle,
    (
        'cifar-10-batches-py/data_batch_1',
        'cifar-10-batches-py/data_batch_2',
        'cifar-10-batches-py/data_batch_3',
        'cifar-10-batches-py/data_batch_4',
        'cifar-10-batches-py/data_batch_5',
    ),
)
# The held-out batch and the metadata file are read the same way.
test_batch, meta = map(
    unpickle,
    ('cifar-10-batches-py/test_batch', 'cifar-10-batches-py/batches.meta'),
)

# Show the top-level keys of one data batch and of the metadata dictionary.
print("Data Batch 1 keys:", data_batch_1.keys())
print("Meta keys:", meta.keys())

Data Batch 1 keys: dict_keys([b'batch_label', b'labels', b'data', b'filenames'])
Meta keys: dict_keys([b'num_cases_per_batch', b'label_names', b'num_vis'])


In [None]:
!pwd

/content/drive/MyDrive/itü/projects/pattern


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, Subset, DataLoader
from tqdm import tqdm
import numpy as np
from sklearn.cluster import KMeans
import pickle
import torch.nn.functional as F

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def unpickle(file):
    """Load and return the object stored in a pickle file.

    Args:
        file: Path of the pickle file to read.

    Returns:
        The unpickled object. Because ``encoding='bytes'`` is passed to
        ``pickle.load``, 8-bit strings written by Python 2 are returned as
        ``bytes`` (e.g. dictionary keys such as ``b'data'``).
    """
    # NOTE(review): an identical `unpickle` is already defined in an earlier
    # cell of this notebook; this definition silently replaces it.
    # NOTE: pickle can execute arbitrary code while loading -- only use this on
    # files obtained from a trusted source.
    with open(file, 'rb') as handle:
        # Return directly instead of binding to a local named `dict`, which
        # shadowed the builtin type.
        return pickle.load(handle, encoding='bytes')

def load_cifar10_data(data_dir):
    """Read the CIFAR-10 python batches found in ``data_dir``.

    The five ``data_batch_<i>`` files are stacked into the training split and
    ``test_batch`` becomes the test split. Every flat row is reshaped to
    ``(3, 32, 32)`` and then transposed so the channel axis comes last.

    Args:
        data_dir: Directory containing ``data_batch_1`` .. ``data_batch_5``
            and ``test_batch``.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` where the ``X`` arrays have
        shape ``(N, 32, 32, 3)`` and the ``y`` arrays hold the labels.
    """
    def _to_channels_last(flat_rows):
        # (N, 3072) -> (N, 3, 32, 32) -> (N, 32, 32, 3)
        return flat_rows.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)

    train_batches = [unpickle(f'{data_dir}/data_batch_{n}') for n in range(1, 6)]
    pixel_rows = [batch[b'data'] for batch in train_batches]
    label_list = [label for batch in train_batches for label in batch[b'labels']]

    X_train = _to_channels_last(np.vstack(pixel_rows))
    y_train = np.array(label_list)

    held_out = unpickle(f'{data_dir}/test_batch')
    X_test = _to_channels_last(held_out[b'data'])
    y_test = np.array(held_out[b'labels'])

    return (X_train, y_train), (X_test, y_test)

class Cifar10Raw(Dataset):
    """Map-style dataset over in-memory images and their integer labels.

    Args:
        images: Indexable collection of images; item ``i`` is handed to
            ``transform`` unchanged.
        labels: Sequence of class ids, stored as a ``torch.long`` tensor.
        transform: Optional callable applied to each image on access.
    """

    def __init__(self, images, labels, transform=None):
        self.transform = transform
        self.images = images
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for ``idx``, transforming the image if a transform is set."""
        sample = self.images[idx]
        target = self.labels[idx]
        if not self.transform:
            return sample, target
        return self.transform(sample), target

class CIFAR10Distiller_Coreset:
    """Pick a per-class coreset of real samples using k-means in feature space.

    Features are produced by every child module of ``model`` except the last
    one. For each class, k-means is fitted on that class's features and the
    sample nearest to each centroid is kept as a prototype.

    Args:
        model: Network whose children (minus the last) serve as the feature
            extractor. It is moved to ``device``.
        full_dataset: Dataset yielding ``(image, label)`` pairs to select from.
        device: Device used for feature extraction.
    """

    def __init__(self, model, full_dataset, device):
        self.model = model.to(device)
        self.full_dataset = full_dataset
        self.device = device
        # The extractor is built from the model's own child modules (not
        # copies), so calling .eval() here also switches those layers of
        # `model` to evaluation mode.
        self.feature_extractor = nn.Sequential(*list(self.model.children())[:-1])
        self.feature_extractor.eval()

    @torch.no_grad()
    def get_features(self, batch_size=256, num_workers=2):
        """Run the feature extractor over ``full_dataset`` in its stored order.

        Args:
            batch_size: Number of samples per forward pass.
            num_workers: Worker processes used by the temporary ``DataLoader``.

        Returns:
            ``(features, labels)`` as NumPy arrays; ``features`` has one
            flattened row per sample.
        """
        temp_loader = DataLoader(
            self.full_dataset,
            batch_size=batch_size,
            shuffle=False,  # keep row i aligned with dataset index i
            num_workers=num_workers,
            # Pinned host memory only helps host-to-GPU copies.
            pin_memory=torch.device(self.device).type == 'cuda',
        )
        all_features, all_labels = [], []
        prog = tqdm(temp_loader, desc="Extracting features")
        for images, labels in prog:
            images = images.to(self.device)
            features = self.feature_extractor(images).view(images.size(0), -1)
            all_features.append(features.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
        return np.concatenate(all_features), np.concatenate(all_labels)

    @staticmethod
    def _nearest_unique_rows(points, centers):
        """For each center, return the index of its nearest not-yet-chosen row of ``points``.

        Requires ``len(centers) <= len(points)``. A stable sort is used so
        that ties resolve to the lowest row index, matching ``np.argmin``.
        """
        taken = set()
        chosen = []
        for center in centers:
            ranked = np.argsort(np.linalg.norm(points - center, axis=1), kind='stable')
            pick = next(int(row) for row in ranked if int(row) not in taken)
            taken.add(pick)
            chosen.append(pick)
        return chosen

    def create_distilled_dataset(self, images_per_class=10):
        """Build the coreset as a ``Subset`` of ``full_dataset``.

        Args:
            images_per_class: Number of prototypes to keep per class. A class
                with fewer samples than this contributes all of its samples.

        Returns:
            ``Subset`` of ``full_dataset`` containing the selected indices,
            without duplicates.

        Raises:
            ValueError: If ``images_per_class`` is smaller than 1.
        """
        if images_per_class < 1:
            raise ValueError(f"images_per_class must be >= 1, got {images_per_class}")

        features, labels = self.get_features()
        distilled_indices = []
        # Iterate over the labels that actually occur instead of assuming they
        # are exactly 0..num_classes-1.
        for class_id in tqdm(np.unique(labels), desc="Finding prototypes per class"):
            indices_in_class = np.where(labels == class_id)[0]
            features_in_class = features[indices_in_class]
            # KMeans rejects n_clusters > n_samples, so cap it per class.
            n_clusters = min(images_per_class, len(indices_in_class))
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(features_in_class)
            # Two centroids can share the same nearest sample; choosing the
            # nearest *unused* sample keeps every prototype distinct.
            picked_rows = self._nearest_unique_rows(features_in_class, kmeans.cluster_centers_)
            distilled_indices.extend(int(i) for i in indices_in_class[picked_rows])
        print(f"\nDistilled coreset size: {len(distilled_indices)}")
        return Subset(self.full_dataset, distilled_indices)


class DistilledModelTrainer:
    """Train a classifier with cross-entropy and report its test accuracy.

    Args:
        model: Network to train; it is moved to ``device``.
        train_loader: Iterable of ``(images, labels)`` training batches.
        test_loader: Iterable of ``(images, labels)`` evaluation batches.
        device: Device on which batches and the model are placed.
    """

    def __init__(self, model, train_loader, test_loader, device):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.device = device
        self.criterion = nn.CrossEntropyLoss()

    def train(self, epochs, lr=0.01):
        """Train ``self.model`` with SGD and a cosine-annealed learning rate.

        Args:
            epochs: Number of passes over ``train_loader``; also used as the
                scheduler's ``T_max``.
            lr: Initial learning rate.
        """
        print(f"\n--- Starting training for {epochs} epochs with LR={lr} ---")
        optimizer = optim.SGD(self.model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

        for epoch in range(epochs):
            self.model.train()
            running_loss = 0.0
            prog = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{epochs}")
            for step, (images, labels) in enumerate(prog, start=1):
                images, labels = images.to(self.device), labels.to(self.device)
                optimizer.zero_grad()
                outputs = self.model(images)
                loss = self.criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                # Average over the batches seen so far. Dividing by the total
                # batch count understated the loss until the final step.
                prog.set_postfix(loss=f"{running_loss / step:.4f}")
            # The learning rate is annealed once per epoch.
            scheduler.step()

    def evaluate(self, description=""):
        """Compute and print top-1 accuracy on ``test_loader``.

        Args:
            description: Text inserted into the printed accuracy line.

        Returns:
            Accuracy as a percentage (0-100).

        Raises:
            ValueError: If ``test_loader`` yields no samples.
        """
        self.model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for images, labels in self.test_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                predicted = self.model(images).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        if total == 0:
            raise ValueError("test_loader produced no samples; accuracy is undefined")
        accuracy = 100 * correct / total
        print(f"Accuracy of the model {description}: {accuracy:.2f} %")
        return accuracy


class KnowledgeDistillationTrainer(DistilledModelTrainer):
    """Train a student on a blend of hard-label loss and teacher-matching loss.

    The per-batch objective is
    ``alpha * CE(student, labels) + (1 - alpha) * T**2 * KL(student_T || teacher_T)``,
    where both sets of logits are divided by the temperature ``T`` before the
    (log-)softmax.

    Args:
        student_model: Network to train; passed to the base trainer.
        teacher_model: Network providing the soft targets; moved to ``device``.
        train_loader: Iterable of ``(images, labels)`` training batches.
        test_loader: Iterable of ``(images, labels)`` evaluation batches.
        device: Device on which batches and both models are placed.
        alpha: Weight of the cross-entropy term; ``1 - alpha`` weights the
            distillation term.
        temperature: Divisor applied to both students' and teacher's logits.
    """

    def __init__(self, student_model, teacher_model, train_loader, test_loader, device, alpha=0.1, temperature=4):
        super().__init__(student_model, train_loader, test_loader, device)
        self.teacher_model = teacher_model.to(device)
        self.alpha = alpha
        self.temperature = temperature
        # 'batchmean' divides the summed KL divergence by the batch size.
        self.distillation_criterion = nn.KLDivLoss(reduction='batchmean')
        print(f"\nInitialized Knowledge Distillation Trainer with alpha={self.alpha}, temp={self.temperature}")

    def train(self, epochs, lr=0.01):
        """Train the student with SGD and a cosine-annealed learning rate.

        Args:
            epochs: Number of passes over ``train_loader``; also used as the
                scheduler's ``T_max``.
            lr: Initial learning rate.
        """
        print(f"\n--- Starting Knowledge Distillation training for {epochs} epochs with LR={lr} ---")
        optimizer = optim.SGD(self.model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
        # The teacher only supplies targets, so it stays in evaluation mode.
        self.teacher_model.eval()

        for epoch in range(epochs):
            self.model.train()
            running_loss = 0.0
            prog = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{epochs}")

            for step, (images, labels) in enumerate(prog, start=1):
                images, labels = images.to(self.device), labels.to(self.device)

                student_outputs = self.model(images)
                # No gradients are needed for the teacher's forward pass.
                with torch.no_grad():
                    teacher_outputs = self.teacher_model(images)

                # Hard-label term: cross-entropy against the ground truth.
                loss_hard = self.criterion(student_outputs, labels)

                # Soft-label term: KLDivLoss takes log-probabilities as input
                # and probabilities as target.
                loss_soft = self.distillation_criterion(
                    F.log_softmax(student_outputs / self.temperature, dim=1),
                    F.softmax(teacher_outputs / self.temperature, dim=1)
                )

                # The soft term is multiplied by temperature**2 before blending.
                loss = self.alpha * loss_hard + (1 - self.alpha) * (self.temperature ** 2) * loss_soft

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                # Average over the batches seen so far. Dividing by the total
                # batch count understated the loss until the final step.
                prog.set_postfix(loss=f"{running_loss / step:.4f}")

            # The learning rate is annealed once per epoch.
            scheduler.step()


if __name__ == '__main__':
    # Configuration shared by the teacher and every student run.
    SEED = 42
    EPOCHS = 50
    LEARNING_RATE = 0.02
    BATCH_SIZE = 256
    NUM_WORKERS = 2
    IMAGES_PER_CLASS = 500

    # Seed NumPy and PyTorch so weight initialisation, shuffling and the random
    # augmentations start from a fixed state. This narrows run-to-run variance
    # but does not by itself make GPU kernels bit-for-bit deterministic.
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    transform_train_augmented = transforms.Compose([
        transforms.ToTensor(),
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    transform_test_no_aug = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    cifar_path = 'cifar-10-batches-py'
    (X_train, y_train), (X_test, y_test) = load_cifar10_data(cifar_path)

    # Two views of the same training arrays: one plain, one augmented. Keeping
    # them as separate dataset objects means no experiment has to mutate a
    # dataset that another loader is still using.
    full_train_dataset_no_aug = Cifar10Raw(X_train, y_train, transform=transform_test_no_aug)
    full_train_dataset_aug = Cifar10Raw(X_train, y_train, transform=transform_train_augmented)
    test_dataset = Cifar10Raw(X_test, y_test, transform=transform_test_no_aug)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

    print("--- Pre-training a teacher model on the full dataset ---")
    teacher_model = torchvision.models.resnet18(weights=None, num_classes=10)
    full_train_loader = DataLoader(full_train_dataset_aug, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
    teacher_trainer = DistilledModelTrainer(teacher_model, full_train_loader, test_loader, DEVICE)
    teacher_trainer.train(epochs=EPOCHS, lr=LEARNING_RATE)
    acc = teacher_trainer.evaluate(description="No Distillation")

    print("\n--- Running the Coreset distillation process ---")
    # Features are extracted from un-augmented images so selection is not
    # affected by random crops or flips.
    distiller = CIFAR10Distiller_Coreset(model=teacher_model, full_dataset=full_train_dataset_no_aug, device=DEVICE)
    distilled_subset = distiller.create_distilled_dataset(images_per_class=IMAGES_PER_CLASS)

    print("\n\n===== EXPERIMENT 1: TRAINING ON CORESET WITHOUT AUGMENTATION =====")
    distilled_loader_no_aug = DataLoader(distilled_subset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
    student_model_no_aug = torchvision.models.resnet18(weights=None, num_classes=10)
    trainer_no_aug = DistilledModelTrainer(student_model_no_aug, distilled_loader_no_aug, test_loader, DEVICE)
    trainer_no_aug.train(epochs=EPOCHS, lr=LEARNING_RATE)
    acc_no_aug = trainer_no_aug.evaluate(description="on Coreset WITHOUT augmentation")

    print("\n\n===== EXPERIMENT 2: TRAINING ON CORESET WITH AUGMENTATION =====")
    # Same coreset indices, but viewed through the augmented dataset.
    # Previously the transform of the shared dataset was overwritten in place,
    # which also changed `full_train_dataset_no_aug` and the Experiment 1 loader.
    distilled_subset_aug = Subset(full_train_dataset_aug, distilled_subset.indices)
    distilled_loader_with_aug = DataLoader(distilled_subset_aug, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
    student_model_with_aug = torchvision.models.resnet18(weights=None, num_classes=10)
    trainer_with_aug = DistilledModelTrainer(student_model_with_aug, distilled_loader_with_aug, test_loader, DEVICE)
    trainer_with_aug.train(epochs=EPOCHS, lr=LEARNING_RATE)
    acc_with_aug = trainer_with_aug.evaluate(description="on Coreset WITH augmentation")

    print("\n\n===== EXPERIMENT 3: TRAINING ON CORESET WITH KNOWLEDGE DISTILLATION =====")
    student_model_kd = torchvision.models.resnet18(weights=None, num_classes=10)

    # The KD student trains on the augmented coreset loader, with the
    # pre-trained teacher supplying soft targets for each batch.
    trainer_kd = KnowledgeDistillationTrainer(
        student_model=student_model_kd,
        teacher_model=teacher_model,
        train_loader=distilled_loader_with_aug,
        test_loader=test_loader,
        device=DEVICE,
        alpha=0.1,
        temperature=4.0
    )
    trainer_kd.train(epochs=EPOCHS, lr=LEARNING_RATE)
    acc_with_kd = trainer_kd.evaluate(description="on Coreset WITH Knowledge Distillation")

    print("\n\n========== FINAL COMPARISON ==========")
    print(f"Accuracy WITHOUT distillation: {acc:.2f} %")
    print(f"Accuracy WITHOUT augmentation: {acc_no_aug:.2f} %")
    print(f"Accuracy WITH augmentation:    {acc_with_aug:.2f} %")
    print(f"Accuracy WITH Knowledge Distillation: {acc_with_kd:.2f} %")  # newly added result line
    print("======================================")

--- Pre-training a teacher model on the full dataset ---

--- Starting training for 50 epochs with LR=0.02 ---


Epoch 1/50: 100%|██████████| 196/196 [00:08<00:00, 23.16it/s, loss=1.7134]
Epoch 2/50: 100%|██████████| 196/196 [00:07<00:00, 26.31it/s, loss=1.3595]
Epoch 3/50: 100%|██████████| 196/196 [00:07<00:00, 26.23it/s, loss=1.1857]
Epoch 4/50: 100%|██████████| 196/196 [00:07<00:00, 25.90it/s, loss=1.0534]
Epoch 5/50: 100%|██████████| 196/196 [00:07<00:00, 25.59it/s, loss=0.9884]
Epoch 6/50: 100%|██████████| 196/196 [00:07<00:00, 25.11it/s, loss=0.9109]
Epoch 7/50: 100%|██████████| 196/196 [00:07<00:00, 24.72it/s, loss=0.8592]
Epoch 8/50: 100%|██████████| 196/196 [00:07<00:00, 24.63it/s, loss=0.8137]
Epoch 9/50: 100%|██████████| 196/196 [00:07<00:00, 24.86it/s, loss=0.7675]
Epoch 10/50: 100%|██████████| 196/196 [00:07<00:00, 25.24it/s, loss=0.7386]
Epoch 11/50: 100%|██████████| 196/196 [00:07<00:00, 25.61it/s, loss=0.7063]
Epoch 12/50: 100%|██████████| 196/196 [00:07<00:00, 25.71it/s, loss=0.6779]
Epoch 13/50: 100%|██████████| 196/196 [00:07<00:00, 25.83it/s, loss=0.6450]
Epoch 14/50: 100%|███

Accuracy of the model No Distillation: 83.58 %

--- Running the Coreset distillation process ---


Extracting features: 100%|██████████| 196/196 [00:03<00:00, 63.87it/s]
Finding prototypes per class: 100%|██████████| 10/10 [00:53<00:00,  5.33s/it]



Distilled coreset size: 5000


===== EXPERIMENT 1: TRAINING ON CORESET WITHOUT AUGMENTATION =====

--- Starting training for 50 epochs with LR=0.02 ---


Epoch 1/50: 100%|██████████| 20/20 [00:00<00:00, 21.34it/s, loss=2.0658]
Epoch 2/50: 100%|██████████| 20/20 [00:00<00:00, 23.79it/s, loss=1.6220]
Epoch 3/50: 100%|██████████| 20/20 [00:00<00:00, 23.91it/s, loss=1.3494]
Epoch 4/50: 100%|██████████| 20/20 [00:00<00:00, 23.75it/s, loss=0.9643]
Epoch 5/50: 100%|██████████| 20/20 [00:00<00:00, 23.90it/s, loss=0.7037]
Epoch 6/50: 100%|██████████| 20/20 [00:00<00:00, 23.56it/s, loss=0.5426]
Epoch 7/50: 100%|██████████| 20/20 [00:00<00:00, 23.48it/s, loss=0.4705]
Epoch 8/50: 100%|██████████| 20/20 [00:00<00:00, 23.61it/s, loss=0.3648]
Epoch 9/50: 100%|██████████| 20/20 [00:00<00:00, 23.49it/s, loss=0.2794]
Epoch 10/50: 100%|██████████| 20/20 [00:00<00:00, 23.52it/s, loss=0.1921]
Epoch 11/50: 100%|██████████| 20/20 [00:00<00:00, 23.55it/s, loss=0.1234]
Epoch 12/50: 100%|██████████| 20/20 [00:00<00:00, 23.41it/s, loss=0.0944]
Epoch 13/50: 100%|██████████| 20/20 [00:00<00:00, 23.68it/s, loss=0.0545]
Epoch 14/50: 100%|██████████| 20/20 [00:00<00:0

Accuracy of the model on Coreset WITHOUT augmentation: 51.28 %


===== EXPERIMENT 2: TRAINING ON CORESET WITH AUGMENTATION =====

--- Starting training for 50 epochs with LR=0.02 ---


Epoch 1/50: 100%|██████████| 20/20 [00:00<00:00, 22.13it/s, loss=2.2285]
Epoch 2/50: 100%|██████████| 20/20 [00:00<00:00, 22.07it/s, loss=1.9099]
Epoch 3/50: 100%|██████████| 20/20 [00:00<00:00, 22.31it/s, loss=1.7189]
Epoch 4/50: 100%|██████████| 20/20 [00:00<00:00, 22.09it/s, loss=1.5983]
Epoch 5/50: 100%|██████████| 20/20 [00:00<00:00, 22.01it/s, loss=1.5115]
Epoch 6/50: 100%|██████████| 20/20 [00:00<00:00, 22.04it/s, loss=1.4506]
Epoch 7/50: 100%|██████████| 20/20 [00:00<00:00, 22.09it/s, loss=1.4260]
Epoch 8/50: 100%|██████████| 20/20 [00:00<00:00, 22.37it/s, loss=1.3823]
Epoch 9/50: 100%|██████████| 20/20 [00:00<00:00, 22.34it/s, loss=1.3234]
Epoch 10/50: 100%|██████████| 20/20 [00:00<00:00, 22.32it/s, loss=1.2712]
Epoch 11/50: 100%|██████████| 20/20 [00:00<00:00, 22.43it/s, loss=1.2223]
Epoch 12/50: 100%|██████████| 20/20 [00:00<00:00, 22.52it/s, loss=1.1913]
Epoch 13/50: 100%|██████████| 20/20 [00:00<00:00, 22.51it/s, loss=1.1723]
Epoch 14/50: 100%|██████████| 20/20 [00:00<00:0

Accuracy of the model on Coreset WITH augmentation: 56.89 %


===== EXPERIMENT 3: TRAINING ON CORESET WITH KNOWLEDGE DISTILLATION =====

Initialized Knowledge Distillation Trainer with alpha=0.1, temp=4.0

--- Starting Knowledge Distillation training for 50 epochs with LR=0.02 ---


Epoch 1/50: 100%|██████████| 20/20 [00:01<00:00, 17.15it/s, loss=12.2760]
Epoch 2/50: 100%|██████████| 20/20 [00:01<00:00, 18.67it/s, loss=9.9716]
Epoch 3/50: 100%|██████████| 20/20 [00:01<00:00, 18.62it/s, loss=8.8745]
Epoch 4/50: 100%|██████████| 20/20 [00:01<00:00, 18.47it/s, loss=7.9289]
Epoch 5/50: 100%|██████████| 20/20 [00:01<00:00, 18.46it/s, loss=7.5234]
Epoch 6/50: 100%|██████████| 20/20 [00:01<00:00, 18.31it/s, loss=7.1366]
Epoch 7/50: 100%|██████████| 20/20 [00:01<00:00, 18.33it/s, loss=6.5735]
Epoch 8/50: 100%|██████████| 20/20 [00:01<00:00, 18.39it/s, loss=6.1392]
Epoch 9/50: 100%|██████████| 20/20 [00:01<00:00, 18.49it/s, loss=6.1272]
Epoch 10/50: 100%|██████████| 20/20 [00:01<00:00, 18.54it/s, loss=5.5865]
Epoch 11/50: 100%|██████████| 20/20 [00:01<00:00, 18.32it/s, loss=5.2557]
Epoch 12/50: 100%|██████████| 20/20 [00:01<00:00, 18.33it/s, loss=5.1023]
Epoch 13/50: 100%|██████████| 20/20 [00:01<00:00, 18.43it/s, loss=4.9331]
Epoch 14/50: 100%|██████████| 20/20 [00:01<00:

Accuracy of the model on Coreset WITH Knowledge Distillation: 70.21 %


Accuracy WITHOUT distillation: 83.58 %
Accuracy WITHOUT augmentation: 51.28 %
Accuracy WITH augmentation:    56.89 %
Accuracy WITH Knowledge Distillation: 70.21 %


# Results
---

In [None]:
TRAIN = True
if TRAIN:
    # No earlier cell imports matplotlib, which is why this cell failed with
    # "NameError: name 'plt' is not defined". Import it where it is used.
    import matplotlib.pyplot as plt

    # NOTE(review): no cell in this notebook defines `trainer`, and none of the
    # trainer classes above records a `metrics` history (loss / perplexity per
    # epoch). This cell looks carried over from a different notebook -- confirm
    # whether it should be removed or the trainers extended to log metrics.
    metrics = getattr(globals().get('trainer'), 'metrics', None)
    if metrics is None:
        print("No `trainer.metrics` history is available; skipping loss/perplexity plots.")
    else:
        plt.plot(metrics['train_loss'], color='red', label='train loss')
        plt.plot(metrics['val_loss'], color='orange', label='valid loss')
        plt.title('loss, lower=better')
        plt.legend()
        plt.show()
        plt.figure()
        plt.plot(metrics['train_perplexity'], color='blue', label='train perplexity')
        plt.plot(metrics['val_perplexity'], color='lightblue', label='valid perplexity')
        plt.title('perplexity, lower=better')
        plt.legend()
        plt.show()

NameError: name 'plt' is not defined