In [1]:
f = open("categories.txt","r")
# And for reading use
classes = f.readlines()
f.close()

In [2]:
classes = [c.replace('\n','').replace(' ','_') for c in classes]
print(classes)
print(len(classes))

['aircraft_carrier', 'airplane', 'alarm_clock', 'ambulance', 'angel', 'animal_migration', 'ant', 'anvil', 'apple', 'arm', 'asparagus', 'axe', 'backpack', 'banana', 'bandage', 'barn', 'baseball', 'baseball_bat', 'basket', 'basketball', 'bat', 'bathtub', 'beach', 'bear', 'beard', 'bed', 'bee', 'belt', 'bench', 'bicycle', 'binoculars', 'bird', 'birthday_cake', 'blackberry', 'blueberry', 'book', 'boomerang', 'bottlecap', 'bowtie', 'bracelet', 'brain', 'bread', 'bridge', 'broccoli', 'broom', 'bucket', 'bulldozer', 'bus', 'bush', 'butterfly', 'cactus', 'cake', 'calculator', 'calendar', 'camel', 'camera', 'camouflage', 'campfire', 'candle', 'cannon', 'canoe', 'car', 'carrot', 'castle', 'cat', 'ceiling_fan', 'cello', 'cell_phone', 'chair', 'chandelier', 'church', 'circle', 'clarinet', 'clock', 'cloud', 'coffee_cup', 'compass', 'computer', 'cookie', 'cooler', 'couch', 'cow', 'crab', 'crayon', 'crocodile', 'crown', 'cruise_ship', 'cup', 'diamond', 'dishwasher', 'diving_board', 'dog', 'dolphin', 

In [None]:
# ###################################### download data ######################################

# import urllib
# import urllib.request
# # data = urllib.request.urlretrieve("http://...")
# def download():
  
#   base = 'https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/'
#   for c in classes:
#     cls_url = c.replace('_', '%20')
#     path = base+cls_url+'.npy'
#     print(path)
#     urllib.request.urlretrieve(path, 'data/'+c+'.npy')

# download()

In [30]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import os
import torchvision.transforms as transforms
from torchvision import models
import torch.nn as nn
from tqdm import tqdm 
import torch.optim as optim

num_workers = 0
subset_size = 10000
num_epochs = 10

class MultiNpyDataset(Dataset):
    def __init__(self, npy_files, transform=None):
        self.npy_files = npy_files
        self.transform = transform
        
        # Calculate total length and file mappings
        self.file_sizes = []
        self.cumulative_sizes = [0]
        
        for npy_file in self.npy_files:
            # Get file size without loading entire array
            np_array = np.load(npy_file, mmap_mode='r')
            np_array = np_array[:subset_size]
            size = len(np_array)
            self.file_sizes.append(size)
            self.cumulative_sizes.append(self.cumulative_sizes[-1] + size)
            # with open(npy_file, 'rb') as f:
            #     np_array = np.load(f, mmap_mode='r')
            #     size = len(np_array)
            #     self.file_sizes.append(size)
            #     self.cumulative_sizes.append(self.cumulative_sizes[-1] + size)
    
    def __len__(self):
        return self.cumulative_sizes[-1]
    
    def __getitem__(self, idx):
        # Find which file contains this index
        file_idx = np.searchsorted(self.cumulative_sizes, idx, side='right') - 1
        idx_in_file = idx - self.cumulative_sizes[file_idx]
        
        # Load only the required sample using memory mapping
        x_array = np.load(self.npy_files[file_idx], mmap_mode='r')[:subset_size]
        x = x_array[idx_in_file].copy()
        # with open(self.npy_files[file_idx], 'rb') as f:
        #     x_array = np.load(f, mmap_mode='r')
        #     x = x_array[idx_in_file].copy()  # Make a copy to ensure it's writable
        
        # Convert to tensor
        # x = torch.from_numpy(x).float()
        # print(type(x))
        x = x.reshape((28,28,1))
        if transform:
            x = transform(x)
        # print(type(x))
        
        return x, file_idx
        

transform = transforms.Compose([
    transforms.ToTensor(),
    # transforms.Resize((28,28)),
    transforms.Lambda(lambda x: x.float()),
    transforms.Resize((224, 224)),  # Resize to match ResNet50's expected dimensions
    # transforms.Grayscale(3)         # Convert grayscale to 3-channel by replication
    # Alternatively: transforms.Lambda(lambda x: x.repeat(3, 1, 1)) for tensor inputs
    transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.size(0) == 1 else x)
])
# image_vector_reshaped = data_1[0].reshape((28,28,1))
# transform(image_vector_reshaped)


npy_files = [f'data/{file_name}.npy' for file_name in classes]  # Adjust path as needed
dataset = MultiNpyDataset(npy_files)
dataloader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True
)

In [31]:
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])


# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False, num_workers=num_workers)

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Get the number of classes
num_classes = 345
print(f"Number of classes: {num_classes}")


# Load pre-trained ResNet50 model
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

# Initially freeze all layers
for param in model.parameters():
    param.requires_grad = False

# Replace the final fully connected layer
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)
# Make sure the fc layer parameters require gradients
for param in model.fc.parameters():
    param.requires_grad = True

# Move model to device
model = model.to(device)

# Define loss function
criterion = nn.CrossEntropyLoss()

Using device: cuda
Number of classes: 345


In [33]:
import mlflow
# import mlflow.pytorch
from tqdm import tqdm
import torch.optim as optim
import torch

In [34]:
def train_one_phase(model, criterion, optimizer, train_loader, val_loader, num_epochs=3, device='cuda'):
    best_acc = 0.0
    
    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)
        
        # Training phase
        model.train()
        running_loss = 0.0
        running_corrects = 0
        
        for inputs, labels in tqdm(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
        
        train_loss = running_loss / len(train_loader.dataset)
        train_acc = running_corrects.double() / len(train_loader.dataset)
        
        print(f'Train Loss: {train_loss:.4f} Acc: {train_acc:.4f}')
        mlflow.log_metric('train_loss', train_loss, step=epoch)
        mlflow.log_metric('train_acc', train_acc.item(), step=epoch)
        
        # Validation phase
        model.eval()
        running_loss = 0.0
        running_corrects = 0
        
        with torch.no_grad():
            for inputs, labels in tqdm(val_loader):
                inputs = inputs.to(device)
                labels = labels.to(device)
                
                # Forward pass
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
        
        val_loss = running_loss / len(val_loader.dataset)
        val_acc = running_corrects.double() / len(val_loader.dataset)
        
        print(f'Val Loss: {val_loss:.4f} Acc: {val_acc:.4f}')
        mlflow.log_metric('val_loss', val_loss, step=epoch)
        mlflow.log_metric('val_acc', val_acc.item(), step=epoch)


        # Save the best model
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), 'best_model_last_layer_unfreezed.pth')
            mlflow.log_metric('best_val_acc', best_acc.item(), step=epoch)
            
        print()

In [36]:
def check_unfreeze(name, list_of_names_to_unfreeze):
    for unfreeze_name in list_of_names_to_unfreeze:
        if unfreeze_name in name:
            return True
    return False


def unfreeze_and_train(model, criterion, train_loader, val_loader, device, layer_names_to_unfreeze, lr, phase_no):
    ############ Phase 1: Train only the fully connected layer ###########
    print("Phase "+str(phase_no)+": Training")
    # Make sure all other layers are frozen
    for name, param in model.named_parameters():
        if check_unfreeze(name, layer_names_to_unfreeze):
            # print(name, False)
            param.requires_grad = True
        else:
            # print(name, True)
            param.requires_grad = False
        

    # Create an optimizer that only updates the fc parameters
    print()
    optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), lr=lr)
    train_one_phase(model, criterion, optimizer, train_loader, val_loader, num_epochs=num_epochs, device=device)
    return model

In [28]:
# Example MLflow experiment run
mlflow.set_experiment("doodle-clasifier-initial")
with mlflow.start_run(run_name="doodle-classifier-initial-run"):
    mlflow.log_param("learning_rate", 0.001)
    mlflow.log_param("unfrozen_layers", ['fc'])
    mlflow.log_param("phase_no", 1)
    model = unfreeze_and_train(model, criterion, train_loader, val_loader, device, layer_names_to_unfreeze=['fc'], lr=0.001, phase_no=1)

Phase 1: Training

Epoch 1/3
----------


100%|██████████| 540/540 [10:49<00:00,  1.20s/it]


Train Loss: 3.7759 Acc: 0.3013


100%|██████████| 135/135 [02:46<00:00,  1.23s/it]


Val Loss: 3.0529 Acc: 0.3822

Epoch 2/3
----------


100%|██████████| 540/540 [10:34<00:00,  1.18s/it]


Train Loss: 2.7636 Acc: 0.4293


100%|██████████| 135/135 [02:37<00:00,  1.17s/it]


Val Loss: 2.7308 Acc: 0.4205

Epoch 3/3
----------


100%|██████████| 540/540 [10:57<00:00,  1.22s/it]


Train Loss: 2.4727 Acc: 0.4740


100%|██████████| 135/135 [02:13<00:00,  1.01it/s]


Val Loss: 2.5869 Acc: 0.4417

🏃 View run doodle-classifier-initial-run at: http://localhost:5000/#/experiments/451096862960654107/runs/40a4e5e5015b4f4ab3c8c87affd6e63a
🧪 View experiment at: http://localhost:5000/#/experiments/451096862960654107


In [29]:
for lr in [0.001, 0.0005]:
    for layers in [['fc'], ['fc', 'layer4']]:
        with mlflow.start_run(run_name=f"MyRun_lr_{lr}_layers_{'_'.join(layers)}"):
            mlflow.log_param("learning_rate", lr)
            mlflow.log_param("unfrozen_layers", layers)
            model = unfreeze_and_train(model, criterion, train_loader, val_loader, device, layer_names_to_unfreeze=layers, lr=lr, phase_no=1)

Phase 1: Training

Epoch 1/3
----------


100%|██████████| 540/540 [11:35<00:00,  1.29s/it]


Train Loss: 2.3009 Acc: 0.5009


100%|██████████| 135/135 [02:55<00:00,  1.30s/it]


Val Loss: 2.4906 Acc: 0.4540

Epoch 2/3
----------


100%|██████████| 540/540 [11:39<00:00,  1.30s/it]


Train Loss: 2.1704 Acc: 0.5268


100%|██████████| 135/135 [02:32<00:00,  1.13s/it]


Val Loss: 2.4444 Acc: 0.4605

Epoch 3/3
----------


100%|██████████| 540/540 [10:04<00:00,  1.12s/it]


Train Loss: 2.0744 Acc: 0.5449


100%|██████████| 135/135 [02:07<00:00,  1.06it/s]


Val Loss: 2.4279 Acc: 0.4632

🏃 View run MyRun_lr_0.001_layers_fc at: http://localhost:5000/#/experiments/451096862960654107/runs/a1756296ba174e8ca2d17904704479d9
🧪 View experiment at: http://localhost:5000/#/experiments/451096862960654107
Phase 1: Training

Epoch 1/3
----------


100%|██████████| 540/540 [11:08<00:00,  1.24s/it]


Train Loss: 1.6763 Acc: 0.5997


100%|██████████| 135/135 [02:08<00:00,  1.05it/s]


Val Loss: 1.6091 Acc: 0.6134

Epoch 2/3
----------


100%|██████████| 540/540 [12:10<00:00,  1.35s/it]


Train Loss: 1.1828 Acc: 0.7077


100%|██████████| 135/135 [02:16<00:00,  1.01s/it]


Val Loss: 1.5480 Acc: 0.6266

Epoch 3/3
----------


100%|██████████| 540/540 [11:24<00:00,  1.27s/it]


Train Loss: 0.8350 Acc: 0.7904


100%|██████████| 135/135 [02:52<00:00,  1.28s/it]


Val Loss: 1.5950 Acc: 0.6260

🏃 View run MyRun_lr_0.001_layers_fc_layer4 at: http://localhost:5000/#/experiments/451096862960654107/runs/c4731b12cc1f42ddae8bba5627e911fb
🧪 View experiment at: http://localhost:5000/#/experiments/451096862960654107
Phase 1: Training

Epoch 1/3
----------


100%|██████████| 540/540 [10:19<00:00,  1.15s/it]


Train Loss: 0.4500 Acc: 0.9076


100%|██████████| 135/135 [02:13<00:00,  1.01it/s]


Val Loss: 1.4817 Acc: 0.6494

Epoch 2/3
----------


100%|██████████| 540/540 [11:15<00:00,  1.25s/it]


Train Loss: 0.4047 Acc: 0.9205


100%|██████████| 135/135 [02:52<00:00,  1.28s/it]


Val Loss: 1.4894 Acc: 0.6497

Epoch 3/3
----------


100%|██████████| 540/540 [11:13<00:00,  1.25s/it]


Train Loss: 0.3744 Acc: 0.9284


100%|██████████| 135/135 [02:32<00:00,  1.13s/it]


Val Loss: 1.4971 Acc: 0.6503

🏃 View run MyRun_lr_0.0005_layers_fc at: http://localhost:5000/#/experiments/451096862960654107/runs/dc539156595b414faecbf38c22c5d08e
🧪 View experiment at: http://localhost:5000/#/experiments/451096862960654107
Phase 1: Training

Epoch 1/3
----------


100%|██████████| 540/540 [12:01<00:00,  1.34s/it]


Train Loss: 0.3270 Acc: 0.9239


100%|██████████| 135/135 [02:38<00:00,  1.17s/it]


Val Loss: 1.7104 Acc: 0.6272

Epoch 2/3
----------


100%|██████████| 540/540 [11:01<00:00,  1.22s/it]


Train Loss: 0.1106 Acc: 0.9803


100%|██████████| 135/135 [02:36<00:00,  1.16s/it]


Val Loss: 1.8917 Acc: 0.6235

Epoch 3/3
----------


100%|██████████| 540/540 [12:01<00:00,  1.34s/it]


Train Loss: 0.0504 Acc: 0.9929


100%|██████████| 135/135 [02:33<00:00,  1.14s/it]


Val Loss: 2.0664 Acc: 0.6260

🏃 View run MyRun_lr_0.0005_layers_fc_layer4 at: http://localhost:5000/#/experiments/451096862960654107/runs/e53d238c10cc44a89db9393bac890e77
🧪 View experiment at: http://localhost:5000/#/experiments/451096862960654107


In [13]:
model = unfreeze_and_train(model, criterion, train_loader, val_loader, device, layer_names_to_unfreeze=['fc'], lr=0.001, phase_no=1)

Phase 1: Training

Epoch 1/3
----------


100%|██████████| 8625/8625 [10:47<00:00, 13.32it/s]


Train Loss: 3.1407 Acc: 0.3436


100%|██████████| 2157/2157 [02:16<00:00, 15.86it/s]


Val Loss: 2.6407 Acc: 0.4212

Epoch 2/3
----------


100%|██████████| 8625/8625 [09:26<00:00, 15.23it/s]


Train Loss: 2.4655 Acc: 0.4524


100%|██████████| 2157/2157 [02:20<00:00, 15.38it/s]


Val Loss: 2.5080 Acc: 0.4495

Epoch 3/3
----------


100%|██████████| 8625/8625 [14:06<00:00, 10.19it/s]


Train Loss: 2.2813 Acc: 0.4866


100%|██████████| 2157/2157 [05:11<00:00,  6.92it/s]

Val Loss: 2.4951 Acc: 0.4444






In [37]:
model = unfreeze_and_train(model, criterion, train_loader, val_loader, device, layer_names_to_unfreeze=['fc', 'layer4'], lr=0.0005, phase_no=1)

Phase 1: Training

Epoch 1/10
----------


100%|██████████| 5391/5391 [1:53:17<00:00,  1.26s/it]


Train Loss: 1.3333 Acc: 0.6708


100%|██████████| 1348/1348 [26:59<00:00,  1.20s/it]


Val Loss: 1.1369 Acc: 0.7142

Epoch 2/10
----------


100%|██████████| 5391/5391 [2:02:58<00:00,  1.37s/it]  


Train Loss: 1.0281 Acc: 0.7382


100%|██████████| 1348/1348 [24:26<00:00,  1.09s/it]


Val Loss: 1.0641 Acc: 0.7314

Epoch 3/10
----------


100%|██████████| 5391/5391 [1:44:49<00:00,  1.17s/it]  


Train Loss: 0.9068 Acc: 0.7659


MlflowException: API request to http://localhost:5000/api/2.0/mlflow/runs/log-metric failed with exception HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /api/2.0/mlflow/runs/log-metric (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000014653E01E10>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))