In [9]:
#+.+
import torch 
import torch.nn as nn 
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader,random_split
device=torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print("Using device: ",device)

Using device:  mps


**INPUT SIZE AND RGB NORMALIZATION**

RGB mean: 0.539095009313355, 0.5253846275760173, 0.47516918293895094 

RGB std: 0.29325260016825044, 0.2845220684942233, 0.30522874839608877

280x196 


In [2]:
#define image transformations
transform=transforms.Compose([
    transforms.Resize((280,196)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5391,0.5254,0.4752],std=[0.2933,0.2845,0.3052])
])

dataset=ImageFolder(root='data/animals',transform=transform) #dataset[i][0]: image tensor (3,280,196), dataset[i][1]: image integer label

**TRAINING FUNCTION**

In [10]:
def train_model(model,train_loader,val_loader,test_loader,optimizer,criterion,device,epochs,scheduler):
    model.to(device)
    
    for epoch in range(epochs): 
        model.train()
        train_loss=0
        for images,labels in train_loader: 
            images,labels=images.to(device),labels.to(device)
            optimizer.zero_grad() #clear previous gradients
            outputs=model(images) #forward pass
            loss=criterion(outputs,labels) #compute loss
            loss.backward() #backpropagate
            optimizer.step() #update weights
            train_loss+=loss.item()*images.size(0)
        train_loss/=len(train_loader.dataset)
        
        model.eval()
        val_loss=0.0
        with torch.no_grad(): 
            for images,labels in val_loader:
                images,labels=images.to(device),labels.to(device)
                outputs=model(images)
                loss=criterion(outputs,labels)
                val_loss+=loss.item()*images.size(0)
        val_loss/=len(val_loader.dataset)
        scheduler.step(val_loss)
        print(
            f"Epoch {epoch+1:02d} | "
            f"Train Loss: {train_loss:.4f} | "
            f"Val Loss: {val_loss:.4f} | "
            f"LR: {optimizer.param_groups[0]['lr']:.2e}"
        )
    
    model.eval()
    correct,total=0,0
    with torch.no_grad(): 
        for images,labels in test_loader: 
            images,labels=images.to(device),labels.to(device)
            outputs=model(images)
            predicted=outputs.argmax(dim=1)
            total+=labels.size(0)
            correct+=(predicted==labels).sum().item() 
    accuracy=correct/total
    return f"Test accuracy: {accuracy*100:.2f}%"

**BASE MODEL**

Input (3 x 280 x 196) 

▼

Conv2d(3 → 32, kernel=3, padding=1) -> ReLU -> MaxPool2d(2x2): Output Block1(32 x 140 x 98)

▼

Conv2d(32 → 64, kernel=3, padding=1) -> ReLu -> MaxPool2d(2x2): Output Block2(64 x 70 x 49)

▼

Flatten (64x70x49 = 219,520 features)

▼

Linear Layer 1: 219,520 → 256 -> ReLu

▼

Linear Layer 2: 256 → 90 logits

▼

Output (90 logits → one per animal class)

In [4]:
class SimpleCNN(nn.Module): 
    def __init__(self,num_classes=90): 
        super().__init__()
        
        #feature extraction
        self.conv1=nn.Conv2d(3,32,kernel_size=3,padding=1) #1st convolution: 3 in channels (RGB), 32 out channels ("pattern maps"), kernel size of 3, padding of 1 [(3-1)//2]
        self.conv2=nn.Conv2d(32,64,kernel_size=3,padding=1) #2nd convolution: 32 in channels, 64 out channels, kernel size of 3
        self.pool=nn.MaxPool2d(kernel_size=2) #max pool: kernel size of 2 (every 2x2 region becomes 1 pixel, halving both width and height)
        
        #classifying
        self.fc1=nn.Linear(64*70*49,256) #1st linear layer: input -> 64 channels * 70 width pixels * 49 height pixels, output -> 256 "hidden features" [fc: fully connected layer->every input node connected to every output node]
        self.fc2=nn.Linear(256,num_classes) #2nd linear layer: input -> 256 "hidden features", output -> 90 desired animal classes
        
    def forward(self,x): #x->input batch
        x=self.pool(F.relu(self.conv1(x))) #Block 1: Conv1->ReLu->Pool->Output (32,140,98)
        x=self.pool(F.relu(self.conv2(x))) #Block 2: Conv2->ReLu->Pool->Output (64,70,49)
        x=x.view(x.size(0),-1) #Flatten: (batch_size,64*70*49) -> ready for linear layers
        x=F.relu(self.fc1(x)) #fc1 -> ReLu -> compressed to 256 hidden features
        x=self.fc2(x) #fc2 -> output 90 logits (one per animal class)
        
        return x #return logits (class "scores"), logits fed into loss function in training

In [None]:
#train inputs
model=SimpleCNN(num_classes=90).to(device)
criterion=nn.CrossEntropyLoss() #loss function for multi-class classification
optimizer=torch.optim.Adam(model.parameters(),lr=3e-4)
train_size=int(0.7*len(dataset)) 
val_size=int(0.15*len(dataset))
test_size=len(dataset)-train_size-val_size
train_dataset,val_dataset,test_dataset=random_split(dataset,[train_size,val_size,test_size])
train_loader=DataLoader(train_dataset,batch_size=16,shuffle=True)
val_loader=DataLoader(val_dataset,batch_size=16,shuffle=False)
test_loader=DataLoader(test_dataset,batch_size=16,shuffle=False)
epochs=15 
scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau( #'scheduler'
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    verbose=True
)

train_model(model,train_loader,val_loader,test_loader,optimizer,criterion,device,epochs,scheduler)




Epoch 01 | Train Loss: 4.3977 | Val Loss: 4.1412 | LR: 3.00e-04
Epoch 02 | Train Loss: 3.5412 | Val Loss: 3.5497 | LR: 3.00e-04
Epoch 03 | Train Loss: 1.9284 | Val Loss: 3.3545 | LR: 3.00e-04
Epoch 04 | Train Loss: 0.4809 | Val Loss: 3.9657 | LR: 3.00e-04
Epoch 05 | Train Loss: 0.0839 | Val Loss: 4.3617 | LR: 3.00e-04
Epoch 06 | Train Loss: 0.0383 | Val Loss: 4.4664 | LR: 1.50e-04
Epoch 07 | Train Loss: 0.0122 | Val Loss: 4.5867 | LR: 1.50e-04
Epoch 08 | Train Loss: 0.0070 | Val Loss: 4.6086 | LR: 1.50e-04
Epoch 09 | Train Loss: 0.0033 | Val Loss: 4.7795 | LR: 7.50e-05
Epoch 10 | Train Loss: 0.0052 | Val Loss: 4.7298 | LR: 7.50e-05
Epoch 11 | Train Loss: 0.0036 | Val Loss: 4.7597 | LR: 7.50e-05
Epoch 12 | Train Loss: 0.0027 | Val Loss: 4.8213 | LR: 3.75e-05
Epoch 13 | Train Loss: 0.0028 | Val Loss: 4.8257 | LR: 3.75e-05
Epoch 14 | Train Loss: 0.0019 | Val Loss: 4.8517 | LR: 3.75e-05
Epoch 15 | Train Loss: 0.0019 | Val Loss: 4.8798 | LR: 1.87e-05


'Test accuracy: 33.29%'

Not the worst. Considering that random chance is ~1.1%, this still performs well for a base model. Despite loss being tiny, generalization is poor: likely overfitting. There are only 60 images per class which is very small for 90 classes. 

**RESNET18**

In [None]:
from torchvision import models 
resnet18=models.resnet18(pretrained=True)
resnet18.fc=nn.Linear(resnet18.fc.in_features,90)
model=resnet18.to(device)
criterion=nn.CrossEntropyLoss() #loss function for multi-class classification
optimizer=torch.optim.Adam(model.parameters(),lr=3e-4)
train_size=int(0.7*len(dataset)) 
val_size=int(0.15*len(dataset))
test_size=len(dataset)-train_size-val_size
train_dataset,val_dataset,test_dataset=random_split(dataset,[train_size,val_size,test_size])
train_loader=DataLoader(train_dataset,batch_size=16,shuffle=True)
val_loader=DataLoader(val_dataset,batch_size=16,shuffle=False)
test_loader=DataLoader(test_dataset,batch_size=16,shuffle=False)
epochs=15 
scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau( #'scheduler'
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    verbose=True
)

train_model(model,train_loader,val_loader,test_loader,optimizer,criterion,device,epochs,scheduler)



Epoch 01 | Train Loss: 2.3593 | Val Loss: 1.5182 | LR: 3.00e-04
Epoch 02 | Train Loss: 0.7423 | Val Loss: 1.0399 | LR: 3.00e-04
Epoch 03 | Train Loss: 0.2258 | Val Loss: 0.8109 | LR: 3.00e-04
Epoch 04 | Train Loss: 0.0866 | Val Loss: 0.6967 | LR: 3.00e-04
Epoch 05 | Train Loss: 0.0478 | Val Loss: 0.6798 | LR: 3.00e-04
Epoch 06 | Train Loss: 0.0229 | Val Loss: 0.6402 | LR: 3.00e-04
Epoch 07 | Train Loss: 0.0142 | Val Loss: 0.6428 | LR: 3.00e-04
Epoch 08 | Train Loss: 0.0801 | Val Loss: 1.8057 | LR: 3.00e-04
Epoch 09 | Train Loss: 0.6577 | Val Loss: 1.3599 | LR: 1.50e-04
Epoch 10 | Train Loss: 0.0829 | Val Loss: 0.7839 | LR: 1.50e-04
Epoch 11 | Train Loss: 0.0161 | Val Loss: 0.7050 | LR: 1.50e-04
Epoch 12 | Train Loss: 0.0121 | Val Loss: 0.7244 | LR: 7.50e-05
Epoch 13 | Train Loss: 0.0069 | Val Loss: 0.7138 | LR: 7.50e-05
Epoch 14 | Train Loss: 0.0062 | Val Loss: 0.7119 | LR: 7.50e-05
Epoch 15 | Train Loss: 0.0046 | Val Loss: 0.6912 | LR: 3.75e-05


'Test accuracy: 82.12%'

Wow, a much better performance from the pretrained Resnet18 model (it is actually very impressive to get almost 60% accuracy on a classification task involving 90 categories). The training loss drops steadily, although slower than the SimpleCNN it is more gradual which indicates stable learning. This likely means the model is learning meaningful features rather than just memorizing. 

Why it's so much better (maybe): 

- Residual connections use 
- Pretrained on ImageNet

Let's try improving 'complexity' by making the CNN deeper with extra layers, thus increasing number of channels (more "pattern maps"). Our new "deeper" model will have more abstraction and pattern maps, while having fewer classifier parameters due to pooling in the extra layers.


**DEEPER MODEL**

Input (3 x 280 x 196) 

▼

Conv2d(3 → 32, kernel=3, padding=1) -> ReLU -> MaxPool2d(2x2): Output Block1(32 x 140 x 98)

▼

Conv2d(32 → 64, kernel=3, padding=1) -> ReLu -> MaxPool2d(2x2): Output Block2(64 x 70 x 49)

▼

Conv2d(64 → 128, kernel=3, padding=1) -> ReLu -> MaxPool2d(2x2): Output Block3(128 x 35 x 24)

▼

Conv2d(128 → 256, kernel=3, padding=1) -> ReLu -> MaxPool2d(2x2): Output Block3(256 x 17 x 12)

▼

Flatten (256x17x12 = 52,224 features)

▼

Linear Layer 1: 52,224 → 256 -> ReLu

▼

Linear Layer 2: 256 → 90 logits

▼

Output (90 logits → one per animal class)

In [7]:
class DeeperCNN(nn.Module): 
    def __init__(self,num_classes=90): 
        super().__init__()
        
        #feature extraction
        self.conv1=nn.Conv2d(3,32,kernel_size=3,padding=1) #1st convolution: 3 in channels (RGB), 32 out channels ("pattern maps"), kernel size of 3, padding of 1 [(3-1)//2]
        self.conv2=nn.Conv2d(32,64,kernel_size=3,padding=1) #2nd convolution: 32 in channels, 64 out channels, kernel size of 3
        self.conv3=nn.Conv2d(64,128,kernel_size=3,padding=1) #3rd convolution: 64 in channels, 128 out channels, kernel size of 3
        self.conv4=nn.Conv2d(128,256,kernel_size=3,padding=1) #4th convolution: 128 in channels, 256 out channels, kernel size of 3
        self.pool=nn.MaxPool2d(kernel_size=2) #max pool: kernel size of 2 (every 2x2 region becomes 1 pixel, halving both width and height)
        
        #classifying
        self.fc1=nn.Linear(256*17*12,256) #1st linear layer: input -> 256 channels * 17 width pixels * 12 height pixels, output -> 256 "hidden features" [fc: fully connected layer->every input node connected to every output node]
        self.fc2=nn.Linear(256,num_classes) #2nd linear layer: input -> 256 "hidden features", output -> 90 desired animal classes
        
    def forward(self,x): #x->input batch
        x=self.pool(F.relu(self.conv1(x))) #Block 1: Conv1->ReLu->Pool->Output (32,140,98)
        x=self.pool(F.relu(self.conv2(x))) #Block 2: Conv2->ReLu->Pool->Output (64,70,49)
        x=self.pool(F.relu(self.conv3(x))) #Block 3: Conv3->ReLu->Pool->Output (128,35,24)
        x=self.pool(F.relu(self.conv4(x))) #Block 4: Conv4->ReLu->Pool->Output (256,17,12)
        x=x.view(x.size(0),-1) #Flatten: (batch_size,256*17*12) -> ready for linear layers
        x=F.relu(self.fc1(x)) #fc1 -> ReLu -> compressed to 256 hidden features
        x=self.fc2(x) #fc2 -> output 90 logits (one per animal class)
        
        return x #return logits (class "scores"), logits fed into loss function in training

In [None]:
#train inputs
model=DeeperCNN(num_classes=90).to(device)
criterion=nn.CrossEntropyLoss() #loss function for multi-class classification
optimizer=torch.optim.Adam(model.parameters(),lr=3e-4)
train_size=int(0.7*len(dataset)) 
val_size=int(0.15*len(dataset))
test_size=len(dataset)-train_size-val_size
train_dataset,val_dataset,test_dataset=random_split(dataset,[train_size,val_size,test_size])
train_loader=DataLoader(train_dataset,batch_size=16,shuffle=True)
val_loader=DataLoader(val_dataset,batch_size=16,shuffle=False)
test_loader=DataLoader(test_dataset,batch_size=16,shuffle=False)
epochs=15 
scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau( #'scheduler'
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    verbose=True
)

train_model(model,train_loader,val_loader,test_loader,optimizer,criterion,device,epochs,scheduler)

Epoch 01 | Train Loss: 4.3956 | Val Loss: 4.2000 | LR: 3.00e-04
Epoch 02 | Train Loss: 3.9337 | Val Loss: 3.9235 | LR: 3.00e-04
Epoch 03 | Train Loss: 3.3934 | Val Loss: 3.5780 | LR: 3.00e-04
Epoch 04 | Train Loss: 2.6563 | Val Loss: 3.4527 | LR: 3.00e-04
Epoch 05 | Train Loss: 1.6424 | Val Loss: 3.5866 | LR: 3.00e-04
Epoch 06 | Train Loss: 0.6860 | Val Loss: 4.4548 | LR: 3.00e-04
Epoch 07 | Train Loss: 0.2242 | Val Loss: 5.4324 | LR: 1.50e-04
Epoch 08 | Train Loss: 0.0528 | Val Loss: 5.7543 | LR: 1.50e-04
Epoch 09 | Train Loss: 0.0134 | Val Loss: 6.1493 | LR: 1.50e-04
Epoch 10 | Train Loss: 0.0050 | Val Loss: 6.6876 | LR: 7.50e-05
Epoch 11 | Train Loss: 0.0031 | Val Loss: 6.7426 | LR: 7.50e-05
Epoch 12 | Train Loss: 0.0013 | Val Loss: 6.9017 | LR: 7.50e-05
Epoch 13 | Train Loss: 0.0011 | Val Loss: 7.0458 | LR: 3.75e-05
Epoch 14 | Train Loss: 0.0009 | Val Loss: 7.1213 | LR: 3.75e-05
Epoch 15 | Train Loss: 0.0008 | Val Loss: 7.2008 | LR: 3.75e-05


'Test accuracy: 38.47%'

Solid improvement! Looks like adding those two extra conv layers helped improve our model. But we are ambitious, so we want to try and match ResNet18. Let's make the network even deeper. We will also add a dropout to prevent overfitting during training. 

**SUPER DEEP MODEL**

Input (3 x 280 x 196) 

▼

Conv2d(3 → 32, kernel=3, padding=1) -> ReLU -> MaxPool2d(2x2): Output Block1(32 x 140 x 98)

▼

Conv2d(32 → 64, kernel=3, padding=1) -> ReLu -> MaxPool2d(2x2): Output Block2(64 x 70 x 49)

▼

Conv2d(64 → 128, kernel=3, padding=1) -> ReLu -> MaxPool2d(2x2): Output Block3(128 x 35 x 24)

▼

Conv2d(128 → 256, kernel=3, padding=1) -> ReLu -> MaxPool2d(2x2): Output Block3(256 x 17 x 12)

▼

Conv2d(256 → 380, kernel=3, padding=1) -> ReLu -> MaxPool2d(2x2): Output Block3(380 x 8 x 6)

▼

Flatten (380x8x6 = 18,240 features)

▼

Linear Layer 1: 18,240 → 256 -> ReLu -> Dropout

▼

Linear Layer 2: 256 → 90 logits

▼

Output (90 logits → one per animal class)

In [9]:
class SuperDeepCNN(nn.Module): 
    def __init__(self,num_classes=90): 
        super().__init__()
        
        #feature extraction
        self.conv1=nn.Conv2d(3,32,kernel_size=3,padding=1) #1st convolution: 3 in channels (RGB), 32 out channels ("pattern maps"), kernel size of 3, padding of 1 [(3-1)//2]
        self.conv2=nn.Conv2d(32,64,kernel_size=3,padding=1) #2nd convolution: 32 in channels, 64 out channels, kernel size of 3
        self.conv3=nn.Conv2d(64,128,kernel_size=3,padding=1) #3rd convolution: 64 in channels, 128 out channels, kernel size of 3
        self.conv4=nn.Conv2d(128,256,kernel_size=3,padding=1) #4th convolution: 128 in channels, 256 out channels, kernel size of 3
        self.conv5=nn.Conv2d(256,380,kernel_size=3,padding=1) #5th convolution: 256 in channels, 380 out channels, kernel size of 3 (don't want to huge of a jump or too many features)
        self.pool=nn.MaxPool2d(kernel_size=2) #max pool: kernel size of 2 (every 2x2 region becomes 1 pixel, halving both width and height)
        self.dropout=nn.Dropout(0.05) #dropout to reduce overfitting (10% chance to dropout, since small dataset)
        
        #classifying
        self.fc1=nn.Linear(380*8*6,256) #1st linear layer: input -> 256 channels (after adaptive avg pool), output -> 256 "hidden features" [fc: fully connected layer->every input node connected to every output node]
        self.fc2=nn.Linear(256,num_classes) #2nd linear layer: input -> 256 "hidden features", output -> 90 desired animal classes
        
    def forward(self,x): #x->input batch
        x=self.pool(F.relu(self.conv1(x))) #Block 1: Conv1->ReLu->Pool->Output (32,140,98)
        x=self.pool(F.relu(self.conv2(x))) #Block 2: Conv2->ReLu->Pool->Output (64,70,49)
        x=self.pool(F.relu(self.conv3(x))) #Block 3: Conv3->ReLu->Pool->Output (128,35,24)
        x=self.pool(F.relu(self.conv4(x))) #Block 4: Conv4->ReLu->Pool->Output (256,17,12)
        x=self.pool(F.relu(self.conv5(x))) #Block 5: Conv5->ReLu->Pool->Output (380,8,6)
        x=x.view(x.size(0),-1) #Flatten: (batch_size,380*8*6) -> ready for linear layers
        x=self.dropout(F.relu(self.fc1(x))) #fc1 -> ReLu -> Dropout -> compressed to 256 hidden features
        x=self.fc2(x) #fc2 -> output 90 logits (one per animal class)
        
        return x #return logits (class "scores"), logits fed into loss function in training

In [None]:
#train inputs
model=SuperDeepCNN(num_classes=90).to(device)
criterion=nn.CrossEntropyLoss() #loss function for multi-class classification
optimizer=torch.optim.Adam(model.parameters(),lr=3e-4)
train_size=int(0.7*len(dataset)) 
val_size=int(0.15*len(dataset))
test_size=len(dataset)-train_size-val_size
train_dataset,val_dataset,test_dataset=random_split(dataset,[train_size,val_size,test_size])
train_loader=DataLoader(train_dataset,batch_size=16,shuffle=True)
val_loader=DataLoader(val_dataset,batch_size=16,shuffle=False)
test_loader=DataLoader(test_dataset,batch_size=16,shuffle=False)
epochs=15 
scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau( #'scheduler'
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    verbose=True
)

train_model(model,train_loader,val_loader,test_loader,optimizer,criterion,device,epochs,scheduler)

Epoch 01 | Train Loss: 4.4958 | Val Loss: 4.4652 | LR: 3.00e-04
Epoch 02 | Train Loss: 4.3076 | Val Loss: 4.1547 | LR: 3.00e-04
Epoch 03 | Train Loss: 3.9464 | Val Loss: 3.9209 | LR: 3.00e-04
Epoch 04 | Train Loss: 3.5479 | Val Loss: 3.8328 | LR: 3.00e-04
Epoch 05 | Train Loss: 3.0380 | Val Loss: 3.6415 | LR: 3.00e-04
Epoch 06 | Train Loss: 2.4450 | Val Loss: 3.6924 | LR: 3.00e-04
Epoch 07 | Train Loss: 1.7482 | Val Loss: 3.8234 | LR: 3.00e-04
Epoch 08 | Train Loss: 1.1708 | Val Loss: 4.4452 | LR: 1.50e-04
Epoch 09 | Train Loss: 0.5387 | Val Loss: 5.3855 | LR: 1.50e-04
Epoch 10 | Train Loss: 0.3060 | Val Loss: 5.9011 | LR: 1.50e-04
Epoch 11 | Train Loss: 0.1815 | Val Loss: 6.8939 | LR: 7.50e-05
Epoch 12 | Train Loss: 0.1105 | Val Loss: 7.3357 | LR: 7.50e-05
Epoch 13 | Train Loss: 0.0882 | Val Loss: 7.8356 | LR: 7.50e-05
Epoch 14 | Train Loss: 0.0691 | Val Loss: 8.1619 | LR: 3.75e-05
Epoch 15 | Train Loss: 0.0611 | Val Loss: 7.9583 | LR: 3.75e-05


'Test accuracy: 33.05%'

Performance didn't improve, so this pretty much confirms that the core issue is not depth but generalization. Also looks like dropout actually hurt our model's performance, probably due to the dataset being really small. Of course it is difficult to match ResNet18 because we have a tiny dataset for training CNNs from scratch, but we can try to implement some features that will improve generalization: 

- Global average pooling (forces generalization by removing spatial dependence-averaging each feature map spatially, and avoid massive flattening and fully-connected layers)
- Batch normalization (stabilizing feature scales)
- Edited the channel growth pattern (to somewhat match  ResNet)
- Removing dropout (hurt performance previously)

We'll call it "Modern" CNN because it implements some modern techniques rather than just the simple CNN we started off with. Final caveat: GAP seems to learn slow, we need way more epochs...

**SUPER DEEP MODEL**

Input (3 x 280 x 196) 

▼

Conv2d(3 → 64, kernel=3, padding=1) -> BatchNorm -> ReLU -> MaxPool2d(2x2): Output Block1(64 x 140 x 98)

▼

Conv2d(64 → 128, kernel=3, padding=1) -> BatchNorm -> ReLu -> MaxPool2d(2x2): Output Block2(128 x 70 x 49)

▼

Conv2d(128 → 256, kernel=3, padding=1) -> BatchNorm -> ReLu -> MaxPool2d(2x2): Output Block3(256 x 35 x 24)

▼

Conv2d(256 → 512, kernel=3, padding=1) -> BatchNorm -> ReLu -> MaxPool2d(2x2): Output Block3(512 x 17 x 12)

▼

Global Average Pooling -> Output (512x1x1)

▼

Flatten (512 features)

▼

Linear Layer: 512 → 90 logits

▼

Output (90 logits → one per animal class)

In [6]:
class ModernCNN(nn.Module):
    def __init__(self,num_classes=90):
        super().__init__()

        def block(in_ch,out_ch):
            return nn.Sequential(
                nn.Conv2d(in_ch,out_ch,3,padding=1,bias=False),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(2)
            )
            
        self.features=nn.Sequential(
            block(3,64),
            block(64,128),
            block(128,256),
            block(256,512)
        )

        self.classifier=nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(512,num_classes)
        )

    def forward(self,x):
        x=self.features(x)
        return self.classifier(x)


In [13]:
#train inputs
model=ModernCNN(num_classes=90).to(device)
criterion=nn.CrossEntropyLoss() #loss function for multi-class classification
optimizer=torch.optim.Adam(model.parameters(),lr=3e-4)
train_size=int(0.7*len(dataset)) 
val_size=int(0.15*len(dataset))
test_size=len(dataset)-train_size-val_size
train_dataset,val_dataset,test_dataset=random_split(dataset,[train_size,val_size,test_size])
train_loader=DataLoader(train_dataset,batch_size=16,shuffle=True)
val_loader=DataLoader(val_dataset,batch_size=16,shuffle=False)
test_loader=DataLoader(test_dataset,batch_size=16,shuffle=False)
epochs=40 #more epochs
scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau( #'scheduler'
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    verbose=True
)

train_model(model,train_loader,val_loader,test_loader,optimizer,criterion,device,epochs,scheduler)

Epoch 01 | Train Loss: 4.1837 | Val Loss: 3.9282 | LR: 3.00e-04
Epoch 02 | Train Loss: 3.8571 | Val Loss: 3.7651 | LR: 3.00e-04
Epoch 03 | Train Loss: 3.6904 | Val Loss: 3.7018 | LR: 3.00e-04
Epoch 04 | Train Loss: 3.5422 | Val Loss: 3.6323 | LR: 3.00e-04
Epoch 05 | Train Loss: 3.4368 | Val Loss: 3.5228 | LR: 3.00e-04
Epoch 06 | Train Loss: 3.3396 | Val Loss: 3.5625 | LR: 3.00e-04
Epoch 07 | Train Loss: 3.2361 | Val Loss: 3.4189 | LR: 3.00e-04
Epoch 08 | Train Loss: 3.1725 | Val Loss: 3.4354 | LR: 3.00e-04
Epoch 09 | Train Loss: 3.1026 | Val Loss: 3.2923 | LR: 3.00e-04
Epoch 10 | Train Loss: 3.0074 | Val Loss: 3.3124 | LR: 3.00e-04
Epoch 11 | Train Loss: 2.9826 | Val Loss: 3.2374 | LR: 3.00e-04
Epoch 12 | Train Loss: 2.9025 | Val Loss: 3.2409 | LR: 3.00e-04
Epoch 13 | Train Loss: 2.8304 | Val Loss: 3.1916 | LR: 3.00e-04
Epoch 14 | Train Loss: 2.7811 | Val Loss: 3.1398 | LR: 3.00e-04
Epoch 15 | Train Loss: 2.7344 | Val Loss: 3.1976 | LR: 3.00e-04
Epoch 16 | Train Loss: 2.6854 | Val Loss

'Test accuracy: 44.51%'

Clearly a huge improvement, couple final touches: 
- add additional random changes to training input images, known as data augmentation 
- push to 70 epochs, appears train & val loss are still steadily going down i.e the model is still learning at 40 epochs (a bit)

In [12]:
train_transform=transforms.Compose([ #augmentation
    transforms.RandomResizedCrop( #random crops of part of images
        size=(280,196),
        scale=(0.8,1.0),
        ratio=(0.9,1.1)
    ),
    transforms.RandomHorizontalFlip(p=0.4), #random horizontal flip
    transforms.ColorJitter( #
        brightness=0.25,
        contrast=0.25,
        saturation=0.25,
        hue=0.05
    ),
    transforms.RandomRotation(degrees=10), #random rotations
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.5391,0.5254,0.4752],
        std=[0.2933,0.2845,0.3052]
    )
])


val_transform=transforms.Compose([ #no augmentation
    transforms.Resize((280,196)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5391,0.5254,0.4752],std=[0.2933,0.2845,0.3052])
])

In [8]:
#train inputs
model=ModernCNN(num_classes=90).to(device)
criterion=nn.CrossEntropyLoss() #loss function for multi-class classification
optimizer=torch.optim.Adam(model.parameters(),lr=3e-4)
imgs=ImageFolder(root='data/animals')
train_size=int(0.7*len(imgs))
val_size=int(0.15*len(imgs))
test_size=len(imgs)-train_size-val_size
train_dataset,val_dataset,test_dataset=random_split(imgs,[train_size,val_size,test_size])
train_dataset.dataset.transform=train_transform #augmentation
val_dataset.dataset.transform=val_transform #no augmentation
test_dataset.dataset.transform=val_transform #no augmentation
train_loader=DataLoader(train_dataset,batch_size=16,shuffle=True)
val_loader=DataLoader(val_dataset,batch_size=16,shuffle=False)
test_loader=DataLoader(test_dataset,batch_size=16,shuffle=False)
epochs=70 #more more epochs
scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau( #'scheduler'
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    verbose=True
)

train_model(model,train_loader,val_loader,test_loader,optimizer,criterion,device,epochs,scheduler)



Epoch 01 | Train Loss: 4.1970 | Val Loss: 3.9432 | LR: 3.00e-04
Epoch 02 | Train Loss: 3.8873 | Val Loss: 3.7987 | LR: 3.00e-04
Epoch 03 | Train Loss: 3.7190 | Val Loss: 3.7511 | LR: 3.00e-04
Epoch 04 | Train Loss: 3.5743 | Val Loss: 3.6096 | LR: 3.00e-04
Epoch 05 | Train Loss: 3.4452 | Val Loss: 3.5503 | LR: 3.00e-04
Epoch 06 | Train Loss: 3.3538 | Val Loss: 3.4450 | LR: 3.00e-04
Epoch 07 | Train Loss: 3.2656 | Val Loss: 3.3704 | LR: 3.00e-04
Epoch 08 | Train Loss: 3.2156 | Val Loss: 3.3501 | LR: 3.00e-04
Epoch 09 | Train Loss: 3.1299 | Val Loss: 3.2460 | LR: 3.00e-04
Epoch 10 | Train Loss: 3.0317 | Val Loss: 3.3797 | LR: 3.00e-04
Epoch 11 | Train Loss: 2.9818 | Val Loss: 3.3305 | LR: 3.00e-04
Epoch 12 | Train Loss: 2.9306 | Val Loss: 3.1517 | LR: 3.00e-04
Epoch 13 | Train Loss: 2.8827 | Val Loss: 3.1437 | LR: 3.00e-04
Epoch 14 | Train Loss: 2.8216 | Val Loss: 3.1311 | LR: 3.00e-04
Epoch 15 | Train Loss: 2.7695 | Val Loss: 3.1209 | LR: 3.00e-04
Epoch 16 | Train Loss: 2.6871 | Val Loss

'Test accuracy: 47.10%'

Hmm, looks like learning is still very healthy but we are plateauing in test accuracy. Seems we are in the diminishing returns territory: yes more epochs leads to slightly better accuracy, but the tradeoff is probably not worth the computational cost and time. Last thing we'll add to try and replicate ResNet18: residual connections. These are meant to make optimization easier by allowing the model to learn small improvements or do nothing (if can't improve, just pass through unchanged). 

**RESIDUAL CNN MODEL**

Input (3 x 280 x 196) 

▼

Residual Block 1: 
Conv2d(3 → 64, kernel=3, padding=1) -> BatchNorm -> ReLU -> Conv2d(64->64, kernel=3, padding=1) -> BatchNorm
- skip connection from input -> ReLU -> MaxPool2d(2x2) -> OutputBlock1: (64x140x98)

▼

Residual Block 2: 
Conv2d(64 → 128, kernel=3, padding=1) -> BatchNorm -> ReLU -> Conv2d(128->128, kernel=3, padding=1) -> BatchNorm
- skip connection from input -> ReLU -> MaxPool2d(2x2) -> OutputBlock1: (128x70x49)

▼

Residual Block 3: 
Conv2d(128 → 256, kernel=3, padding=1) -> BatchNorm -> ReLU -> Conv2d(256->256, kernel=3, padding=1) -> BatchNorm
- skip connection from input -> ReLU -> MaxPool2d(2x2) -> OutputBlock1: (256x35x24)
▼

Residual Block 4: 
Conv2d(256 → 512, kernel=3, padding=1) -> BatchNorm -> ReLU -> Conv2d(512->512, kernel=3, padding=1) -> BatchNorm
- skip connection from input -> ReLU -> MaxPool2d(2x2) -> OutputBlock1: (512x17x12)
▼

Global Average Pooling -> Output (512x1x1)

▼

Flatten (512 features)

▼

Linear Layer: 512 → 90 logits

▼

Output (90 logits → one per animal class)

In [11]:
class ResBlock(nn.Module): 
    def __init__(self,in_ch,out_ch): 
        super().__init__()
        
        self.conv=nn.Sequential(
            nn.Conv2d(in_ch,out_ch,3,padding=1,bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch,out_ch,3,padding=1,bias=False), #second convolution now in each block to refine features before adding skip connection
            nn.BatchNorm2d(out_ch)
        )
        
        self.skip=nn.Identity() #skip path returns input unchanged
        if in_ch!=out_ch: #if number of channels change
            self.skip=nn.Conv2d(in_ch,out_ch,1,bias=False) #use 1x1 convolution to change channel count to keep spatial size the same
        self.relu=nn.ReLU(inplace=True) #activation applied after combining learned features with input
        self.pool=nn.MaxPool2d(2) #downsample spatial size by 2 after residual addition
        
    def forward(self,x): 
        out=self.conv(x) #run input through main convolutional path
        skip=self.skip(x) #run input through skip connection 
        out=out+skip #residual additionL out=out+skip
        out=self.relu(out) #apply non-linearity after combining
        out=self.pool(out) #downsample
        return out

class ResCNN(nn.Module):
    def __init__(self,num_classes=90):
        super().__init__()
            
        self.features=nn.Sequential(
            ResBlock(3,64),
            ResBlock(64,128),
            ResBlock(128,256),
            ResBlock(256,512)
        )

        self.classifier=nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(512,num_classes)
        )

    def forward(self,x):
        x=self.features(x)
        return self.classifier(x)

In [13]:
#train inputs
model=ResCNN(num_classes=90).to(device)
criterion=nn.CrossEntropyLoss() #loss function for multi-class classification
optimizer=torch.optim.Adam(model.parameters(),lr=5e-4) #slightly increase LR to start
imgs=ImageFolder(root='data/animals')
train_size=int(0.7*len(imgs))
val_size=int(0.15*len(imgs))
test_size=len(imgs)-train_size-val_size
train_dataset,val_dataset,test_dataset=random_split(imgs,[train_size,val_size,test_size])
train_dataset.dataset.transform=train_transform #augmentation
val_dataset.dataset.transform=val_transform #no augmentation
test_dataset.dataset.transform=val_transform #no augmentation
train_loader=DataLoader(train_dataset,batch_size=16,shuffle=True)
val_loader=DataLoader(val_dataset,batch_size=16,shuffle=False)
test_loader=DataLoader(test_dataset,batch_size=16,shuffle=False)
epochs=70 #more more epochs
scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau( #'scheduler'
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    verbose=True
)

train_model(model,train_loader,val_loader,test_loader,optimizer,criterion,device,epochs,scheduler)



Epoch 01 | Train Loss: 4.2715 | Val Loss: 4.1918 | LR: 5.00e-04
Epoch 02 | Train Loss: 3.9739 | Val Loss: 3.9381 | LR: 5.00e-04
Epoch 03 | Train Loss: 3.8254 | Val Loss: 3.8254 | LR: 5.00e-04
Epoch 04 | Train Loss: 3.6688 | Val Loss: 3.7668 | LR: 5.00e-04
Epoch 05 | Train Loss: 3.5388 | Val Loss: 3.6260 | LR: 5.00e-04
Epoch 06 | Train Loss: 3.4335 | Val Loss: 3.5542 | LR: 5.00e-04
Epoch 07 | Train Loss: 3.3093 | Val Loss: 3.5257 | LR: 5.00e-04
Epoch 08 | Train Loss: 3.1909 | Val Loss: 3.3854 | LR: 5.00e-04
Epoch 09 | Train Loss: 3.0546 | Val Loss: 3.3859 | LR: 5.00e-04
Epoch 10 | Train Loss: 2.9657 | Val Loss: 3.2277 | LR: 5.00e-04
Epoch 11 | Train Loss: 2.8593 | Val Loss: 3.3798 | LR: 5.00e-04
Epoch 12 | Train Loss: 2.7415 | Val Loss: 3.1699 | LR: 5.00e-04
Epoch 13 | Train Loss: 2.6596 | Val Loss: 3.1426 | LR: 5.00e-04
Epoch 14 | Train Loss: 2.5601 | Val Loss: 3.1145 | LR: 5.00e-04
Epoch 15 | Train Loss: 2.4611 | Val Loss: 3.0294 | LR: 5.00e-04
Epoch 16 | Train Loss: 2.3907 | Val Loss

'Test accuracy: 56.23%'

Considering our severe limitations here: 

- Small dataset
- Building model from scratch 

This is pretty good! 

In [14]:
torch.save(model.state_dict(),"ResCNN.pth")