# ModelNet Classification with RotationNet

In [None]:
import torch
import wandb
import numpy as np
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid that makes CUDA
# kernel launches synchronous (and therefore slows training) — confirm it is still wanted.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

## Preprocessing
#### Transformations

In [None]:
from torchvision.transforms import Compose, ToTensor, Normalize

# channel-wise mean and standard deviation used to normalize inputs for the pretrained networks
_pretrained_mean = [0.485, 0.456, 0.406]
_pretrained_std = [0.229, 0.224, 0.225]

# preprocessing applied to every image: tensor conversion followed by normalization
_to_tensor = ToTensor()
_normalize = Normalize(mean=_pretrained_mean, std=_pretrained_std)
transforms = Compose([_to_tensor, _normalize])

#### Manual shuffling
This function shuffles the training dataset object by object, so that all the views of an object stay together.

In [None]:
def shuffle_dataloader(dataloader, num_views):
    """Shuffle the dataset behind ``dataloader`` object by object, in place.

    The image list ``dataloader.dataset.imgs`` is treated as consecutive
    groups of ``num_views`` entries (one group per object). The groups are
    put in a random order while the entries inside a group keep their
    relative order. Entries beyond the last complete group are dropped.

    Args:
        dataloader: object exposing ``dataset.imgs`` and ``dataset.samples``.
        num_views: number of consecutive entries that belong to one object.

    Returns:
        None. ``dataset.imgs`` and ``dataset.samples`` are both rebound to
        the same reordered list.
    """
    dataset = dataloader.dataset
    num_objects = len(dataset.imgs) // num_views

    # random order of the objects, expressed as the index of each object's first view
    first_view = np.random.permutation(num_objects) * num_views

    # one row per object: first_view, first_view + 1, ..., first_view + num_views - 1;
    # reading the rows one after the other keeps the views of an object adjacent
    view_offsets = np.arange(num_views)
    order = (first_view[:, None] + view_offsets[None, :]).ravel()

    # applying the new order directly to the dataset used by the dataloader
    reordered = [dataset.imgs[position] for position in order]
    dataset.imgs = reordered
    dataset.samples = reordered

#### Datasets & Dataloaders

In [None]:
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# dataset to use and its version ("1": with upright, "2": without upright)
num_classes = 40
version = "2"

# the number of views per object and the viewpoint candidates depend on the version
if version == "1":
    num_views, vcand_file = 12, 'vcand_case1.npy'
else:
    num_views, vcand_file = 20, 'vcand_case2.npy'
viewpoint_candidate = np.load(vcand_file)

# every split lives in its own sub-folder of the same dataset directory
dataset_root = f'../content/datasets/ModelNet{num_classes}_png_v{version}'
train_dataset = ImageFolder(f'{dataset_root}/train', transforms)
test_dataset = ImageFolder(f'{dataset_root}/test', transforms)

# version 1 is only a small subset of ModelNet and has no validation split,
# so its test split is used for validation as well
validation_split = 'test' if version == "1" else 'validation'
validation_dataset = ImageFolder(f'{dataset_root}/{validation_split}', transforms)

# each batch holds the views of 10 objects; shuffling is left off so that the
# views of an object stay adjacent
batch_size = 10 * num_views
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

## Definition of the neural networks

### VGG16 (Configuration D)
Implementation of the VGG16 architecture for the first part of the pipeline.  
We kept this code because it was already written, but we do not use it: training it from scratch was too time-consuming.

In [None]:
from torch.nn import Module, Sequential, Conv2d, BatchNorm2d, ReLU, MaxPool2d, Linear, Dropout

class VGG16_bn_manual(Module):
    """VGG16 (configuration D) with batch normalization, built by hand.

    ``network`` stacks five convolutional stages (2, 2, 3, 3 and 3
    convolutions with 64, 128, 256, 512 and 512 output channels). Every
    convolution is 3x3 with stride 1 and padding 1 and is followed by batch
    normalization and ReLU; every stage ends with a 2x2 max-pooling.
    ``classifier`` is a three-layer fully connected head whose first layer
    expects 7*7*512 flattened features (the size obtained from 224x224
    inputs after five poolings) and whose last layer has ``num_classes``
    outputs.
    """

    def __init__(self, num_classes):
        super().__init__()

        # (number of convolutions, output channels) of each stage
        stages = ((2, 64), (2, 128), (3, 256), (3, 512), (3, 512))

        layers = []
        in_channels = 3
        for num_convs, out_channels in stages:
            for _ in range(num_convs):
                layers.append(Conv2d(in_channels=in_channels, out_channels=out_channels,
                                     kernel_size=3, stride=1, padding=1))
                layers.append(BatchNorm2d(num_features=out_channels))
                layers.append(ReLU())
                in_channels = out_channels
            # every stage is closed by a max-pooling that halves the resolution
            layers.append(MaxPool2d(kernel_size=2, stride=2))
        self.network = Sequential(*layers)

        # classification head
        hidden_units = 4096
        self.classifier = Sequential(
            Dropout(),
            Linear(7 * 7 * 512, hidden_units),
            ReLU(),
            Dropout(),
            Linear(hidden_units, hidden_units),
            ReLU(),
            Linear(hidden_units, num_classes),
        )

        # re-initializing all convolutional and linear layers
        self.apply(self._init_weights)

    def forward(self, x):
        """Return the logits of the batch ``x``."""
        feature_maps = self.network(x)
        # flattening everything but the batch dimension
        flattened = feature_maps.reshape((x.shape[0], -1))
        return self.classifier(flattened)

    def _init_weights(self, module):
        """Give conv/linear layers Kaiming-normal weights and zero biases."""
        if not isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)):
            return
        torch.nn.init.kaiming_normal_(module.weight, nonlinearity='relu')
        if module.bias is not None:
            module.bias.data.zero_()

### VGG16 (Configuration D) Pretrained
Pretrained VGG16 used for fine-tuning: the convolutional features are frozen and only the new classifier is trained.

In [None]:
from torch.nn import Module, Sequential, ReLU,  Linear, Dropout
import torchvision.models as models
class VGG_16_pretrained(Module):
    """VGG16 with ImageNet-pretrained, frozen features and a new classifier.

    ``features`` is taken from ``models.vgg16`` loaded with the
    ``IMAGENET1K_V1`` weights and has gradients disabled; ``classifier`` is a
    freshly created three-layer head with ``num_classes`` outputs, so it is
    the only part that is trained.
    """

    def __init__(self, num_classes):
        super().__init__()

        # only the convolutional part of the pretrained network is reused
        pretrained = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
        self.features = pretrained.features

        # freezing the pretrained weights
        for weight in self.features.parameters():
            weight.requires_grad = False

        # new classification head
        hidden_units = 4096
        head = [
            Dropout(),
            Linear(7 * 7 * 512, hidden_units),
            ReLU(),
            Dropout(),
            Linear(hidden_units, hidden_units),
            ReLU(),
            Linear(hidden_units, num_classes),
        ]
        self.classifier = Sequential(*head)

    def forward(self, x):
        """Return the logits of the batch ``x``."""
        feature_maps = self.features(x)
        # flattening everything but the batch dimension
        flattened = feature_maps.reshape((x.shape[0], -1))
        return self.classifier(flattened)

## Training and Testing Functions
#### Function used to calculate loss and accuracy on validation and test sets

In [None]:
from tqdm import tqdm
from torch.nn import CrossEntropyLoss
from torch.nn.functional import log_softmax

def calculate_accuracy(dataloader, network, num_classes, num_views, viewpoint_candidate):
    """Return the RotationNet classification accuracy of ``network`` on ``dataloader``.

    The whole dataloader is run through the network in evaluation mode, the
    per-view scores are combined for every viewpoint candidate (eq. 5 and 6
    of the RotationNet paper) and, for each object, the class with the
    highest score under the best candidate is taken as the prediction.

    Args:
        dataloader: iterable of ``(images, labels)`` batches in which the
            ``num_views`` images of an object are consecutive.
        network: model mapping a batch of images to
            ``num_views * (num_classes + 1)`` logits per image.
        num_classes: number of object classes (the extra logit is the
            "incorrect view" class).
        num_views: number of views per object.
        viewpoint_candidate: integer array with one row per candidate and one
            column per view; entry ``[i][j]`` is combined with slot ``j`` as
            ``entry * num_views + j`` to select a row of scores.

    Returns:
        0-dim tensor holding the fraction of correctly classified objects.
    """
    # verifying if cuda is available
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # putting the network on evaluation mode
    network.eval()

    with torch.no_grad():
        batch_targets = []
        batch_logits = []

        for batch_data, batch_labels in tqdm(dataloader):
            # forward pass on the device, keeping logits and labels of every batch
            batch_logits.append(network(batch_data.to(device)))
            batch_targets.append(batch_labels.to(device))

        # one label per object: the first of each group of num_views labels
        # (a plain strided slice, so the last object is kept even when num_views == 1)
        labels = torch.cat(batch_targets, dim=0)[::num_views]
        num_objects = labels.shape[0]

        # one row of (num_classes + 1) logits per (image, view slot) pair
        outputs = torch.cat(batch_logits, dim=0).reshape(-1, num_classes + 1)

        # log-probabilities turn the division of eq. 5 into a subtraction
        log_probs = log_softmax(outputs, dim=1)
        summation = log_probs[:, :-1] - log_probs[:, -1:]

        # numpy layout for the candidate scoring: (num_views * num_views, num_classes, objects)
        summation = summation.reshape(-1, num_views * num_views, num_classes)
        summation = summation.cpu().numpy().transpose(1, 2, 0)

        # eq. 6: score of every candidate = sum over the views of the selected rows
        scores = np.zeros((viewpoint_candidate.shape[0], num_classes, num_objects))
        for i, candidate in enumerate(viewpoint_candidate):
            for j, image_idx in enumerate(candidate):
                scores[i] += summation[image_idx * num_views + j]

        # best candidate of each object = the one holding the overall highest class score
        best_viewpoint = scores.max(axis=1).argmax(axis=0)
        # class scores of that candidate, shape (objects, num_classes)
        best_scores = scores[best_viewpoint, :, np.arange(num_objects)]
        best_scores = torch.from_numpy(best_scores).float()

        # finding best class prediction
        pred = torch.argmax(best_scores, dim=1).to(device)

        # calculating accuracy
        accuracy = torch.sum(pred == labels) / len(labels)
    return accuracy

#### Training epoch
This function will do a single epoch of the training phase

In [None]:
def training_epoch(dataloader, network, loss_fn, optimizer, num_classes, num_views, viewpoint_candidate):
    """Run a single RotationNet training epoch over ``dataloader``.

    For every batch the current predictions are used to pick, for each
    object, the viewpoint candidate that scores best for the object's true
    class (eq. 5 and 6 of the RotationNet paper). The view slots selected by
    that candidate get the true class as target, all other slots get the
    "incorrect view" class ``num_classes``. The loss on these targets is
    logged to wandb and back-propagated.

    Args:
        dataloader: iterable of ``(images, labels)`` batches in which the
            ``num_views`` images of an object are consecutive and every batch
            holds whole objects.
        network: model mapping a batch of images to
            ``num_views * (num_classes + 1)`` logits per image.
        loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar tensor.
        optimizer: optimizer updating the trainable parameters of ``network``.
        num_classes: number of object classes.
        num_views: number of views per object.
        viewpoint_candidate: integer array with one row per candidate and one
            column per view.

    Returns:
        None.
    """
    # verifying if cuda is available
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # creating iterator
    iterator = tqdm(dataloader)

    # putting the network on training mode
    network.train()

    for batch_data, batch_labels in iterator:

        # number of objects in the batch
        num_samples = batch_data.size(0) // num_views

        # one class label per object, read once as plain ints so that the
        # target assignment below works on Python values only
        object_labels = batch_labels[::num_views].tolist()

        # forward pass; one row of (num_classes + 1) logits per (image, view slot) pair
        batch_outputs = network(batch_data.to(device)).reshape(-1, num_classes + 1)

        # choosing the target views is a discrete decision: no gradients needed here
        with torch.no_grad():
            # log-probabilities turn the division of eq. 5 into a subtraction
            log_probs = log_softmax(batch_outputs, dim=1)
            summation = log_probs[:, :-1] - log_probs[:, -1:]

            # numpy layout for the candidate scoring: (num_views * num_views, num_classes, objects)
            summation = summation.reshape(-1, num_views * num_views, num_classes)
            summation = summation.cpu().numpy().transpose(1, 2, 0)

        # eq. 6: score of every candidate = sum over the views of the selected rows
        scores = np.zeros((viewpoint_candidate.shape[0], num_classes, num_samples))
        for i, candidate in enumerate(viewpoint_candidate):
            for j, image_idx in enumerate(candidate):
                scores[i] += summation[image_idx * num_views + j]

        # targets start as "incorrect view" (class index num_classes) everywhere
        mod_labels = torch.full((batch_data.size(0) * num_views,), num_classes, dtype=torch.long)

        # the view slots selected by the best candidate of each object get the true class
        for i in range(num_samples):
            label = object_labels[i]
            best_candidate = viewpoint_candidate[np.argmax(scores[:, label, i])]
            first_row = i * num_views * num_views
            for j, image_idx in enumerate(best_candidate):
                mod_labels[first_row + image_idx * num_views + j] = label

        # moving modified labels to device to have everything on the same one
        mod_labels = mod_labels.to(device)

        # compute loss
        train_loss = loss_fn(batch_outputs, mod_labels)

        # logging training loss as a plain number
        wandb.log({"train_loss": train_loss.item()})

        # backward pass
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # viewing batch results
        iterator.set_description(f"Train loss: {train_loss.detach().cpu().numpy()}")

#### Training Loop
This function will execute the entire training loop

In [None]:
from torch.optim import SGD

def training_Loop(network, train_dl, val_dl, epochs, num_classes, num_views, viewpoint_candidate):
    """Train ``network`` for ``epochs`` epochs, keeping the best checkpoint.

    Every epoch reshuffles ``train_dl`` object-wise, runs one training epoch
    and then measures the accuracy on ``val_dl``. Validation results are
    logged to wandb, and the weights are written to ``model.pt`` whenever the
    validation accuracy is at least as good as the best one seen so far.

    Args:
        network: model to train; it is moved to the GPU when one is available.
        train_dl: training dataloader (reshuffled in place every epoch).
        val_dl: validation dataloader.
        epochs: number of epochs to run.
        num_classes: number of object classes.
        num_views: number of views per object.
        viewpoint_candidate: viewpoint candidates passed on to the training
            and evaluation functions.

    Returns:
        None.
    """
    # loss function
    criterion = CrossEntropyLoss()

    # only the parameters that still require gradients are optimized
    # (for the manual model this means all of them)
    trainable_params = filter(lambda p: p.requires_grad, network.parameters())
    optimizer = SGD(trainable_params, lr=1e-2, momentum=0.90, weight_decay=1e-4)

    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    network.to(target_device)

    # NOTE: a step schedule dividing the learning rate by 10 every 40 epochs
    # was dropped, since there are too few epochs for it to be effective
    best_val_acc = 0

    for epoch in range(epochs):
        print(f'\nEpoch: {epoch}')

        # --- training ---
        print("TRAINING PHASE")
        shuffle_dataloader(train_dl, num_views)
        training_epoch(train_dl, network, criterion, optimizer, num_classes, num_views, viewpoint_candidate)

        # --- validation ---
        print("VALIDATION PHASE")
        val_acc = calculate_accuracy(val_dl, network, num_classes, num_views, viewpoint_candidate)

        validation_metrics = {
            "epoch": epoch,
            "validation_accuracy": val_acc,
            "validation_error": 1-val_acc,
        }
        wandb.log(validation_metrics)
        print(f"Validation accuracy: {val_acc.detach().cpu().numpy()}")

        # --- checkpointing: ties also overwrite the saved model ---
        if val_acc >= best_val_acc:
            print("Saved Model")
            best_val_acc = val_acc
            torch.save(network.state_dict(), "model.pt")

## Training and Testing
#### Initializing Data Logging

In [None]:
# hyperparameters and run metadata tracked together with the run
run_config = {
    "dataset": "ModelNet40_case_2",
    "epochs": 60,
}

# starting a new wandb run, logged in the project given below
wandb.init(project="RotationNet_Modelnet_40", config=run_config)

# the validation metrics are plotted against the epoch
wandb.define_metric("epoch")
for validation_metric in ("validation_accuracy", "validation_error"):
    wandb.define_metric(validation_metric, step_metric="epoch")

#### Starting training loop and testing

In [None]:
# the network outputs (num_classes + 1) logits for each of the num_views view slots
output_size = (num_classes+1)*num_views
model = VGG_16_pretrained(output_size)
epochs = 60

training_Loop(model, train_dataloader, validation_dataloader, epochs, num_classes, num_views, viewpoint_candidate)

# closing logging at the end of the run
wandb.finish()

# testing: evaluating the best checkpoint saved during training
best_weights = torch.load("model.pt")
model.load_state_dict(best_weights)

val_accuracy = calculate_accuracy(validation_dataloader, model, num_classes, num_views, viewpoint_candidate)
print(f"Validation Accuracy : {val_accuracy!s}")

test_accuracy = calculate_accuracy(test_dataloader, model, num_classes, num_views, viewpoint_candidate)
print(f"Test Accuracy : {test_accuracy!s}")