## 2. Train Object Classifier

**Model to identify causal relationships**: 
a feature-extraction network (ResNet18) pretrained on ImageNet, followed by 
a classifier network (two 512-unit hidden layers) trained on Pascal VOC 2012.

### 2-1. Pascal VOC2012 Dataset

In [1]:
import os
import json
import tqdm
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data import DataLoader
from torchvision.models import resnet18
from torchvision.datasets import VOCDetection
from torchvision import transforms as T

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pascal VOC object classes, mapped from class name to integer label (0-19)
categories = {
    name: label
    for label, name in enumerate([
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow',
        'dining table', 'dog', 'horse', 'motorbike', 'person', 'potted plant', 'sheep', 'sofa', 'train', 'television',
    ])
}
# Alternate spellings that share a label with an entry above
# (presumably the names used in the VOC annotation files -- confirm).
categories.update(
    tvmonitor=categories['television'],
    diningtable=categories['dining table'],
    pottedplant=categories['potted plant'],
)

In [3]:
# Resize every image to 224x224 (the size the batching code allocates),
# convert it to a float tensor, then normalize with the ImageNet channel
# statistics that torchvision's pretrained ResNet weights expect.
# NOTE(review): tensors from `dataset` are now normalized; undo the
# normalization before displaying them as images.
transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
dataset = VOCDetection('data/voc', image_set='train', download=True, transform=transform)

Using downloaded and verified file: data/voc\VOCtrainval_11-May-2012.tar
Extracting data/voc\VOCtrainval_11-May-2012.tar to data/voc


### 2-2. Feature (ResNet) Classifier Training

In [4]:
def feature_extractor():
    """Build a frozen, ImageNet-pretrained ResNet18 used as a feature extractor.

    Every parameter of the network has ``requires_grad`` switched off, so the
    backbone is not fine-tuned.  The final fully connected layer ``fc`` is
    replaced by ``nn.Flatten``, so a forward pass returns the features that
    previously fed ``fc`` instead of ImageNet class scores.

    Returns:
        The modified ``resnet18`` module.
    """
    # NOTE: newer torchvision releases deprecate `pretrained=True` in favour
    # of the `weights=` argument; kept as-is for compatibility with the
    # torchvision version this notebook was run with.
    model_ft = resnet18(pretrained=True)
    # freeze the backbone: none of its weights receive gradients
    for param in model_ft.parameters():
        param.requires_grad = False
    # drop the ImageNet classification head so the features are returned
    model_ft.fc = nn.Flatten()
    return model_ft

In [5]:
class Classifier(nn.Module):
    """Multi-label classification head with two hidden layers.

    Maps a feature vector to one raw logit per object class.  A ReLU follows
    each hidden layer; without a non-linearity the three stacked ``nn.Linear``
    layers would be equivalent to a single affine map.  No sigmoid/softmax is
    applied to the output: the logits are meant to be passed to a loss that
    works on raw scores.

    Args:
        in_features: size of the input feature vector.
        hidden_dim: width of both hidden layers.
        num_classes: number of output logits (20 by default).
    """

    def __init__(self, in_features=512, hidden_dim=512, num_classes=20):
        super().__init__()
        self.layer1 = nn.Linear(in_features, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return the per-class logits for a batch of feature vectors ``x``."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.classifier(x)

In [8]:
# train classifier (w/ feature extractor) on Pascal VOC2012
batch_size = 16
num_epoch = 10
device = 'cpu'

# shuffle=True reshuffles the training samples at the start of every epoch.
# collate_fn=list keeps each batch as a plain list of (image, annotation)
# samples, exactly like an identity collate, but unlike a lambda it can be
# pickled if worker processes are enabled later.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=list)

In [9]:
resnet = feature_extractor().to(device)
classifier = Classifier().to(device)

# Only the classifier head is trained; the backbone stays in eval mode.
resnet.eval()
classifier.train()

# Multi-label objective: one independent binary term per class, on raw logits.
criterion = nn.MultiLabelSoftMarginLoss()
optimizer = optim.RMSprop(classifier.parameters())

for epoch in tqdm.tqdm(range(num_epoch)):
    for batch in tqdm.tqdm(dataloader, leave=False):
        # each batch is a list of (image, annotation) pairs
        image_batch = torch.stack([image for image, _ in batch])

        # Multi-hot targets: 1.0 for every class present in the image.
        # Starting from zeros and assigning 1.0 keeps entries in {0, 1} even
        # when a class occurs several times (summing one-hot rows would give
        # counts) and also copes with an image that has no annotated object.
        target_batch = torch.zeros((len(batch), 20))
        for idx, (_, anns) in enumerate(batch):
            labels = [categories[obj['name']] for obj in anns['annotation']['object']]
            target_batch[idx, torch.tensor(labels, dtype=torch.long)] = 1.0

        image_batch, target_batch = image_batch.to(device), target_batch.to(device)
        # the backbone is frozen, so no autograd bookkeeping is needed for it
        with torch.no_grad():
            features = resnet(image_batch)
        logodds = classifier(features)

        optimizer.zero_grad()
        loss = criterion(logodds, target_batch)
        loss.backward()
        optimizer.step()

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
100%|██████████| 10/10 [56:14<00:00, 337.41s/it]


In [10]:
# torch.save does not create missing parent directories; make sure the
# output folder exists so the trained weights are not lost after training.
os.makedirs('results', exist_ok=True)
torch.save(classifier.state_dict(), f'results/object_classifier_{num_epoch}.pt')