## 2. Train Object Classifier

**Model to identify causal relationships**: 
feature extraction network (ResNet18) trained on ImageNet and 
classifier network (two 512-unit hidden layers) trained on Pascal VOC 2012

### 2-1. Pascal VOC2012 Dataset

In [6]:
from torchvision.datasets import VOCDetection
from torchvision import transforms as T
from torch.utils.data import DataLoader

In [88]:
# object categories in pascal voc
categories = [
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 
    'dining table', 'dog', 'horse', 'motorbike', 'person', 'potted plant', 'sheep', 'sofa', 'train', 'television'
]

In [5]:
transform = T.Compose([T.ToTensor()])
dataset = VOCDetection('data/voc', image_set='train', download=True, transform=transform)
dataloader = DataLoader(dataset)

Using downloaded and verified file: data/voc\VOCtrainval_11-May-2012.tar
Extracting data/voc\VOCtrainval_11-May-2012.tar to data/voc


### 2-2. Feature (ResNet) Classifier Training

In [19]:
import os
import json
import numpy as np
import torch
from torch import nn
from torchvision.models import resnet18
from torch import optim

In [13]:
def feature_extractor():
    model_ft = resnet18(pretrained=True)
    # finetune
    for param in model_ft.parameters():
        param.requires_grad = False
    # modify classifier
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Flatten()
    # features = model_ft._modules.get('avgpool')
    return model_ft

In [14]:
class Classifier(nn.Module):
    #  train on Pascal VOC 2012 dataset!!
    def __init__(self, in_features=512, hidden_dim=512):
        super(Classifier, self).__init__()
        self.layer1 = nn.Linear(in_features, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, 20)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.classifier(x)
        # softmax
        return x

In [15]:
# torch.save(classifier.state_dict(), 'results/object_classifier.pt')

In [17]:
# train classifier (w/ feature extractor) on Pascal VOC2012
batch_size = 1
num_epoch = 1
device = 'cpu'

In [20]:
resnet = feature_extractor().to(device)
classifier = Classifier().to(device)

resnet.eval()
classifier.train()

criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(classifier.parameters())

for image, target in dataloader:
    objs, bboxs = [], []
    for obj in target['annotation']['object']:
        objs.append(obj['name'])
        bboxs.append(obj['bndbox'])

    features = resnet(image)
    logodds = classifier(features)

    optimizer.zero_grad()
    loss = criterion(logodds, target)
    loss.backward()
    optimizer.step()
    break

TypeError: cross_entropy_loss(): argument 'target' (position 2) must be Tensor, not dict

In [23]:
image.shape

torch.Size([1, 3, 442, 500])

['horse'] {'xmin': ['53'], 'ymin': ['87'], 'xmax': ['471'], 'ymax': ['420']}
['person'] {'xmin': ['158'], 'ymin': ['44'], 'xmax': ['289'], 'ymax': ['167']}


In [30]:
obj

{'name': ['person'],
 'pose': ['Unspecified'],
 'truncated': ['1'],
 'occluded': ['0'],
 'bndbox': {'xmin': ['158'], 'ymin': ['44'], 'xmax': ['289'], 'ymax': ['167']},
 'difficult': ['0']}