## 2. Train Object Classifier

**Model to identify causal relationships**: 
feature extraction network (ResNet18) trained on ImageNet and 
classifier network (two 512-unit hidden layers) trained on Pascal VOC 2012

### 2-1. Pascal VOC2012 Dataset

In [1]:
import os
import json
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data import DataLoader
from torchvision.models import resnet18
from torchvision.datasets import VOCDetection
from torchvision import transforms as T

In [2]:
# object categories in pascal voc
categories = [
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 
    'dining table', 'dog', 'horse', 'motorbike', 'person', 'potted plant', 'sheep', 'sofa', 'train', 'television'
]
categories = {categories[i]: i for i in range(len(categories))}

In [4]:
transform = T.Compose([T.Resize((224,224)), T.ToTensor()])
dataset = VOCDetection('data/voc', image_set='train', download=True, transform=transform)
dataloader = DataLoader(dataset)

Using downloaded and verified file: data/voc\VOCtrainval_11-May-2012.tar
Extracting data/voc\VOCtrainval_11-May-2012.tar to data/voc


### 2-2. Feature (ResNet) Classifier Training

In [5]:
def feature_extractor():
    model_ft = resnet18(pretrained=True)
    # finetune
    for param in model_ft.parameters():
        param.requires_grad = False
    # modify classifier
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Flatten()
    # features = model_ft._modules.get('avgpool')
    return model_ft

In [6]:
class Classifier(nn.Module):
    #  train on Pascal VOC 2012 dataset!!
    def __init__(self, in_features=512, hidden_dim=512):
        super(Classifier, self).__init__()
        self.layer1 = nn.Linear(in_features, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, 20)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.classifier(x)
        # softmax
        return x

In [7]:
# train classifier (w/ feature extractor) on Pascal VOC2012
batch_size = 1
num_epoch = 1
device = 'cpu'

In [9]:
resnet = feature_extractor().to(device)
classifier = Classifier().to(device)
# torch.save(classifier.state_dict(), 'results/object_classifier.pt')

resize_transform = T.Compose([T.Resize((224,224)), T.ToTensor()])

resnet.eval()
classifier.train()

criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(classifier.parameters())

for image, anns in dataloader:
    target, target_names = [], []
    for obj in anns['annotation']['object']:
        target.append(categories[obj['name'][0]])
        target_names.append(obj['name'][0])
        # bboxs.append(obj['bndbox'])

    target = torch.tensor(target)
    print(target_names)
    # target = F.one_hot(torch.tensor(target), num_classes=20)
    
    # num_objs = len(bboxs)
    # outputs = torch.empty((0, 20))
    
    # for i in range(num_objs):
    #     objectImg = image.clone().detach()

    #     bbox = bboxs[i]
    #     mask = torch.zeros(image.shape[1:], dtype=int)
    #     x1, x2 = int(bbox['xmin'][0]), int(bbox['xmax'][0])
    #     y1, y2 = int(bbox['ymax'][0]), int(bbox['ymax'][0])
    #     mask[x1:x2+1,y1:y2+1] = 1
        
        
    #     mask = torch.tensor(mask>0, dtype=torch.uint8)
        
    #     objectImg = torch.where(mask>0, objectImg, mask.float())
    #     objectImg = T.ToPILImage()(objectImg[0])
        # print(objectImg.shape)
        # objectImg = resize_transform(objectImg)
    #     objectImg = objectImg.unsqueeze(0)
        
    features = resnet(image)
    logodds = classifier(features)

    #     outputs = torch.cat((outputs, logodds))

    # optimizer.zero_grad()
    # loss = criterion(outputs, target)
    # loss.backward()
    # optimizer.step()

    break

['horse', 'person']


In [10]:
# objectImg = T.ToPILImage()(objectImg)
objectImg

NameError: name 'objectImg' is not defined