In [None]:
#Clone this repo and then move the notebook to that folder and run it

!git clone https://github.com/d-li14/mobilenetv3.pytorch

In [1]:
import time
from mobilenetv3 import mobilenetv3_large,mobilenetv3_small
import h5py
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torchvision.models as models
from torch.autograd import Variable
from tqdm import tqdm
import os
import torch.utils.data as data
import torchvision.transforms as transforms
from PIL import Image
import re
import json
import torch.nn.functional as F
import torch.nn.init as init
from torch.nn.utils.rnn import pack_padded_sequence
import itertools
import json
import os
from collections import Counter
from itertools import takewhile
from pprint import pprint
from datetime import datetime

cuda = torch.cuda.is_available()

In [2]:
cuda

True

In [3]:
device = torch.device("cuda" if cuda else "cpu")

In [4]:
IMG_EXTENSIONS = [
    '.jpg', '.JPG', '.jpeg', '.JPEG',
    '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP',
]


def is_image_file(filename):
    return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)


class ImageDataset(data.Dataset):

    def __init__(self, path, transform=None):
        self.path = path
        self.transform = transform

        # Load the paths to the images available in the folder
        self.image_names = self._load_img_paths()

        if len(self.image_names) == 0:
            raise (RuntimeError("Found 0 images in " + path + "\n"
                                                              "Supported image extensions are: " + ",".join(
                IMG_EXTENSIONS)))
        else:
            print('Found {} images in {}'.format(len(self), self.path))

    def __getitem__(self, index):
        item = {}
        item['name'] = self.image_names[index]
        item['path'] = os.path.join(self.path, item['name'])

        # Use PIL to load the image
        item['visual'] = Image.open(item['path']).convert('RGB')
        if self.transform is not None:
            item['visual'] = self.transform(item['visual'])

        return item

    def __len__(self):
        return len(self.image_names)

    def _load_img_paths(self):
        images = []
        for name in os.listdir(self.path):
            if is_image_file(name):
                images.append(name)
        return images

In [5]:
def get_transform(img_size):
    return transforms.Compose([
        transforms.Resize(img_size),
        transforms.CenterCrop(img_size),
        transforms.ToTensor(),
        # TODO : Compute mean and std of VizWiz
        # ImageNet normalization
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

In [6]:
class NetFeatureExtractor(nn.Module):

    def __init__(self):
        super(NetFeatureExtractor, self).__init__()
        self.model = mobilenetv3_large()
        self.model.classifier[0] = nn.Sequential()
        self.model.classifier[1] = nn.Linear(960,1280)
        self.model.classifier[3] = nn.Sequential()
        self.model.classifier[5] = nn.Linear(1280,1000)
        # Input of size 224x224x3.
        self.model.load_state_dict(torch.load('pretrained/mobilenetv3-large-657e7b3d.pth'))

        # Save attention features (tensor)
        def save_att_features(module, input, output):
            self.att_feat = output

        # Save no-attention features (vector)
        def save_noatt_features(module, input, output):
            self.no_att_feat = output

        # This is a forward hook. Is executed each time forward is executed
        self.model.conv.register_forward_hook(save_att_features)
        self.model.avgpool.register_forward_hook(save_noatt_features)

    def forward(self, x):
        self.model(x)
        return self.no_att_feat, self.att_feat  # [batch_size, 960], [batch_size, 960, 7, 7]


def image_feature_extractor_train():
    # Benchmark mode is good whenever your input sizes for your network do not vary
    cudnn.benchmark = True

    net = NetFeatureExtractor().cuda()
    net.eval()
    # Resize, Crop, Normalize
    transform = get_transform(224)
    dataset = ImageDataset("train", transform=transform)

    data_loader = torch.utils.data.DataLoader(dataset,batch_size=4,num_workers=4,shuffle=False,pin_memory=True)

    h5_file = h5py.File("mobile_net_train.h5", 'w')

    dummy_input = Variable(torch.ones(1, 3, 224, 224), volatile=True).cuda()
    _, dummy_output = net(dummy_input)

    att_features_shape = (len(data_loader.dataset),dummy_output.size(1),dummy_output.size(2),dummy_output.size(3))

    noatt_features_shape = (len(data_loader.dataset),dummy_output.size(1))

    h5_att = h5_file.create_dataset('att', shape=att_features_shape, dtype='float16')
    h5_noatt = h5_file.create_dataset('noatt', shape=noatt_features_shape, dtype='float16')

    # save order of extraction
    dt = h5py.special_dtype(vlen=str)
    img_names = h5_file.create_dataset('img_name', shape=(len(data_loader.dataset),), dtype=dt)

    begin = time.time()
    end = time.time()

    print('Extracting features ...')
    idx = 0
    delta = 4

    for i, inputs in enumerate(tqdm(data_loader)):
        with torch.no_grad():
            inputs_img = Variable(inputs['visual']).cuda()
            no_att_feat, att_feat = net(inputs_img)

            # reshape (batch_size, 960)
            no_att_feat = no_att_feat.view(-1, 960)

            h5_noatt[idx:idx + delta] = no_att_feat.data.cpu().numpy().astype('float16')
            h5_att[idx:idx + delta, :, :] = att_feat.data.cpu().numpy().astype('float16')
            img_names[idx:idx + delta] = inputs['name']

            idx += delta
    h5_file.close()

    end = time.time() - begin

    print('Finished in {}m and {}s'.format(int(end / 60), int(end % 60)))
    print('Created file ')
    
def image_feature_extractor_val():
    # Benchmark mode is good whenever your input sizes for your network do not vary
    cudnn.benchmark = True

    net = NetFeatureExtractor().cuda()
    net.eval()
    # Resize, Crop, Normalize
    transform = get_transform(224)
    dataset = ImageDataset("val", transform=transform)

    data_loader = torch.utils.data.DataLoader(dataset,batch_size=4,num_workers=4,shuffle=False,pin_memory=True)

    h5_file = h5py.File("mobile_net_val.h5", 'w')

    dummy_input = Variable(torch.ones(1, 3, 224, 224), volatile=True).cuda()
    _, dummy_output = net(dummy_input)

    att_features_shape = (len(data_loader.dataset),dummy_output.size(1),dummy_output.size(2),dummy_output.size(3))

    noatt_features_shape = (len(data_loader.dataset),dummy_output.size(1))

    h5_att = h5_file.create_dataset('att', shape=att_features_shape, dtype='float16')
    h5_noatt = h5_file.create_dataset('noatt', shape=noatt_features_shape, dtype='float16')

    # save order of extraction
    dt = h5py.special_dtype(vlen=str)
    img_names = h5_file.create_dataset('img_name', shape=(len(data_loader.dataset),), dtype=dt)

    begin = time.time()
    end = time.time()

    print('Extracting features ...')
    idx = 0
    delta = 4

    for i, inputs in enumerate(tqdm(data_loader)):
        with torch.no_grad():
            inputs_img = Variable(inputs['visual']).cuda()
            no_att_feat, att_feat = net(inputs_img)

            # reshape (batch_size, 960)
            no_att_feat = no_att_feat.view(-1, 960)

            h5_noatt[idx:idx + delta] = no_att_feat.data.cpu().numpy().astype('float16')
            h5_att[idx:idx + delta, :, :] = att_feat.data.cpu().numpy().astype('float16')
            img_names[idx:idx + delta] = inputs['name']

            idx += delta
    h5_file.close()

    end = time.time() - begin

    print('Finished in {}m and {}s'.format(int(end / 60), int(end % 60)))
    print('Created file : ')
    
def image_feature_extractor_test():
    # Benchmark mode is good whenever your input sizes for your network do not vary
    cudnn.benchmark = True

    net = NetFeatureExtractor().cuda()
    net.eval()
    # Resize, Crop, Normalize
    transform = get_transform(224)
    dataset = ImageDataset("test", transform=transform)

    data_loader = torch.utils.data.DataLoader(dataset,batch_size=4,num_workers=4,shuffle=False,pin_memory=True)

    h5_file = h5py.File("mobile_net_test.h5", 'w')

    dummy_input = Variable(torch.ones(1, 3, 224, 224), volatile=True).cuda()
    _, dummy_output = net(dummy_input)

    att_features_shape = (len(data_loader.dataset),dummy_output.size(1),dummy_output.size(2),dummy_output.size(3))

    noatt_features_shape = (len(data_loader.dataset),dummy_output.size(1))

    h5_att = h5_file.create_dataset('att', shape=att_features_shape, dtype='float16')
    h5_noatt = h5_file.create_dataset('noatt', shape=noatt_features_shape, dtype='float16')

    # save order of extraction
    dt = h5py.special_dtype(vlen=str)
    img_names = h5_file.create_dataset('img_name', shape=(len(data_loader.dataset),), dtype=dt)

    begin = time.time()
    end = time.time()

    print('Extracting features ...')
    idx = 0
    delta = 4

    for i, inputs in enumerate(tqdm(data_loader)):
        with torch.no_grad():
            inputs_img = Variable(inputs['visual']).cuda()
            no_att_feat, att_feat = net(inputs_img)

            # reshape (batch_size, 960)
            no_att_feat = no_att_feat.view(-1, 960)

            h5_noatt[idx:idx + delta] = no_att_feat.data.cpu().numpy().astype('float16')
            h5_att[idx:idx + delta, :, :] = att_feat.data.cpu().numpy().astype('float16')
            img_names[idx:idx + delta] = inputs['name']

            idx += delta
    h5_file.close()

    end = time.time() - begin

    print('Finished in {}m and {}s'.format(int(end / 60), int(end % 60)))
    print('Created file : ')

In [7]:
def prepare_questions(annotations):
    """ Filter, Normalize and Tokenize question. """

    prepared = []
    questions = [q['question'] for q in annotations]

    for question in questions:
        # lower case
        question = question.lower()

        # define desired replacements here
        punctuation_dict = {'.': ' ', "'": '', '?': ' ', '_': ' ', '-': ' ', '/': ' ', ',': ' '}
        conversational_dict = {"thank you": '', "thanks": '', "thank": '', "please": '', "hello": '',
                               "hi ": ' ', "hey ": ' ', "good morning": '', "good afternoon": '', "have a nice day": '',
                               "okay": '', "goodbye": ''}

        rep = punctuation_dict
        rep.update(conversational_dict)

        # use these three lines to do the replacement
        rep = dict((re.escape(k), v) for k, v in rep.items())
        pattern = re.compile("|".join(rep.keys()))
        question = pattern.sub(lambda m: rep[re.escape(m.group(0))], question)

        # sentence to list
        question = question.split(' ')

        # remove empty strings
        question = list(filter(None, question))

        prepared.append(question)

    return prepared

In [8]:
def prepare_answers(annotations):
    answers = [[a['answer'] for a in ans_dict['answers']] for ans_dict in annotations]
    prepared = []

    for sample_answers in answers:
        prepared_sample_answers = []
        for answer in sample_answers:
            # lower case
            answer = answer.lower()

            # define desired replacements here
            punctuation_dict = {'.': ' ', "'": '', '?': ' ', '_': ' ', '-': ' ', '/': ' ', ',': ' '}

            rep = punctuation_dict
            rep = dict((re.escape(k), v) for k, v in rep.items())
            pattern = re.compile("|".join(rep.keys()))
            answer = pattern.sub(lambda m: rep[re.escape(m.group(0))], answer)
            prepared_sample_answers.append(answer)

        prepared.append(prepared_sample_answers)
    return prepared

In [9]:
def create_question_vocab(questions, min_count=0):
    """
    Extract vocabulary used to tokenize and encode questions.
    """
    words = itertools.chain.from_iterable([q for q in questions])  # chain('ABC', 'DEF') --> A B C D E F
    counter = Counter(words)

    counted_words = counter.most_common()
    # select only the words appearing at least min_count
    selected_words = list(takewhile(lambda x: x[1] >= min_count, counted_words))

    vocab = {t[0]: i for i, t in enumerate(selected_words, start=1)}

    return vocab


def create_answer_vocab(annotations, top_k):
    answers = itertools.chain.from_iterable(prepare_answers(annotations))

    counter = Counter(answers)
    counted_ans = counter.most_common(top_k)
    # start from labels from 0
    vocab = {t[0]: i for i, t in enumerate(counted_ans, start=0)}

    return vocab

In [21]:
def question_answer_vocabulary(mode = "train"):
    # Load annotations
    dir_path = "Annotations"

    # vocabs are created based on train (trainval) split only
    train_path = os.path.join(dir_path, mode + '.json')
    with open(train_path, 'r',encoding = 'utf-8') as fd:
        train_ann = json.load(fd)

    questions = prepare_questions(train_ann)

    question_vocab = create_question_vocab(questions, 0)
    answer_vocab = create_answer_vocab(train_ann, 3000)

    # Save pre-processing vocabs
    vocabs = {
        'question': question_vocab,
        'answer': answer_vocab,
    }

    with open("vocabs.json", 'w') as fd:
        json.dump(vocabs, fd)

    print("vocabs saved in {}".format("vocabs.json"))

In [11]:
def encode_question(question, token_to_index, max_length):
    question_vec = torch.zeros(max_length).long()
    length = min(len(question), max_length)
    for i in range(length):
        token = question[i]
        index = token_to_index.get(token, 0)
        question_vec[i] = index
    # empty encoded questions are a problem when packed,
    # if we set min length 1 we feed a 0 token to the RNN
    # that is not a problem since the token 0 does not represent a word
    return question_vec, max(length, 1)


def encode_answers(answers, answer_to_index):
    answer_vec = torch.zeros(len(answer_to_index))
    for answer in answers:
        index = answer_to_index.get(answer)
        if index is not None:
            answer_vec[index] += 1
    return answer_vec

In [12]:
class FeaturesDataset(data.Dataset):

    def __init__(self, features_path, mode):
        self.path_hdf5 = features_path

        assert os.path.isfile(self.path_hdf5), \
            'File not found in {}, you must extract the features first with images_preprocessing.py'.format(
                self.path_hdf5)

        self.hdf5_file = h5py.File(self.path_hdf5, 'r')
        self.dataset_features = self.hdf5_file[mode]  # noatt or att (attention)

    def __getitem__(self, index):
        return torch.from_numpy(self.dataset_features[index].astype('float32'))

    def __len__(self):
        return self.dataset_features.shape[0]

In [25]:
class VQADataset(data.Dataset):
    """ VQA dataset, open-ended """

    def __init__(self, split,h5_path):
        super(VQADataset, self).__init__()

        with open("vocabs.json", 'r',encoding = 'utf-8') as fd:
            vocabs = json.load(fd)

        annotations_dir = "Annotations"

        path_ann = os.path.join(annotations_dir, split + ".json")
        with open(path_ann, 'r',encoding = 'utf-8') as fd:
            self.annotations = json.load(fd)

        self.max_question_length = 26
        self.split = split

        # vocab
        self.vocabs = vocabs
        self.token_to_index = self.vocabs['question']
        self.answer_to_index = self.vocabs['answer']

        # pre-process questions and answers
        self.questions = prepare_questions(self.annotations)
        self.questions = [encode_question(q, self.token_to_index, self.max_question_length) for q in
                          self.questions]  # encode questions and return question and question lenght

        if self.split != 'test':
            self.answers = prepare_answers(self.annotations)
            self.answers = [encode_answers(a, self.answer_to_index) for a in
                            self.answers]  # create a sparse vector of len(self.answer_to_index) for each question containing the occurances of each answer

        if self.split == "train" or self.split == "trainval":
            self._filter_unanswerable_samples()

        # load image names in feature extraction order
        with h5py.File(h5_path, 'r') as f:
            img_names = f['img_name'][()]
        self.name_to_id = {name: i for i, name in enumerate(img_names)}

        # names in the annotations, will be used to get items from the dataset
        self.img_names = [s['image'] for s in self.annotations]
        # load features
        self.features = FeaturesDataset(h5_path, "att")

    def _filter_unanswerable_samples(self):
        """
        Filter during training the samples that do not have at least one answer
        """
        a = []
        q = []
        annotations = []
        for i in range(len(self.answers)):
            if len(self.answers[i].nonzero()) > 0:
                a.append(self.answers[i])
                q.append(self.questions[i])

                annotations.append(self.annotations[i])
        self.answers = a
        self.questions = q
        self.annotations = annotations

    @property
    def num_tokens(self):
        return len(self.token_to_index) + 1  # add 1 for <unknown> token at index 0

    def __getitem__(self, i):

        item = {}
        item['question'], item['q_length'] = self.questions[i]
        if self.split != 'test':
            item['answer'] = self.answers[i]
        img_name = self.img_names[i]
        feature_id = self.name_to_id[img_name]
        item['img_name'] = self.img_names[i]
        item['visual'] = self.features[feature_id]
        # collate_fn sorts the samples in order to be possible to pack them later in the model.
        # the sample_id is returned so that the original order can be restored during when evaluating the predictions
        item['sample_id'] = i

        return item

    def __len__(self):
        return len(self.questions)

In [14]:
def collate_fn(batch):
    # Sort samples in the batch based on the question lengths in descending order.
    # This allows to pack the pack_padded_sequence when encoding questions using RNN
    batch.sort(key=lambda x: x['q_length'], reverse=True)
    return data.dataloader.default_collate(batch)

def get_loader(split,h5_path):
    """ Returns the data loader of the specified dataset split """
    dataset = VQADataset(split,h5_path)

    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=128,
        shuffle=True if split == 'train' or split == 'trainval' else False,  # only shuffle the data in training
        pin_memory=True,
        num_workers=4,
        collate_fn=collate_fn,
    )
    return loader

In [15]:
class Model(nn.Module):
    """
    References :
     1 - https://arxiv.org/abs/1704.03162
     2 - https://arxiv.org/pdf/1511.02274
     3 - https://arxiv.org/abs/1708.00584
    """

    def __init__(self, num_tokens):
        super(Model, self).__init__()

        dim_v = 960
        dim_q = 1024
        dim_h = 1024

        n_glimpses = 2

        self.text = TextEncoder(
            num_tokens=num_tokens,
            emb_size=300,
            dim_q=dim_q,
            drop=0.25,
        )
        self.attention = Attention(
            dim_v=dim_v,
            dim_q=dim_q,
            dim_h=512,
            n_glimpses=n_glimpses,
            drop=0.5,
        )
        self.classifier = Classifier(
            dim_input=n_glimpses * dim_v + dim_q,
            dim_h=dim_h,
            top_ans=3000,
            drop=0.5,
        )

        for m in self.modules():
            if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
                init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, v, q, q_len):

        q = self.text(q, list(q_len.data))
        # L2 normalization on the depth dimension
        v = F.normalize(v, p=2, dim=1)
        attention_maps = self.attention(v, q)
        v = apply_attention(v, attention_maps)
        # concatenate attended features and encoded question
        combined = torch.cat([v, q], dim=1)
        answer = self.classifier(combined)
        return answer


class Classifier(nn.Sequential):
    def __init__(self, dim_input, dim_h, top_ans, drop=0.0):
        super(Classifier, self).__init__()
        self.add_module('drop1', nn.Dropout(drop))
        self.add_module('lin1', nn.Linear(dim_input, dim_h))
        self.add_module('relu', nn.ReLU())
        self.add_module('drop2', nn.Dropout(drop))
        self.add_module('lin2', nn.Linear(dim_h, top_ans))


class TextEncoder(nn.Module):
    def __init__(self, num_tokens, emb_size, dim_q, drop=0.0):
        super(TextEncoder, self).__init__()
        self.embedding = nn.Embedding(num_tokens, emb_size, padding_idx=0)
        self.dropout = nn.Dropout(drop)
        self.tanh = nn.Tanh()
        self.lstm = nn.LSTM(input_size=emb_size,
                            hidden_size=dim_q,
                            num_layers=1)
        self.dim_q = dim_q

        # Initialize parameters
        self._init_lstm(self.lstm.weight_ih_l0)
        self._init_lstm(self.lstm.weight_hh_l0)
        self.lstm.bias_ih_l0.data.zero_()
        self.lstm.bias_hh_l0.data.zero_()

        init.xavier_uniform_(self.embedding.weight)

    def _init_lstm(self, weight):
        for w in weight.chunk(4, 0):
            init.xavier_uniform_(w)

    def forward(self, q, q_len):
        embedded = self.embedding(q)
        tanhed = self.tanh(self.dropout(embedded))
        # pack to feed to the LSTM
        packed = pack_padded_sequence(tanhed, q_len, batch_first=True)
        _, (h, _) = self.lstm(packed)
        # _, (_, c) = self.lstm(packed)
        return h.squeeze(0)


class Attention(nn.Module):
    def __init__(self, dim_v, dim_q, dim_h, n_glimpses, drop=0.0):
        super(Attention, self).__init__()
        # As specified in https://arxiv.org/pdf/1511.02274.pdf the bias is already included in fc_q
        self.conv_v = nn.Conv2d(dim_v, dim_h, 1, bias=False)
        self.fc_q = nn.Linear(dim_q, dim_h)
        self.conv_x = nn.Conv2d(dim_h, n_glimpses, 1)

        self.dropout = nn.Dropout(drop)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, v, q):
        # bring to the same shape
        v = self.conv_v(self.dropout(v))
        q = self.fc_q(self.dropout(q))
        q = repeat_encoded_question(q, v)
        # sum element-wise and ReLU
        x = self.relu(v + q)

        x = self.conv_x(self.dropout(x))  # We obtain n_glimpses attention maps [batch_size][n_glimpses][14][14]
        return x


def repeat_encoded_question(q, v):
    """
    Repeat the encoded question over all the spatial positions of the input image feature tensor.
    :param q: shape [batch_size][h]
    :param v: shape [batch_size][h][7][7]
    :return: a tensor constructed repeating q 7x7 with shape [batch_size][h][7][7]
    """
    batch_size, h = q.size()
    # repeat the encoded question [14x14] times (over all the spatial positions of the image feature matrix)
    q_tensor = q.view(batch_size, h, *([1, 1])).expand_as(v)
    return q_tensor


def apply_attention(v, attention):
    """
    Apply attention maps over the input image features.
    """
    batch_size, spatial_vec_size = v.size()[:2]
    glimpses = attention.size(1)

    # flatten the spatial dimensions [7x7] into a third dimension [49]
    v = v.view(batch_size, spatial_vec_size, -1)
    attention = attention.view(batch_size, glimpses, -1)
    n_image_regions = v.size(2)  # 7x7 = 49

    # Apply softmax to each attention map separately to create n_glimpses attention distribution over the image regions
    attention = attention.view(batch_size * glimpses, -1)  # [batch_size x n_glimpses][49]
    attention = F.softmax(attention, dim=1)

    # apply the weighting by creating a new dim to tile both tensors over
    target_size = [batch_size, glimpses, spatial_vec_size, n_image_regions]
    v = v.view(batch_size, 1, spatial_vec_size, n_image_regions).expand(
        *target_size)  # [batch_size][n_glimpses][960[49]
    attention = attention.view(batch_size, glimpses, 1, n_image_regions).expand(
        *target_size)  # [batch_size][n_glimpses][960][49]
    # Weighted sum over all the spatial regions vectors
    weighted = v * attention
    weighted_mean = weighted.sum(dim=3)  # [batch_size][n_glimpses][960]

    # attended features are flattened in the same dimension
    return weighted_mean.view(batch_size, -1)  # [batch_size][n_glimpses * 960]

In [16]:
def vqa_accuracy(predicted, true):
    """ Approximation of VQA accuracy metric """
    _, predicted_index = predicted.max(dim=1, keepdim=True)
    agreeing = true.gather(dim=1, index=predicted_index)
    return (agreeing * 0.33333).clamp(max=1)  # * 0.33333 is a good approximation of the VQA metric


class Tracker:

    def __init__(self):
        self.data = {}

    def track(self, name, *monitors):
        l = Tracker.ListStorage(monitors)
        self.data.setdefault(name, []).append(l)
        return l

    def to_dict(self):
        return {k: list(map(list, v)) for k, v in self.data.items()}

    class ListStorage:
        def __init__(self, monitors=[]):
            self.data = []
            self.monitors = monitors
            for monitor in self.monitors:
                setattr(self, monitor.name, monitor)

        def append(self, item):
            for monitor in self.monitors:
                monitor.update(item)
            self.data.append(item)

        def __iter__(self):
            return iter(self.data)

    class MeanMonitor:
        name = 'mean'

        def __init__(self):
            self.n = 0
            self.total = 0

        def update(self, value):
            self.total += value
            self.n += 1

        @property
        def value(self):
            return self.total / self.n

    class MovingMeanMonitor:
        name = 'mean'

        def __init__(self, momentum=0.9):
            self.momentum = momentum
            self.first = True
            self.value = None

        def update(self, value):
            if self.first:
                self.value = value
                self.first = False
            else:
                m = self.momentum
                self.value = m * self.value + (1 - m) * value


def get_id_from_name(name):
    import re

    n = re.search('VizWiz_(.+?)_', name)
    if n:
        split = n.group(1)

    m = re.search(('VizWiz_%s_(.+?).jpg' % split), name)
    if m:
        found = m.group(1)

    return int(found)

In [19]:
#Creating image features file
image_feature_extractor_train()

#Create question and answer vocabulary
question_answer_vocabulary(mode = "train")

Found 23954 images in train


  0%|          | 0/5989 [00:00<?, ?it/s]

Extracting features ...


100%|██████████| 5989/5989 [06:10<00:00, 16.16it/s]


Finished in 6m and 10s
Created file 
vocabs saved in vocabs.json


In [22]:
question_answer_vocabulary(mode = "train")

vocabs saved in vocabs.json


In [20]:
image_feature_extractor_val()
image_feature_extractor_test()

  0%|          | 0/1938 [00:00<?, ?it/s]

Found 7750 images in val
Extracting features ...


100%|██████████| 1938/1938 [01:52<00:00, 17.20it/s]
  0%|          | 0/2000 [00:00<?, ?it/s]

Finished in 1m and 52s
Created file : 
Found 8000 images in test
Extracting features ...


100%|██████████| 2000/2000 [02:04<00:00, 16.01it/s]

Finished in 2m and 4s
Created file : 





In [17]:
#Training
def train(model, loader, optimizer, tracker, epoch, split):
    model.train()

    tracker_class, tracker_params = tracker.MovingMeanMonitor, {'momentum': 0.99}
    tq = tqdm(loader, desc='{} E{:03d}'.format(split, epoch), ncols=0)
    loss_tracker = tracker.track('{}_loss'.format(split), tracker_class(**tracker_params))
    acc_tracker = tracker.track('{}_acc'.format(split), tracker_class(**tracker_params))
    log_softmax = nn.LogSoftmax(dim=1).cuda()

    for item in tq:
        v = item['visual']
        q = item['question']
        a = item['answer']
        q_length = item['q_length']

        v = Variable(v.cuda())
        q = Variable(q.cuda())
        a = Variable(a.cuda())
        q_length = Variable(q_length.cuda())

        out = model(v, q, q_length)

        # This is the Soft-loss described in https://arxiv.org/pdf/1708.00584.pdf

        nll = -log_softmax(out)

        loss = (nll * a / 10).sum(dim=1).mean()
        acc = vqa_accuracy(out.data, a.data).cpu()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_tracker.append(loss.item())
        acc_tracker.append(acc.mean())
        fmt = '{:.4f}'.format
        tq.set_postfix(loss=fmt(loss_tracker.mean.value), acc=fmt(acc_tracker.mean.value))

In [18]:
#Evaluating
def evaluate(model, loader, tracker, epoch, split):
    model.eval()
    tracker_class, tracker_params = tracker.MeanMonitor, {}

    predictions = []
    samples_ids = []
    accuracies = []

    tq = tqdm(loader, desc='{} E{:03d}'.format(split, epoch), ncols=0)
    loss_tracker = tracker.track('{}_loss'.format(split), tracker_class(**tracker_params))
    acc_tracker = tracker.track('{}_acc'.format(split), tracker_class(**tracker_params))
    log_softmax = nn.LogSoftmax(dim=1).cuda()

    with torch.no_grad():
        for item in tq:
            v = item['visual']
            q = item['question']
            a = item['answer']
            sample_id = item['sample_id']
            q_length = item['q_length']

            v = Variable(v.cuda())
            q = Variable(q.cuda())
            a = Variable(a.cuda())
            q_length = Variable(q_length.cuda())

            out = model(v, q, q_length)

            # This is the Soft-loss described in https://arxiv.org/pdf/1708.00584.pdf

            nll = -log_softmax(out)

            loss = (nll * a / 10).sum(dim=1).mean()
            acc = vqa_accuracy(out.data, a.data).cpu()

            # save predictions of this batch
            _, answer = out.data.cpu().max(dim=1)

            predictions.append(answer.view(-1))
            accuracies.append(acc.view(-1))
            # Sample id is necessary to obtain the mapping sample-prediction
            samples_ids.append(sample_id.view(-1).clone())

            loss_tracker.append(loss.item())
            acc_tracker.append(acc.mean())
            fmt = '{:.4f}'.format
            tq.set_postfix(loss=fmt(loss_tracker.mean.value), acc=fmt(acc_tracker.mean.value))

        predictions = list(torch.cat(predictions, dim=0))
        accuracies = list(torch.cat(accuracies, dim=0))
        samples_ids = list(torch.cat(samples_ids, dim=0))

    eval_results = {
        'answers': predictions,
        'accuracies': accuracies,
        'samples_ids': samples_ids,
        'avg_accuracy': acc_tracker.mean.value,
        'avg_loss': loss_tracker.mean.value
    }

    return eval_results

In [27]:
#Data Loader
train_loader = get_loader("train","mobile_net_train.h5")
val_loader = get_loader("val","mobile_net_val.h5")

#Model
model = Model(train_loader.dataset.num_tokens)
model = model.to(device)

#Optimizer
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),lr=0.001)

#Tracker
tracker = Tracker()

In [28]:
min_loss = 10
max_accuracy = 0

In [32]:
path_log_dir = "logs"

if not os.path.exists(path_log_dir):
    os.makedirs(path_log_dir)

print('Model logs will be saved in {}'.format(path_log_dir))

path_best_accuracy = os.path.join(path_log_dir, 'best_accuracy_log.pth')
path_best_loss = os.path.join(path_log_dir, 'best_loss_log.pth')

Model logs will be saved in logs


In [33]:
#Training
for i in range(10):
    train(model, train_loader, optimizer, tracker, epoch=i, split='train')
    # If we are training on the train split (and not on train+val) we can evaluate on val
    split='train'
    if split == 'train':
        eval_results = evaluate(model, val_loader, tracker, epoch=i, split='val')

        # save all the information in the log file
        log_data = {
            'epoch': i,
            'tracker': tracker.to_dict(),
            'weights': model.state_dict(),
            'eval_results': eval_results,
            'vocabs': train_loader.dataset.vocabs,
        }

        # save logs for min validation loss and max validation accuracy
        if eval_results['avg_loss'] < min_loss:
            torch.save(log_data, path_best_loss)  # save model
            min_loss = eval_results['avg_loss']  # update min loss value

        if eval_results['avg_accuracy'] > max_accuracy:
            torch.save(log_data, path_best_accuracy)  # save model
            max_accuracy = eval_results['avg_accuracy']  # update max accuracy value

# Save final model
log_data = {
    'tracker': tracker.to_dict(),
    'weights': model.state_dict(),
    'vocabs': train_loader.dataset.vocabs,
}

path_final_log = os.path.join(path_log_dir, 'final_log.pth')
torch.save(log_data, path_final_log)

train E000:   0% 0/156 [00:00<?, ?it/s]

TypeError: h5py objects cannot be pickled