### Modeling DAQUAR
* [Dataset](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/research/vision-and-language/visual-turing-challenge)

* [Original Paper](https://proceedings.neurips.cc/paper_files/paper/2014/file/d516b13671a4179d9b7b458a6ebdeb92-Paper.pdf)

### Setup Environment:

In [1]:
import os

import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer

from src.classifiers import process_labels, split_data
from src.classifiers_base import preprocess_df
from src.classifiers_base import train_early_fusion, train_late_fusion
from src.multimodal_data_loader import VQADataset

In [2]:
# Root folder of the dataset; labels.csv and the images/ directory are resolved against it
PATH = 'datasets/coco-qa/'

# Seed the PyTorch RNG so that DataLoader shuffling (and any torch-based randomness
# inside the training helpers) repeats across Restart & Run All.
# Other RNG sources (random, numpy) are not seeded here.
SEED = 42
torch.manual_seed(SEED);

In [3]:
# Resolve the labels table and the image folder relative to PATH
text_path, images_path = (
    os.path.join(PATH, leaf) for leaf in ('labels.csv', 'images')
)

## Get data

In [4]:
# Read the labels CSV into a DataFrame and show it as the cell output
df = pd.read_csv(filepath_or_buffer=text_path)
df

Unnamed: 0,questions,image_id,answers,types,split
0,what is using umbrellas as a central theme,397899,sculpture,0,train
1,what walks toward the rope perimeter fence,310683,elephant,0,train
2,what is the color of the horses,23004,brown,2,train
3,where is the black cat laying down,117931,sink,3,train
4,what is the color of the character,220218,purple,2,train
...,...,...,...,...,...
117679,what are there grouped together here,406426,vegetables,0,test
117680,what serves as the train trestle,545581,bridge,0,test
117681,what is the color of the plate,40404,white,2,test
117682,what is sleeping on the blue couch,570521,cat,0,test


## Data Preparation

In [5]:
# Column roles: question text, image reference, and answer label
text_columns, image_columns, label_columns = 'questions', 'image_id', 'answers'

# NOTE(review): `df` is rebound to the preprocessed frame here, so re-running this cell
# feeds already-preprocessed rows back into preprocess_df — confirm that is safe.
df = preprocess_df(df, image_columns, images_path)

# Divide the preprocessed frame into its training and test portions
train_df, test_df = split_data(df)

# One-hot encode the answers. The training call also yields `mlb` and `train_columns`;
# the test call is given those same `train_columns`.
train_labels, mlb, train_columns = process_labels(train_df, col=label_columns)
test_labels = process_labels(test_df, train_columns=train_columns, col=label_columns)

100%|██████████| 117684/117684 [01:25<00:00, 1369.39it/s]
100%|██████████| 117684/117684 [01:04<00:00, 1820.34it/s]


Train Shape: (78736, 5)
Test Shape: (38948, 5)


In [6]:
# Preview the training split produced by split_data
train_df

Unnamed: 0,questions,image_id,answers,types,split
0,what is using umbrellas as a central theme,datasets/coco-qa/images/000000397899.jpg,sculpture,0,train
1,what walks toward the rope perimeter fence,datasets/coco-qa/images/000000310683.jpg,elephant,0,train
2,what is the color of the horses,datasets/coco-qa/images/000000023004.jpg,brown,2,train
3,where is the black cat laying down,datasets/coco-qa/images/000000117931.jpg,sink,3,train
4,what is the color of the character,datasets/coco-qa/images/000000220218.jpg,purple,2,train
...,...,...,...,...,...
78731,where are diced meat and tomatoes mixed with c...,datasets/coco-qa/images/000000111606.jpg,bowl,3,train
78732,what is parked at the airport and loading people,datasets/coco-qa/images/000000443687.jpg,airplane,0,train
78733,what cut into two with soup,datasets/coco-qa/images/000000279104.jpg,sandwich,0,train
78734,where is the white toilet sitting,datasets/coco-qa/images/000000534974.jpg,bathroom,3,train


In [7]:
# Load the pretrained 'bert-base-uncased' tokenizer; it is handed to VQADataset below
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

In [8]:
# Constructor arguments common to both splits
dataset_args = (text_columns, image_columns, label_columns, mlb, train_columns, tokenizer)

# One dataset per split, built with the same column names, label encoder and tokenizer
train_dataset = VQADataset(train_df, *dataset_args)
test_dataset = VQADataset(test_df, *dataset_args)

# Only the training loader shuffles; the test loader keeps dataset order
train_loader = DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=True,
    num_workers=8,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=256,
    shuffle=False,
    num_workers=4,
)

### Models

In [9]:
# Settings passed to both training calls below
multilabel = False
output_size = len(mlb.classes_)  # number of classes known to `mlb`

In [None]:
# Run the early fusion training helper for 10 epochs with reporting enabled
print("Training Early Fusion Model:")
train_early_fusion(
    train_loader,
    test_loader,
    output_size,
    num_epochs=10,
    multilabel=multilabel,
    report=True,
    lr=0.001,
)

Training Early Fusion Model:
The number of parameters of the model are: 252462
Epoch 1/10 - Test Accuracy: 0.3997


In [None]:
# Run the late fusion training helper for 10 epochs with reporting enabled.
# No `lr` is passed here, so the helper's own default applies.
print("Training Late Fusion Model:")
train_late_fusion(
    train_loader,
    test_loader,
    output_size,
    num_epochs=10,
    multilabel=multilabel,
    report=True,
)

In [None]:
# NOTE(review): disabled scratch code that builds an EarlyFusionModel by hand from a
# TextModel and a VisionModel; nothing in this cell executes. Remove it or move it
# into a documented example once it is no longer needed.
#from src.classifiers_base import TextModel, VisionModel, EarlyFusionModel, LateFusionModel
#text_model = TextModel()
#image_model = VisionModel()
#model = EarlyFusionModel(text_model=text_model, image_model=image_model, output_size=output_size)