### Modeling DAQUAR
* [Dataset](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/research/vision-and-language/visual-turing-challenge)

* [Original Paper](https://proceedings.neurips.cc/paper_files/paper/2014/file/d516b13671a4179d9b7b458a6ebdeb92-Paper.pdf)

### Setup Environment:

In [1]:
import os
import pandas as pd

from src.classifiers import process_labels, split_data
from src.classifiers_base import preprocess_df

from transformers import BertTokenizer

from src.multimodal_data_loader import VQADataset
from torch.utils.data import DataLoader

from src.classifiers_base import train_early_fusion, train_late_fusion

In [2]:
# Root folder of the DAQUAR dataset, relative to the notebook's working directory.
PATH = "datasets/daquar/"

In [3]:
# Locations inside the dataset root: the annotation CSV that is loaded with
# pandas, and the image folder that is handed to preprocess_df.
text_path, images_path = (
    os.path.join(PATH, 'labels.csv'),
    os.path.join(PATH, 'images'),
)

## Get data

In [4]:
# Load the annotation table (columns: question, image_id, answer, split).
df = pd.read_csv(filepath_or_buffer=text_path)

# Bare trailing expression so the notebook renders the frame as rich output.
df

Unnamed: 0,question,image_id,answer,split
0,what is on the right side of the black telepho...,image3,desk,train
1,what is in front of the white door on the left...,image3,telephone,train
2,what is on the desk in the image3 ?,image3,"book, scissor, papers, tape_dispenser",train
3,what is the largest brown objects in this imag...,image3,carton,train
4,what color is the chair in front of the white ...,image3,red,train
...,...,...,...,...
12463,what is found below the chandelier in the imag...,image1448,table,test
12464,what is on the floor in the image1449 ?,image1449,rug,test
12465,what are around dining table in the image1449 ?,image1449,chair,test
12466,what is at the opposite side of the dining tab...,image1449,decoration_item,test


## Data Preparation

In [None]:
# Names of the columns holding the question text, the image identifier and
# the answer (label) respectively.
text_columns, image_columns, label_columns = 'question', 'image_id', 'answer'

# NOTE(review): `df` is rebound to the preprocessed frame, so re-running this
# cell feeds already-preprocessed data back into preprocess_df — confirm that
# preprocess_df is safe to apply twice, or restart the kernel before re-running.
df = preprocess_df(df, image_columns, images_path)

# Partition the preprocessed frame into training and test frames.
train_df, test_df = split_data(df)

# Process and one-hot encode the training labels. The call also returns the
# fitted encoder (`mlb`) and the training label columns, which are reused below.
train_labels, mlb, train_columns = process_labels(train_df, col=label_columns)

# Encode the test labels against the columns obtained from the training set.
test_labels = process_labels(test_df, train_columns=train_columns, col=label_columns)

In [None]:
# Display the training split produced by split_data for a quick visual check.
train_df

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer; it is handed to the
# VQADataset instances together with the text column.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
)

In [None]:
# Wrap each split in a VQADataset. Both share the same column names, label
# encoder, training label columns and tokenizer; only the frame differs.
train_dataset = VQADataset(
    train_df, text_columns, image_columns, label_columns,
    mlb, train_columns, tokenizer,
)
test_dataset = VQADataset(
    test_df, text_columns, image_columns, label_columns,
    mlb, train_columns, tokenizer,
)

# Training batches are shuffled; evaluation batches keep the dataset order.
train_loader = DataLoader(
    dataset=train_dataset,
    num_workers=4,
    shuffle=True,
    batch_size=512,
)
test_loader = DataLoader(
    dataset=test_dataset,
    num_workers=4,
    shuffle=False,
    batch_size=256,
)

### Models

In [None]:
# A question may have several answers (e.g. "book, scissor, papers, ..."),
# so the models are trained in multilabel mode.
multilabel = True

# One model output per class known to the fitted label encoder.
output_size = len(mlb.classes_)

In [None]:
# Run early-fusion training: 30 epochs, learning rate 1e-4, with reporting
# enabled (report=True).
print("Training Early Fusion Model:")
train_early_fusion(
    train_loader,
    test_loader,
    output_size,
    lr=1e-4,
    num_epochs=30,
    report=True,
    multilabel=multilabel,
)

In [None]:
# Run late-fusion training: 30 epochs with reporting enabled (report=True).
# NOTE(review): no `lr` is passed here, unlike the early-fusion call
# (lr=0.0001), so the function's own default applies — confirm this is intended.
print("Training Late Fusion Model:")
train_late_fusion(
    train_loader,
    test_loader,
    output_size,
    report=True,
    multilabel=multilabel,
    num_epochs=30,
)

In [None]:
# Disabled scratch cell: sketch of building an EarlyFusionModel by hand from
# a TextModel and a VisionModel. Nothing here executes.
# NOTE(review): presumably superseded by the train_* helpers used above —
# consider removing this cell or moving the snippet into a markdown note.
#from src.classifiers_base import TextModel, VisionModel, EarlyFusionModel, LateFusionModel
#text_model = TextModel()
#image_model = VisionModel()
#model = EarlyFusionModel(text_model=text_model, image_model=image_model, output_size=output_size)