### Modeling COCO - QA
* [Review](https://arxiv.org/abs/1610.01465)

In [1]:
# Work from the project root so that the `src` package imports and the relative
# `datasets/` paths used in the following cells resolve.
# An unconditional `%cd ..` climbs one more level on every re-run of this cell;
# the guard below keeps the cell idempotent.
# NOTE(review): assumes a `src/` directory exists only at the project root — confirm.
import os

if not os.path.isdir('src'):
    os.chdir('..')
print(os.getcwd())

/home/datascience/Data Fusion


### Setup Environment:

In [2]:
import os
import pandas as pd

from src.classifiers import process_labels, split_data
from src.classifiers_base import preprocess_df

from transformers import BertTokenizer

from src.multimodal_data_loader import VQADataset
from torch.utils.data import DataLoader

from src.classifiers_base_cpu_metrics import calculate_memory

In [3]:
# Root directory of the COCO-QA dataset, relative to the working directory set in the first cell
PATH = 'datasets/coco-qa/'

In [4]:
# Resolve the annotation table and the image folder relative to the dataset root
text_path, images_path = (
    os.path.join(PATH, leaf) for leaf in ('labels.csv', 'images')
)

## Get data

In [5]:
# Load the COCO-QA question/answer table (one row per question)
df = pd.read_csv(text_path)
# NOTE(review): the displayed frame has an `Unnamed: 0` column, presumably a saved
# index — confirm whether reading with `index_col=0` is wanted.
df

Unnamed: 0,questions,image_id,answers,types,split
0,what is using umbrellas as a central theme,397899,sculpture,0,train
1,what walks toward the rope perimeter fence,310683,elephant,0,train
2,what is the color of the horses,23004,brown,2,train
3,where is the black cat laying down,117931,sink,3,train
4,what is the color of the character,220218,purple,2,train
...,...,...,...,...,...
117679,what are there grouped together here,406426,vegetables,0,test
117680,what serves as the train trestle,545581,bridge,0,test
117681,what is the color of the plate,40404,white,2,test
117682,what is sleeping on the blue couch,570521,cat,0,test


## Data Preparation

In [None]:
# Column names for the text input, the image identifier and the target label
text_columns = 'questions'
image_columns = 'image_id'
label_columns = 'answers'

# Run the project's preprocessing on the image column, given the image directory.
# NOTE(review): `df` is rebound here, so re-running this cell feeds already
# preprocessed rows back into preprocess_df — confirm that is safe.
df = preprocess_df(df, image_columns, images_path)

# Split the data
# NOTE(review): the CSV already has a `split` column (train/test); confirm that
# split_data honours it rather than drawing a new split.
train_df, test_df = split_data(df)

# Process and one-hot encode labels for the training set; also keep `mlb`
# (presumably the fitted label binarizer) and the training label columns so the
# test set can be encoded the same way
train_labels, mlb, train_columns = process_labels(train_df, col=label_columns)
# Encode the test labels against the training columns.
# NOTE(review): this call binds one name while the training call unpacks three —
# confirm process_labels returns only the labels when train_columns is passed.
test_labels = process_labels(test_df, col=label_columns, train_columns=train_columns)

 47%|████▋     | 55360/117684 [03:01<02:52, 360.86it/s]

In [None]:
# Inspect the training split
train_df

In [None]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint; it is passed to both
# VQADataset instances in the next cell
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Both splits are wrapped with the same column names, label encoder and tokenizer;
# only the source frame differs.
_dataset_args = (text_columns, image_columns, label_columns, mlb, train_columns, tokenizer)
train_dataset = VQADataset(train_df, *_dataset_args)
test_dataset = VQADataset(test_df, *_dataset_args)

# Shared loader settings; only the training loader shuffles its samples.
_loader_opts = dict(batch_size=64, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_opts)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_opts)

### Models

In [None]:
# Number of answer classes known to the label encoder returned by process_labels
output_size = len(mlb.classes_)
# NOTE(review): not read by any of the visible cells — presumably a leftover
# single-label flag; confirm before relying on it
multilabel = False

In [None]:
# Call the project's memory helper with both loaders and the class count.
# NOTE(review): presumably reports model memory usage — confirm in
# src.classifiers_base_cpu_metrics.
calculate_memory(train_loader, test_loader, output_size)