## Downloading Packages

In [13]:
# Run this cell to import the packages you will need to unpack the dataset
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image
import pickle
import os
import zipfile
import scipy.ndimage
import tensorflow as tf

import sys
import json
import codecs

## Loading Data

In [14]:
data = pd.read_csv("vqa_mixed_yes_no_1.csv").iloc[:, 1:]
# Preview the first 5 lines of the loaded data 
data.head(5000)

Unnamed: 0,question,answer
0,are these energy bars,no
1,are these waters,no
2,are these waters,no
3,are these cheetos,no
4,are these juices,yes
5,are these juices,yes
6,are these apples,no
7,are these juices,yes
8,are these juices,yes
9,are these juices,yes


In [15]:
data.shape

(4660, 2)

## Preprocessing the Questions

In [16]:
## getting the dictionary
pkl_file = open("word_index_VQA_3.pickle", 'rb')
word_index = pickle.load(pkl_file)


In [17]:
word_index

{'a': 6,
 'an': 8,
 'apple': 57,
 'apples': 58,
 'are': 3,
 'banana': 74,
 'bananas': 76,
 'bar': 13,
 'bars': 14,
 'bean': 35,
 'beans': 36,
 'beef': 7,
 'beefs': 73,
 'breakfast': 11,
 'cereal': 33,
 'cereals': 34,
 'cheese': 75,
 'cheeses': 77,
 'cheeto': 46,
 'cheetos': 49,
 'chip': 9,
 'chips': 10,
 'chocolate': 15,
 'condiment': 16,
 'condiments': 17,
 'cookie': 25,
 'cookies': 26,
 'cupcake': 29,
 'cupcakes': 30,
 'drink': 45,
 'drinks': 48,
 'egg': 80,
 'eggs': 81,
 'energy': 12,
 'food': 41,
 'foods': 42,
 'granola': 21,
 'is': 1,
 'jerkies': 64,
 'jerky': 63,
 'juice': 31,
 'juices': 32,
 'mac': 27,
 'milk': 37,
 'milks': 38,
 'muffin': 65,
 'muffins': 67,
 'mushroom': 82,
 'mushrooms': 83,
 'n': 28,
 'noodle': 51,
 'noodles': 52,
 'nut': 84,
 'nuts': 85,
 'pie': 61,
 'pies': 62,
 'pill': 19,
 'pills': 20,
 'pizza': 47,
 'pizzas': 50,
 'pot': 18,
 'pretzel': 59,
 'pretzels': 60,
 'pudding': 53,
 'puddings': 54,
 'raw': 22,
 'salt': 71,
 'salts': 72,
 'soda': 23,
 'sodas': 24,

- Updating the Dictionary

In [None]:
## updating the dictionary if new words were added to the dataset

file = open('new_questions.txt', 'r')

curr_index = len(word_index)

for line in file:
    words = line.split(' ')
    for w in words:
        if(w not in word_index.keys):
            word_index.update({w: curr_index})
            curr_index+=1



In [12]:
### update words more easily

word_index.update({'jerkies': 64})

In [None]:
indices = []

In [17]:
### remove words if necessary
w = 'tomatos'

indices.append(word_index[w])
del word_index[w]

In [23]:
indices.append(78)

In [24]:
indices

[87, 78]

In [27]:
## saving the file
pickle_out = open("word_index_VQA_3.pickle", "wb")
pickle.dump(word_index, pickle_out)
pickle_out.close()

- Converting Questions to Sequences

In [23]:
## create a sequence for each new question and pad to length 50
from keras.preprocessing.sequence import pad_sequences

all_seq = np.zeros((4660, 50))
i = 0
for q in data['question']:
    #print('hi')
    words = q.split(' ')
    new_seq = []
    for w in words:
        new_seq.append(word_index.get(w))
    all_seq[i][-len(new_seq):]= new_seq
    i+=1
print(np.asarray(all_seq))
trans_all_qs = pad_sequences(all_seq, maxlen=50)
trans_all_qs.shape

[[ 0.  0.  0. ...  4. 12. 14.]
 [ 0.  0.  0. ...  3.  4. 40.]
 [ 0.  0.  0. ...  3.  4. 40.]
 ...
 [ 0.  0.  0. ...  3.  4. 10.]
 [ 0.  0.  0. ...  3.  4. 20.]
 [ 0.  0.  0. ...  3.  4. 10.]]


(4660, 50)

## Preprocessing the Answers

In [24]:
# OTHERWISE, load the data
file = open("ans_index_VQA_4.pickle", "rb")
ans_class = pickle.load(file)
file.close()

In [25]:
ans_class

{'apple': 31,
 'banana': 5,
 'bean': 4,
 'beef jerky': 6,
 'breakfast food': 20,
 'cereal': 9,
 'cheeto': 22,
 'chip': 10,
 'chocolate drink': 23,
 'condiment': 21,
 'cookie': 12,
 'cupcake': 13,
 'egg': 17,
 'energy bar': 18,
 'granola bar': 11,
 'juice': 2,
 'mac n cheese': 24,
 'milk': 14,
 'muffin': 15,
 'mushroom': 25,
 'no': 1,
 'noodle': 8,
 'nut': 26,
 'pill': 19,
 'pizza': 27,
 'pot pie': 28,
 'pretzel': 29,
 'pudding': 30,
 'raw beef': 7,
 'salt': 32,
 'soda': 16,
 'soup': 36,
 'sugar': 34,
 'tomato': 35,
 'tuna': 37,
 'water': 3,
 'yes': 0,
 'yogurt': 33}

In [26]:
# DO THIS ONCE, converting the answers into one hot encode and saving the answer IDs!
import pickle

ID = []
for i in range(data.shape[0]):
    ID.append(ans_class[data['answer'][i]])
one_hot_labels = tf.keras.utils.to_categorical(ID)

one_hot_labels

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [27]:
## if not the right size, expand
z = np.zeros((one_hot_labels.shape[0], len(ans_class.keys())), dtype=one_hot_labels.dtype)
z[:, :2] = one_hot_labels
one_hot_labels = z

In [28]:
one_hot_labels.shape
one_hot_labels

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# IF ANS_CLASS ALREADY EXISTS, add the new answers
file = open('new_answers.txt', 'r')

curr_index = len(word_index)

answer_set = set(ans_class)
for line in file:
    if line not in answer_set:
        ans_class.append(line)

In [18]:
## Saving the ANS_CLASS
pickle_out = open("ans_index_only_yes.pickle", "wb")
pickle.dump(ans_index, pickle_out)
pickle_out.close()


NameError: name 'ans_index' is not defined

## Preprocessing the Images

In [29]:
## DO THIS ONCE
### getting all the pickle files and combining them into one training list
train_imgs = []

with open('imgs_0_1000.p', "rb") as input_file:
    train_imgs = pickle.load(input_file)

train_imgs2 = []
with open('imgs_1000_2000.p', "rb") as input_file:
    train_imgs2 = pickle.load(input_file)

train_imgs3 = []
with open('imgs_2000_3000.p', "rb") as input_file:
    train_imgs3 = pickle.load(input_file)

train_imgs4 = []
with open('imgs_3000_4000.p', "rb") as input_file:
    train_imgs4 = pickle.load(input_file)

train_imgs5 = []
with open('imgs_4000_remaining.p', "rb") as input_file:
    train_imgs5 = pickle.load(input_file)

In [30]:
## DO THIS ONCE
all_train_imgs = train_imgs + train_imgs2 + train_imgs3 + train_imgs4 + train_imgs5

all_qs = data['question']

all_imgs = np.asarray(all_train_imgs)

## Test, Train, Validation

In [31]:
### may want to run this multiple times, split the questions and images into training, validation, testing
arr = np.arange(all_imgs.shape[0])
np.random.shuffle(arr)
test_ind = arr[0:arr.shape[0]//5]
val_ind = arr[arr.shape[0]//5: 9* arr.shape[0]//25]
train_ind = arr[9*arr.shape[0]//25:arr.shape[0]]

In [32]:
## change the multiplier of the indices
## append the new questions and answers to the current

# change this value if necessary
k = 0

train_ind2 = train_ind+arr.shape[0]*k
test_ind2 = test_ind + arr.shape[0]*k
val_ind2 = val_ind + arr.shape[0]*k



In [33]:
train_qs = trans_all_qs[train_ind2]
val_qs = trans_all_qs[val_ind2]
test_qs = trans_all_qs[test_ind2]



train_ans = one_hot_labels[train_ind2]
val_ans = one_hot_labels[val_ind2]
test_ans = one_hot_labels[test_ind2]

In [34]:
train_ims = all_imgs[train_ind]
val_ims = all_imgs[val_ind]
test_imgs = all_imgs[test_ind]

In [35]:
## saving the current npy file (first iteration)
np.save("train_qs_6_5.npy", train_qs)
np.save("val_qs_6_5.npy", val_qs)
np.save("test_qs_6_5.npy", test_qs)

np.save("train_ims_6_5.npy", train_ims)
np.save("val_ims_6_5.npy", val_ims)
np.save("test_ims_6_5.npy", test_imgs)

np.save("train_ans_6_5.npy", train_ans)
np.save("val_ans_6_5.npy", val_ans)
np.save("test_ans_6_5.npy", test_ans)

In [15]:
## loading the current arrays
train_qs = np.load("train_qs_4_3.npy")
val_qs = np.load("val_qs_4_3.npy")
test_qs = np.load("test_qs_4.npy")

train_ans = np.load("train_ans_4_3.npy")
val_ans = np.load("val_ans_4_3.npy")
test_ans = np.load("test_ans_4_3.npy")

In [None]:
train_qs = np.concatenate((train_qs, trans_all_qs[train_ind2]), axis = 0)
val_qs = np.concatenate((val_qs, trans_all_qs[val_ind2]), axis = 0)
test_qs = np.concatenate((test_qs, trans_all_qs[test_ind2]), axis = 0)

train_ans = np.concatenate((train_ans, one_hot_labels[train_ind2]), axis = 0)
val_ans = np.concatenate((val_ans, one_hot_labels[val_ind2]), axis = 0)
test_ans = np.concatenate((test_ans, one_hot_labels[test_ind2]), axis = 0)

In [20]:
train_qs.shape

(8949, 50)