# Prepare Graph data & Q data with answer labels

In [108]:
import json
import collections
import copy

In [141]:
# Input files: scene-graph JSON and question/answer JSON (one record per image in each).
SRC_SG_PATH = "../VG-data/scene_graphs.json"
SRC_QA_FILE = "../VG-data/question_answers.json"

# Output files written by later cells: the answer vocabulary (one token per line)
# and the QA data annotated with a per-question skip flag.
DST_ANSWER_VOCAB_FILE = "./intermediate_files/answer_vocab.txt"
# DST_SGQAS_OF_INTEREST_SG_DATA_FILE = "./intermediate_files/filtered_sg_data.json"
DST_SGQAS_OF_INTEREST_QA_DATA_FILE = "./intermediate_files/filtered_qa_data.json"

In [106]:
# Load both source JSON files. Context managers guarantee the file handles are
# closed as soon as parsing finishes (the bare `open()` form leaked them).
with open(SRC_SG_PATH, 'r') as sg_file:
    sg_data = json.load(sg_file)
print(len(sg_data)) # num-images

with open(SRC_QA_FILE, 'r') as qa_file:
    qa_data = json.load(qa_file)
print(len(qa_data)) # num per-image QA records (== num-images)

108077
108077


In [7]:
# Peek at the top-level keys of the first scene-graph record and the first QA record.
sg_image = sg_data[0]
qa_for_image = qa_data[0]
for first_record in (sg_image, qa_for_image):
    print(first_record.keys())

dict_keys(['relationships', 'image_id', 'objects'])
dict_keys(['id', 'qas'])


# Answer Ground Truth Generation

In [73]:
# Collect the answers that can serve as single-token classification labels.
#
# An answer is kept when, after removing periods, it:
#   * contains no space (single word) and no comma (not a list of answers),
#   * is purely alphanumeric, and
#   * is either a non-integer word (stored lower-cased) or a one-character integer.
# Multi-character integers such as '456' are dropped.

qa_answers = []
for qas in qa_data:
    for qa in qas['qas']:
        answer = qa['answer'].replace(".", "") # preprocessing-1
        if len(answer.split(" ")) > 1: # multi-word answer
            continue
        if len(answer.split(",")) > 1: # comma-separated answers
            continue
        if not answer.isalnum(): # punctuation, symbols, or empty string
            continue

        try:
            int(answer) # '3', '456'
        except ValueError: # 'clock' -- not an integer, keep as a word label
            qa_answers.append(answer.lower())
        else:
            if len(answer) == 1: # '3'
                qa_answers.append(answer)

print("len of answers : ", len(qa_answers))
qa_answers_set = list(set(qa_answers))
print("len of set of answers : ", len(qa_answers_set))

len of answers :  749815
len of set of answers :  12810


In [79]:
# Frequency table of the kept answers; the displayed value is the number of
# distinct frequency values that occur in it.
answer_vocab_dict = collections.Counter()
answer_vocab_dict.update(qa_answers)
len({count for count in answer_vocab_dict.values()})

526

### So we can take all unique single-token answers as the classification class labels

In [81]:
# Persist the answer vocabulary, one token per line, in the counter's key order.
with open(DST_ANSWER_VOCAB_FILE, 'w') as vocab_file:
    vocab_file.writelines("{}\n".format(token) for token in answer_vocab_dict)

# Make Filtered Dataset File

In [145]:
# Annotate every QA pair in `qa_data` (in place) with a `qas_skip` flag:
#   0 -> the answer is a usable single-token label,
#   1 -> the answer is skipped.
# The answer rule is: after removing periods, the answer must contain no space
# and no comma, be purely alphanumeric, and be either a non-integer word or a
# one-character integer. There is no constraint on the scene graph or question.
# Records whose image id does not match the QA id are reported and left unflagged.
n_data = len(sg_data)
sample_cnt = 0
for sample_img, sample_ans in zip(sg_data, qa_data):
    if sample_img['image_id'] != sample_ans['id']:
        print("IDs did not match !")
        continue

    for qa in sample_ans['qas']:
        answer = qa['answer'].replace(".", "") # preprocessing-1
        is_multi_word = len(answer.split(" ")) > 1
        is_multi_seq = len(answer.split(",")) > 1
        if is_multi_word or is_multi_seq or not answer.isalnum():
            qa['qas_skip'] = 1
            continue

        try:
            int(answer) # '3', '456'
        except ValueError: # 'clock' -- a plain word, keep it
            qa['qas_skip'] = 0
        else:
            # keep single-digit integers only ('3'); drop '456'
            qa['qas_skip'] = 0 if len(answer) == 1 else 1

    sample_cnt += 1
    if (sample_cnt % 10000 == 0):
        print("processed samples of : {} / {}".format(sample_cnt, n_data))

processed samples of : 10000 / 108077
processed samples of : 20000 / 108077
processed samples of : 30000 / 108077
processed samples of : 40000 / 108077
processed samples of : 50000 / 108077
processed samples of : 60000 / 108077
processed samples of : 70000 / 108077
processed samples of : 80000 / 108077
processed samples of : 90000 / 108077
processed samples of : 100000 / 108077


In [146]:
# Write the QA data, now carrying the per-question `qas_skip` flags, to disk as JSON.
with open(DST_SGQAS_OF_INTEREST_QA_DATA_FILE, 'w') as qa_out:
    json.dump(qa_data, fp=qa_out)

In [149]:
# Re-bind the first scene-graph record and display its top-level keys.
sg_image = sg_data[0]
sg_image.keys()

dict_keys(['relationships', 'image_id', 'objects'])

In [151]:
# Display the first relationship entry of the first scene graph.
sg_image['relationships'][0]

{'synsets': ['along.r.01'],
 'predicate': 'ON',
 'relationship_id': 15927,
 'object_id': 5046,
 'subject_id': 5045}

In [152]:
# Display the first object entry of the first scene graph.
sg_image['objects'][0]

{'synsets': ['clock.n.01'],
 'h': 339,
 'object_id': 1058498,
 'names': ['clock'],
 'w': 79,
 'attributes': ['green', 'tall'],
 'y': 91,
 'x': 421}

# Save Data Features

In [163]:
import nltk
import spacy
import gensim
import en_core_web_sm
from nltk.data import find
from nltk.corpus import wordnet
import numpy as np

In [161]:
# Resolve the path of the pruned word2vec sample through nltk's data finder.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
# Load the vectors from the text (non-binary) word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)
# WordNet lemmatizer used to normalise object-name tokens before embedding lookup.
lmtzr = nltk.WordNetLemmatizer()

In [197]:
# Destination JSON file for the per-object semantic feature vectors.
DST_SG_FEATURES_DATA_FILE = "./intermediate_files/sg_features.json"

In [211]:
# Build one semantic feature vector per scene-graph object and dump them as JSON.
#
# For each object, its first name is lower-cased, tokenized and lemmatized; the
# word2vec vectors of the tokens found in the model are averaged into a single
# vector. When no token is in the model, a random normal vector is used instead.
feature_data = []
n_img_data = len(sg_data)
emb_dim = model.vector_size  # embedding width taken from the loaded model
for img_idx, sample_img in enumerate(sg_data):
    img_id = sample_img['image_id']

    obj_features_list = []
    for obj in sample_img['objects']:
        obj_name = obj['names'][0]
        obj_name_seq = nltk.word_tokenize(obj_name.lower())
        obj_name_token_lemma = [lmtzr.lemmatize(token) for token in obj_name_seq]
        emb_matrix = []
        for token in obj_name_token_lemma:
            try:
                emb_matrix.append(model[token])
            except KeyError:  # token not in the word2vec vocabulary
                continue
        if not emb_matrix:
            # Fallback must be a (1, emb_dim) matrix: a flat (emb_dim,) vector
            # would be collapsed to a single scalar by the mean over axis 0 below.
            emb_matrix = np.random.normal(0, 1, size=(1, emb_dim))
        emb_matrix = np.asarray(emb_matrix, np.float32)
        semantic_feature = np.mean(emb_matrix, axis=0) # feature, shape (emb_dim,)

        obj_features_list.append({"object_id": obj['object_id'], "feature": semantic_feature.tolist()})

    feature_data.append({"image_id": img_id, "objects": obj_features_list})

    if img_idx % 100 == 0: print("Finished extracting features : {}/{}".format(img_idx + 1, n_img_data))
    # NOTE: processing stops after the first 1001 images.
    if img_idx == 1000: break


with open(DST_SG_FEATURES_DATA_FILE, 'w') as write_file:
    json.dump(feature_data, write_file)

Finished extracting features : 1/108077
Finished extracting features : 101/108077
Finished extracting features : 201/108077
Finished extracting features : 301/108077
Finished extracting features : 401/108077
Finished extracting features : 501/108077
Finished extracting features : 601/108077
Finished extracting features : 701/108077
Finished extracting features : 801/108077
Finished extracting features : 901/108077
Finished extracting features : 1001/108077


## Verifying dumped json

In [208]:

# Sanity check: for the first image whose scene-graph id matches its QA id, find the
# first object whose id matches its extracted feature entry and print the mean of
# that feature vector. Mismatches are reported and skipped.
for img_entry, feat_entry, qa_entry in zip(sg_data, feature_data, qa_data):
    if img_entry['image_id'] == qa_entry['id']:
        for sg_obj, dumped in zip(img_entry['objects'], feat_entry['objects']):
            if sg_obj['object_id'] == dumped['object_id']:
                feat_vector = np.asarray(dumped['feature'], np.float32)
                print(np.mean(feat_vector))
                break
            print("IDs did not match !")
        break
    print("IDs did not match !")

-0.004914313


# Faster Feature Dump

In [237]:
# Folder that receives one HDF5 feature file per image.
DST_SG_FEATURES_DATA_FOLDER = "./intermediate_files/sg_features/"

import os
import h5py

# Create the folder (and any missing parents). `exist_ok=True` makes the cell
# safe to re-run and avoids the check-then-create race of a separate exists() test.
os.makedirs(DST_SG_FEATURES_DATA_FOLDER, exist_ok=True)

In [239]:

# Write one HDF5 file per image, named "<image_id>.h5", holding one dataset per
# object keyed by the object's id (as a string).
#
# Each dataset is the mean word2vec vector of the lemmatized tokens of the
# object's first name; a random normal vector is used when no token is in the model.
n_img_data = len(sg_data)
emb_dim = model.vector_size  # embedding width taken from the loaded model
for img_idx, sample_img in enumerate(sg_data):
    img_id = sample_img['image_id']

    save_feature_file = os.path.join(DST_SG_FEATURES_DATA_FOLDER, "{}.h5".format(img_id))
    with h5py.File(save_feature_file, 'w') as hf:

        for obj in sample_img['objects']:
            obj_name = obj['names'][0]
            obj_name_seq = nltk.word_tokenize(obj_name.lower())
            obj_name_token_lemma = [lmtzr.lemmatize(token) for token in obj_name_seq]
            emb_matrix = []
            for token in obj_name_token_lemma:
                try:
                    emb_matrix.append(model[token])
                except KeyError:  # token not in the word2vec vocabulary
                    continue
            if not emb_matrix:
                # Fallback must be a (1, emb_dim) matrix: a flat (emb_dim,) vector
                # would be collapsed to a single scalar by the mean over axis 0 below.
                emb_matrix = np.random.normal(0, 1, size=(1, emb_dim))
            emb_matrix = np.asarray(emb_matrix, np.float32)
            semantic_feature = np.mean(emb_matrix, axis=0) # feature, shape (emb_dim,)

            hf.create_dataset(str(obj['object_id']), data=semantic_feature)

    if img_idx % 100 == 0: print("Finished extracting features : {}/{}".format(img_idx + 1, n_img_data))
    # NOTE: processing stops after the first 1001 images.
    if img_idx == 1000: break


Finished extracting features : 1/108077
Finished extracting features : 101/108077
Finished extracting features : 201/108077
Finished extracting features : 301/108077
Finished extracting features : 401/108077
Finished extracting features : 501/108077
Finished extracting features : 601/108077
Finished extracting features : 701/108077
Finished extracting features : 801/108077
Finished extracting features : 901/108077
Finished extracting features : 1001/108077


## Verify loading dumped features

In [246]:
# Sanity check on the HDF5 dump: open the file of the first image, list its dataset
# keys, then print the first object's name and the shape of its stored feature.
n_img_data = len(sg_data)
for img_idx, sample_img in enumerate(sg_data):
    feature_filename = os.path.join(
        DST_SG_FEATURES_DATA_FOLDER, "{}.h5".format(sample_img['image_id'])
    )
    with h5py.File(feature_filename, 'r') as hf:
        print(hf.keys())

        obj_features_list = []
        for obj in sample_img['objects']:
            print(obj['names'][0])
            stored = hf.get(str(obj['object_id']))
            feature_vec = np.array(stored)
            print(feature_vec.shape)
            break  # only the first object is inspected

    break  # only the first image is inspected
    

<KeysViewHDF5 ['1058498', '1058507', '1058508', '1058511', '1058515', '1058518', '1058519', '1058525', '1058528', '1058529', '1058530', '1058531', '1058532', '1058534', '1058535', '1058536', '1058539', '1058540', '1058541', '1058542', '1058543', '1058544', '1058545', '1058546', '1058547', '1058548', '1058549', '3798575', '3798576', '3798577', '3798578', '3798579', '5045', '5046', '5048', '5049', '5050', '5051', '5055', '5060']>
clock
(300,)
