# Import libraries

In [1]:
import os

import glob
import pickle

## All the files required for VQG captioning are stored inside `data_preprocessing/`; they are used to create the `VQg_text/` folder

In [2]:
# Root folder that holds every preprocessing input and intermediate file
directory = "data_preprocessing/"

### data_preprocessing/org_vqg_src folder 
    
   -->    flickr_train.pickle
   
   -->    flickr_test.pickle
   
   -->    coco_train.pickle
   
   -->    coco_test.pickle
   
   -->    bing_train.pickle
   
Each pickle file is a dataset: a list of entries of the form ['img_id','img_link','q1---q2---q3'].

In [3]:
# Sub-folder (inside `directory`) containing the original VQG pickle files
org_vqg = "org_vqg_src/"

In [4]:
# First run: create the preprocessing folder and remind the user to fill it.
directory_is_missing = not os.path.exists(directory)
if directory_is_missing:
    os.mkdir(directory)
    print("You need to STORE your pickle version of csv files in data_preprocessing/org_vqg_src/")

In [5]:
# Create the source sub-folder on first run; the pickles must be copied in by hand.
org_vqg_path = directory + org_vqg
if not os.path.exists(org_vqg_path):
    os.mkdir(org_vqg_path)
    print("You need to STORE your pickle version of csv files in data_preprocessing/org_vqg_src/")

In [6]:
# Collect every source pickle. glob returns matches in file-system order, which
# is not guaranteed to be stable; sorting makes the processing order (and thus
# which file wins when an image id appears in several pickles) deterministic.
pickle_files = sorted(glob.glob(directory + org_vqg + "*.pickle"))

#### list_of_imqs = 
 [ 
 
    ['image_id', ['q1','q2','q3'] ],
    ['image_id',['q1','q2','q3'] ] 
    
 ]

For every pickle file:

    1. Load the list stored in the file
    2. Split the question string ('q1---q2---q3') into ['q1','q2','q3']
    3. Append ['img_id', ['q1','q2','q3']] to list_of_imqs

In [7]:
def _clean_question(question):
    """Return `question` without surrounding whitespace and without one trailing "?"."""
    cleaned = question.strip()
    if cleaned.endswith("?"):
        # Drop the final character only when it really is a question mark, so a
        # question stored without one does not lose its last letter.
        cleaned = cleaned[:-1].rstrip()
    return cleaned


# Flatten every source pickle into [img_id, [q1, q2, ...]] pairs.
list_of_imqs = []
for pic_fil in pickle_files:

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(pic_fil, "rb") as fh:
        list_of_imlinkqs = pickle.load(fh)  # [['img_id','img_link','q1---q2---q3'], ...]

    for instance in list_of_imlinkqs:
        img_id = instance[0]
        # The third field holds all questions of the image joined by "---";
        # the "?" is removed from every question to remove bias.
        questions = [_clean_question(ques) for ques in instance[2].split("---")]
        list_of_imqs.append([str(img_id), questions])
            
        
            
        
    

## img_question_dict =

    key     =   val
    img_id  =   ['q1','q2','q3']
    

In [8]:
# img_id (as string) -> list of questions; a later entry replaces an earlier
# one when the same img_id occurs more than once.
img_question_dict = {
    str(image_key): question_list
    for image_key, question_list in list_of_imqs
}

In [9]:
# Persist the img_id -> questions mapping inside the preprocessing folder
dict_pickle_path = directory + "img2questions_dict.pickle"
with open(dict_pickle_path, "wb") as dict_out:
    pickle.dump(img_question_dict, dict_out)

In [10]:
# Output folder for the token file and the train/dev/test image lists
text_directory = "VQg_text/"

In [11]:
# Create the output folder the first time the notebook runs
text_directory_is_missing = not os.path.exists(text_directory)
if text_directory_is_missing:
    os.mkdir(text_directory)
    print("For storing info on img, questions and train,test splits")

In [12]:
# Reload the img_id -> questions mapping written earlier in the notebook
dict_pickle_path = directory + "img2questions_dict.pickle"
with open(dict_pickle_path, "rb") as dict_in:
    img_question_dict = pickle.load(dict_in)

In [13]:
# Folder that contains the downloaded VQG image files
img_directory = "VQg_Dataset/VQG_Dataset/"

### img_files = 

[

'VQg_Dataset/VQG_Dataset\\00051bba-46a4-4aac-876d5c18bb32fc74.jpg',

 'VQg_Dataset/VQG_Dataset\\0043f1ba-1028-4d37-9a7e4f2204978749.jpg',
 
 'VQg_Dataset/VQG_Dataset\\00472679-97c5-449a-9ece4d55370344f4.jpg'

]

In [14]:

# Every file with an extension inside the image folder. glob's order depends on
# the file system, and the seeded shuffle further down permutes whatever order
# it receives, so the list is sorted to make the train/dev/test split
# reproducible across machines.
img_files = sorted(glob.glob(img_directory + "*.*"))


### img_file_names = 

[

'00051bba-46a4-4aac-876d5c18bb32fc74.jpg',

 '0043f1ba-1028-4d37-9a7e4f2204978749.jpg',
 
 '00472679-97c5-449a-9ece4d55370344f4.jpg'

]

In [15]:
# Strip the folder part of every path, keeping only the bare file names
img_file_names = list(map(os.path.basename, img_files))

### img_id_file_map = dict()

    key      =     val

    img_id   =   corresponding image file name
    
    img_id   =   img_id.jpg
    



In [16]:
# Map each image id (the file name without its extension) to the file name.
img_id_file_map = dict()

for img_fil in img_file_names:
    # splitext removes only the final extension, so a name with extra dots
    # (e.g. "a.b.jpg") keeps its full stem instead of being cut at the first dot.
    file_stem = os.path.splitext(img_fil)[0]
    img_id_file_map[str(file_stem)] = img_fil



### VQG.token.txt file has

    img1_.jpg#0 \t question1
    
    img1_.jpg#1 \t question2
    
    img2_.jpg#0 \t question1

In [17]:

# Build the token file content: one "<file>#<idx>\t<question>" line per
# question, for every image id that has a matching image file.
token_lines = []
skipped_img_ids = []

for img, questions in img_question_dict.items():

    # Explicit lookup instead of a bare `except`: only a missing image file is
    # tolerated, any other error now surfaces instead of being swallowed.
    img_address = img_id_file_map.get(img)
    if img_address is None:
        skipped_img_ids.append(img)
        continue

    for indx, question in enumerate(questions):
        token_lines.append(img_address + "#" + str(indx) + "\t" + question)

if skipped_img_ids:
    print("Skipped", len(skipped_img_ids), "image ids without a matching image file")

# Joining once avoids repeated string concatenation and leaves no trailing newline.
text = "\n".join(token_lines)

In [18]:
img_capt_file = "VQG.token.txt"

# Write the token lines held in `text` to the caption file in the output folder
token_path = text_directory + img_capt_file
with open(token_path, "w") as token_out:
    token_out.write(text)

In [19]:
# Number of entries in img_files, i.e. every file matched in the image folder.
training_size = len(img_files)

### VQG.trainImages.txt has

    img1.jpg
    
    img2.jpg
    
    .. All training instances

In [20]:
# Output file names for the three image lists (train / test / dev)
train_file, test_file, val_file = (
    "VQG.trainImages.txt",
    "VQG.testImages.txt",
    "VQG.devImages.txt",
)


In [21]:
# 80 / 10 / 10 split of all image files.
# Truncating each share independently could leave up to two images in no split
# (e.g. 15 files -> 12 + 1 + 1), so the test split takes whatever remains after
# the train and dev shares have been cut.
n_images = len(img_files)
train_size = int(n_images * 0.8)
val_size = int(n_images * 0.1)
test_size = n_images - train_size - val_size


In [22]:
import numpy as np

# Array copy of the file names, so they can be shuffled in place afterwards
img_filnames_arr = np.array(img_file_names, copy=True)

In [23]:
from random import shuffle
import numpy as np

# Seed NumPy's global random generator so the shuffle below is repeatable.
np.random.seed(9)

# Shuffles img_filnames_arr in place. Re-running this cell without first
# rebuilding the array shuffles the already shuffled order again.
np.random.shuffle(img_filnames_arr)

In [24]:
# Training split: the first `train_size` shuffled file names, one per line,
# without a trailing newline.
text = "\n".join(img_filnames_arr[:train_size])

In [25]:
# Store the training image list in the output folder
train_path = text_directory + train_file
with open(train_path, "w") as train_out:
    train_out.write(text)

In [26]:
# Dev split: the `val_size` file names that follow the training slice,
# one per line, without a trailing newline.
val_end = train_size + val_size
text = "\n".join(img_filnames_arr[train_size:val_end])

val_path = text_directory + val_file
with open(val_path, "w") as val_out:
    val_out.write(text)

In [27]:
# Test split: the `test_size` file names that follow the train and dev slices,
# one per line, without a trailing newline.
starting_idx = train_size + val_size
test_end = starting_idx + test_size
text = "\n".join(img_filnames_arr[starting_idx:test_end])

test_path = text_directory + test_file
with open(test_path, "w") as test_out:
    test_out.write(text)