### Data Collection and Pre-processing

In [1]:
# Installs without warnings in conda env: dsi
#!pip install visual_genome

Collecting visual_genome
  Using cached visual_genome-1.1.1-py2.py3-none-any.whl (9.6 kB)
Collecting progressbar2
  Using cached progressbar2-3.53.1-py2.py3-none-any.whl (25 kB)
Collecting python-utils>=2.3.0
  Using cached python_utils-2.4.0-py2.py3-none-any.whl (12 kB)
Installing collected packages: python-utils, progressbar2, visual-genome
Successfully installed progressbar2-3.53.1 python-utils-2.4.0 visual-genome-1.1.1


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import json
from zipfile import ZipFile
# Import OpenCV(cv2 module)
import cv2
# Import Python's standard utility module 'os' for interacting with Operating System
import os
# Import the Python wrapper for the VG API
from visual_genome import api

#### Run cell only once to retrieve all image ID's from Visual Genome API and then store as a txt file for future reference

In [2]:
# Retrieve an array of integers representing all VG image ID's from API
image_ids = api.get_all_image_ids()

# Save array as txt file to bypass API next time
with open('../data/image_ids.txt', 'w') as imagehandles:
    imagehandles.writelines("%s\n" % ids for ids in image_ids)

In [8]:
# Load Image ID's from text file
with open('../data/image_ids.txt', 'r') as imagehandles:
    # Store Image ID's as variable for image pre-processing, removing the endline character in the process
    image_ids = [image_id[:-1] for image_id in imagehandles]
# Check that all is copacetic
len(image_ids), type(image_ids), image_ids[0], type(image_ids[0])

(108077, list, '4001', str)

The processing method provided by the Visual Genome API, using relation mappings, is too slow to be effective in obtaining the object descriptions.

In [2]:
# Read objects.json file from Visual Genome, to retrieve the object data for each image
with open('../../objects.json', 'r') as objects_json_file:
    # Store a list of len(json_objects) = 108_077 dictionaries representing image object data
    json_objects = json.load(objects_json_file)

In [3]:
# loop thru each item in list
dog_pic_ids = []
for image_index in range(len(json_objects)):
    # Number of objects
    # len(json_objects[image_index]) = 3
    # json_objects[image_index].keys() = dict_keys(['image_id', 'objects', 'image_url'])
    num_objects = len(json_objects[image_index]['objects'])
    # loop through objects_index searching for dog objects
    objects_index = 0
    # Objects_index lives in range(num_objects)
    while objects_index < num_objects:
        # Save image_id's of images with dogs
        if json_objects[image_index]['objects'][objects_index][ 'names' ] == [  'dog'  ]:
      # if json_objects[image_index]['objects'][objects_index]['synsets'] == ['dog.n.01']:
    #####NOTICE: INSERT HOT DOG FILTER HERE LOL!
            # Save image_id's of images with dogs
            image_id = json_objects[image_index]['image_id']
            dog_pic_ids.append(image_id)
            # Escape 'while-loop' as soon as 'dog'-object found
            objects_index = num_objects
        else:
            # Move on to next object in image, continue searching
            objects_index += 1
# Check how many dog pics were discovered in the dataset
print(f'Total number of dog images in VG dataset: {len(dog_pic_ids)}')
# Save dog_pic_ids as txt file to bypass loading object data from json next time
with open('../data/dog_pic_ids.txt', 'w') as imagehandles:
    imagehandles.writelines("%s\n" % ids for ids in dog_pic_ids)

Total number of dog images in VG dataset: 3235


#### Display Dog Images in VG Part 2 Dataset:

In [4]:
# NOTICE CODE FROM
# https://thispointer.com/python-how-to-get-the-list-of-all-files-in-a-zip-archive/

# ~5.5 GB ZIP Archive, 40% of total data set
with ZipFile('../../visual_genome_part2.zip', "r") as z:
    # One file in zip archive
    VG_100K_2 = z.namelist()

    # Iterate over image file names, 'VG_100K_2/image_id.jpg'
    for ith_image in VG_100K_2: 
        # Get extension of file, '.jpg'
        ext = os.path.splitext(ith_image)[-1]
        # Get root of file, root = VG_100K_2/image_id.jpg
        root = os.path.splitext(ith_image)[0]
        
        # Skip over Archive Directory
        if (ext == ".jpg"):
            # Skip root[:10]='VG_100K_2' in dog_pic_ids
            if int(root[10:]) in dog_pic_ids:
                # Read image binary data of 'VG_100K_2' from zip archive('visual_genome_part2.zip')
                in_bytes = z.read(ith_image) # VG_100K_2/
                # Decode bytes to image
                img = cv2.imdecode(np.frombuffer(in_bytes, np.uint8), cv2.IMREAD_COLOR)
    
                # Output img with window name as 'image' 
                cv2.imshow('img', img)
                # Display for 1sec = 1_000ms
                cv2.waitKey(1000)

cv2.destroyAllWindows()

#### Identify and store dog pic image ID's specific to VG Part 2 Dataset

In [5]:
# NOTICE CODE FROM
# https://thispointer.com/python-how-to-get-the-list-of-all-files-in-a-zip-archive/

# Prepare to store dog pic image ID's in VG Part 2 Dataset
dog_data_part2 = []

# ~5.5 GB ZIP Archive, 40% of total data set
with ZipFile('../../visual_genome_part2.zip', "r") as z:
    # One file in zip archive
    VG_100K_2 = z.namelist()

    # Iterate over image file names, 'VG_100K_2/image_id.jpg'
    for ith_image in VG_100K_2: 
        # Get extension of file, '.jpg'
        ext = os.path.splitext(ith_image)[-1]
        # Get root of file, root = VG_100K_2/image_id.jpg
        root = os.path.splitext(ith_image)[0]
        
        # Skip over Archive Directory
        if (ext == ".jpg"):
            # Skip root[:10]='VG_100K_2' in dog_pic_ids
            if int(root[10:]) in dog_pic_ids:
                dog_data_part2.append(int(root[10:]))
                
# Display percent of total dog images that live in VG Part 2 dataset
print(f'Number of Dog Pics in VG Part 2 Dataset: {len(dog_data_part2)}')

# Save dog_data_part2 as txt file to load in model tuning notebook
with open('../data/dog_data_part2.txt', 'w') as imagehandles:
    imagehandles.writelines("%s\n" % ids for ids in dog_data_part2)

Number of Dog Pics in VG Part 2 Dataset: 1240


#### Identify and store dog pic image ID's specific to VG Part 1 Dataset

In [6]:
dog_data_part1 = []
for image_id in dog_pic_ids:
    if image_id not in dog_data_part2:
        dog_data_part1.append(image_id)
print(f'Number of Dog Pics in VG Part 1 Dataset: {len(dog_data_part1)}')

# Save dog_data_part2 as txt file to load in model tuning notebook
with open('../data/dog_data_part1.txt', 'w') as imagehandles:
    imagehandles.writelines("%s\n" % ids for ids in dog_data_part1)

Number of Dog Pics in VG Part 1 Dataset: 1995


In [7]:
# Check that all  is copacetic
if len(dog_data_part1) + len(dog_data_part2) == len(dog_pic_ids):
    print('Dog pics in each VG dataset identified')
else:
    print('Something is wrong')

Dog pics in each VG dataset identified


In [8]:
# Display percent of total dog images that live in VG Part 1 dataset
print(f'Percent of Dog Pics in VG Part 1 Dataset: {100*len(dog_data_part1)/len(dog_pic_ids):.2f}%')

# Display percent of total dog images that live in VG Part 2 dataset
print(f'Percent of Dog Pics in VG Part 2 Dataset: {100*len(dog_data_part2)/len(dog_pic_ids):.2f}%')

Percent of Dog Pics in VG Part 1 Dataset: 61.67%
Percent of Dog Pics in VG Part 2 Dataset: 38.33%


In [9]:
with ZipFile('../../visual_genome_part2.zip', 'r') as z:
    for filename in z.namelist():
        if not os.path.isdir(filename):
            print(filename)
            # read the file
            with z.open(filename) as f:
                print(filename)
                #imgdata = f.read('617.jpg')
                # Save image in set directory 
                # Load RGB image 
                img = cv2.imread('617.jpg')
               
                # Output img with window name as 'image' 
                cv2.imshow('image', img)  

                # Maintain output window utill 
                # user presses a key 
                cv2.waitKey(0)         

                # Destroying present windows on screen 
                cv2.destroyAllWindows()

VG_100K_2/
VG_100K_2/


error: OpenCV(4.4.0) C:\Users\appveyor\AppData\Local\Temp\1\pip-req-build-wwma2wne\opencv\modules\highgui\src\window.cpp:376: error: (-215:Assertion failed) size.width>0 && size.height>0 in function 'cv::imshow'


In [None]:
with zipfile.ZipFile('../../visual_genome_part2.zip', 'r') as zfile:
    data = f.read('1.jpg')

img = cv2.imdecode(np.frombuffer(data, np.uint8), 1)    

In [None]:
import os
import zipfile

# importing matplotlib modules 
import matplotlib.image as mpimg

with zipfile.ZipFile('../../visual_genome_part2.zip', 'r') as z2:
    for filename in z2.namelist():
        if not os.path.isdir(filename):
            # read the file
            with z2.open(filename) as f:
                #imgdata = f.read('617.jpg')
                # Save image in set directory 
                # Read RGB image 
                img = mpimg.imread('617.jpg')
                # Output img with window name as 'image' 
                plt.imshow('image', img)

In [None]:
type(img)

In [None]:
# Output img with window name as 'image' 
cv2.imshow('image', img)  
  
# Maintain output window utill 
# user presses a key 
cv2.waitKey(0)         
  
# Destroying present windows on screen 
cv2.destroyAllWindows()

In [None]:
json_objects[617]
dog_pic_ids[0]

In [None]:
# Hot Dog filter
#for dog in dog_pic_ids:
    # Consider what 