### Data Collection and Pre-processing

In [1]:
# Installs without warnings in conda env: dsi
#!pip install visual_genome

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import json
from zipfile import ZipFile
# Import OpenCV(cv2 module)
import cv2
# Import Python's standard utility module 'os' for interacting with Operating System
import os
# Import the Python wrapper for the VG API
from visual_genome import api

#### Run cell only once to retrieve all image ID's from Visual Genome API and then store as a txt file for future reference

In [4]:
# Retrieve an array of integers representing all VG image ID's from API
image_ids = api.get_all_image_ids()

# Save array as txt file to bypass API next time
with open('../data/image_ids.txt', 'w') as imagehandles:
    imagehandles.writelines("%s\n" % ids for ids in image_ids)

In [2]:
# Load Image ID's from text file
with open('../data/image_ids.txt', 'r') as imagehandles:
    # Store Image ID's as variable for image pre-processing, removing the endline character in the process
    image_ids = [image_id[:-1] for image_id in imagehandles]
# Check that all is copacetic
len(image_ids), type(image_ids), image_ids[0], type(image_ids[0])

(108077, list, '1', str)

The processing method provided by the Visual Genome API, using relation mappings, is too slow to be effective in obtaining the object descriptions.

In [27]:
# Read objects.json file from Visual Genome, to retrieve the object data for each image
with open('../../objects.json', 'r') as objects_json_file:
    # Store a list of len(json_objects) = 108_077 dictionaries representing image object data
    json_objects = json.load(objects_json_file)
print(f'No. of Images in dataset: {len(json_objects):,} stored in "json_objects", a {type(json_objects)} of {type(json_objects[0])}s')

No. of Images in dataset: 108,077 stored in "json_objects", a <class 'list'> of <class 'dict'>s


In [9]:
# loop thru each item in list
dog_pic_ids = []
for image_index in range(len(json_objects)):
    # Number of objects
    # len(json_objects[image_index]) = 3
    # json_objects[image_index].keys() = dict_keys(['image_id', 'objects', 'image_url'])
    num_objects = len(json_objects[image_index]['objects'])
    # loop through objects_index searching for dog objects
    objects_index = 0
    # Objects_index lives in range(num_objects)
    while objects_index < num_objects:
        # Save image_id's of images with dogs
        if json_objects[image_index]['objects'][objects_index][ 'names' ] == [  'dog'  ]:
      # if json_objects[image_index]['objects'][objects_index]['synsets'] == ['dog.n.01']:
    #####NOTICE: INSERT HOT DOG FILTER HERE LOL!
            # Save image_id's of images with dogs
            image_id = json_objects[image_index]['image_id']
            dog_pic_ids.append(image_id)
            # Escape 'while-loop' as soon as 'dog'-object found
            objects_index = num_objects
        else:
            # Move on to next object in image, continue searching
            objects_index += 1
# Check how many dog pics were discovered in the dataset
print(f'Total number of dog images in VG dataset: {len(dog_pic_ids)}')
# Save dog_pic_ids as txt file to bypass loading object data from json next time
with open('../data/dog_pic_ids.txt', 'w') as imagehandles:
    imagehandles.writelines("%s\n" % ids for ids in dog_pic_ids)

Total number of dog images in VG dataset: 3235


#### Identify and store dog pic image ID's specific to VG Part 2 Dataset

In [10]:
# NOTICE CODE FROM
# https://thispointer.com/python-how-to-get-the-list-of-all-files-in-a-zip-archive/

# Prepare to store dog pic image ID's in VG Part 2 Dataset
dog_data_part2 = []

# ~5.5 GB ZIP Archive, 40% of total data set
with ZipFile('../../visual_genome_part2.zip', "r") as z:
    # One file in zip archive
    VG_100K_2 = z.namelist()

    # Iterate over image file names, 'VG_100K_2/image_id.jpg'
    for ith_image in VG_100K_2: 
        # Get extension of file, '.jpg'
        ext = os.path.splitext(ith_image)[-1]
        # Get root of file, root = VG_100K_2/image_id.jpg
        root = os.path.splitext(ith_image)[0]
        
        # Skip over Archive Directory
        if (ext == ".jpg"):
            # Skip root[:10]='VG_100K_2' in dog_pic_ids
            if int(root[10:]) in dog_pic_ids:
                dog_data_part2.append(int(root[10:]))
                
# Display percent of total dog images that live in VG Part 2 dataset
print(f'Number of Dog Pics in VG Part 2 Dataset: {len(dog_data_part2)}')

# Save dog_data_part2 as txt file to load in model tuning notebook
with open('../data/dog_data_part2.txt', 'w') as imagehandles:
    imagehandles.writelines("%s\n" % ids for ids in dog_data_part2)

Number of Dog Pics in VG Part 2 Dataset: 1240


In [11]:
dog_pics_objs_dict2 = []
for objects in range(len(json_objects)):
    for dog in dog_data_part2:
        if json_objects[objects]['image_id'] == dog:
            dog_pics_objs_dict2.append(json_objects[objects])

In [14]:
dogject_ids = []
for image_idx in range(len(dog_pics_objs_dict2)):
    for object_idx in range(len(dog_pics_objs_dict2[image_idx]['objects'])):
        if dog_pics_objs_dict2[image_idx]['objects'][object_idx]['synsets'] == ['dog.n.01']:
            dogject_ids.append(dog_pics_objs_dict2[image_idx]['objects'][object_idx]['object_id'])
print(len(dogject_ids))
print(len(set(dogject_ids)))

1419
1419


In [15]:
dogject_ids.sort()

#### Identify and store dog pic image ID's specific to VG Part 1 Dataset

In [16]:
dog_data_part1 = []
for image_id in dog_pic_ids:
    if image_id not in dog_data_part2:
        dog_data_part1.append(image_id)
print(f'Number of Dog Pics in VG Part 1 Dataset: {len(dog_data_part1)}')

# Save dog_data_part2 as txt file to load in model tuning notebook
with open('../data/dog_data_part1.txt', 'w') as imagehandles:
    imagehandles.writelines("%s\n" % ids for ids in dog_data_part1)

Number of Dog Pics in VG Part 1 Dataset: 1995


In [17]:
# Check that all  is copacetic
if len(dog_data_part1) + len(dog_data_part2) == len(dog_pic_ids):
    print('Dog pics in each VG dataset identified')
else:
    print('Something is wrong')

Dog pics in each VG dataset identified


In [18]:
# Display percent of total dog images that live in VG Part 1 dataset
print(f'Percent of Dog Pics in VG Part 1 Dataset: {100*len(dog_data_part1)/len(dog_pic_ids):.2f}%')

# Display percent of total dog images that live in VG Part 2 dataset
print(f'Percent of Dog Pics in VG Part 2 Dataset: {100*len(dog_data_part2)/len(dog_pic_ids):.2f}%')

Percent of Dog Pics in VG Part 1 Dataset: 61.67%
Percent of Dog Pics in VG Part 2 Dataset: 38.33%


#### Display ****ALL**** Dog Images in Visual Genome Part 2 Dataset:

In [13]:
# NOTICE CODE FROM
# https://thispointer.com/python-how-to-get-the-list-of-all-files-in-a-zip-archive/

# ~5.5 GB ZIP Archive, 40% of total data set
with ZipFile('../../visual_genome_part2.zip', "r") as z:
    # One file in zip archive
    VG_100K_2 = z.namelist()

    # Iterate over image file names, 'VG_100K_2/image_id.jpg'
    for ith_image in VG_100K_2: 
        # Get extension of file, '.jpg'
        ext = os.path.splitext(ith_image)[-1]
        # Get root of file, root = VG_100K_2/image_id.jpg
        root = os.path.splitext(ith_image)[0]
        
        # Skip over Archive Directory
        if (ext == ".jpg"):
            # Skip root[:10]='VG_100K_2' in dog_pic_ids
            if int(root[10:]) in dog_pic_ids:
                # Read image binary data of 'VG_100K_2' from zip archive('visual_genome_part2.zip')
                in_bytes = z.read(ith_image) # VG_100K_2/
                # Decode bytes to image
                img = cv2.imdecode(np.frombuffer(in_bytes, np.uint8), cv2.IMREAD_COLOR)
    
                # Output img with window name as 'image' 
################# WARNING WARNING WARNING: Running this cell with the following line uncommented will require force kernel restart
################# Unless you wait for all images (1240 images * 1 sec hold > 20 minutes!)
                #cv2.imshow('img', img)
                # Display for 1sec = 1_000ms
                #cv2.waitKey(1000)

cv2.destroyAllWindows()

In [12]:
# Hot Dog filter clearly needed after browsing all pphotos, maybe 6-10 images are actually just hot dogs
#for dog in dog_pic_ids:
    # Consider what 

In [30]:
# Path to Training data
BASEPATH1 = '../../visual_genome_part1/VG_100K_1/'
# Path to Validation data
BASEPATH2 = '../../visual_genome_part2/VG_100K_2/'

# Images in Train set: Part 1 
LABELS1 = set() # {set of integers of image_ids }
paths1 = [] # [List of tuples ('string of full image path + file name', 'image_id')...(64_344 out of 108_077 tuples/images) ]

# Images in Validation set: Part 2 {set of integers of image_ids }
LABELS2 = set() # {set of integers of image_ids }
paths2 = [] # [List of tuples ('string of full image path + file name', 'image_id')...(43_733 out of 108_077 tuples/images) ]

# For the Training Data
print('Output from first 03 iterations thru training dataset for paths1:\n')
check = 0

for d in os.listdir(BASEPATH1):
    LABELS1.add(int(d[:-4]))
    if d[:-4] not in dog_data_part1:
        paths1.append((BASEPATH1+d, d[:-4], 0))
    elif d[:-4] in dog_data_part1:
        paths1.append((BASEPATH1+d, d[:-4], 1))   
    if check < 3:
        print('d =', d)
        print('Training LABELS: ', LABELS1)
        print('Training paths: ', paths1)
        print()
        check += 1

Output from first three iterations:

d = 10.jpg
Training LABELS:  {10}
Training paths:  [('../../visual_genome_part1/VG_100K_1/10.jpg', '10', 0)]

d = 107899.jpg
Training LABELS:  {10, 107899}
Training paths:  [('../../visual_genome_part1/VG_100K_1/10.jpg', '10', 0), ('../../visual_genome_part1/VG_100K_1/107899.jpg', '107899', 0)]

d = 107900.jpg
Training LABELS:  {10, 107899, 107900}
Training paths:  [('../../visual_genome_part1/VG_100K_1/10.jpg', '10', 0), ('../../visual_genome_part1/VG_100K_1/107899.jpg', '107899', 0), ('../../visual_genome_part1/VG_100K_1/107900.jpg', '107900', 0)]



In [33]:
# For the Validation Data
print('Output from first 03 iterations thru validation dataset for paths2:\n')
check = 0

for d in os.listdir(BASEPATH2):
    LABELS2.add(int(d[:-4]))
    if d[:-4] not in dog_data_part2:
        paths2.append((BASEPATH2+d, d[:-4], 0))
    elif d[:-4] in dog_data_part2:
        paths2.append((BASEPATH2+d, d[:-4], 1))   
    if check < 3:
        print('d =', d)
        print('Validation LABELS: ', LABELS2)
        print('Validation paths: ', paths2)
        print()
        check += 1

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [34]:
# Check that all is copacetic with the Training set labeling above
count_dog_p1 = 0
count_dogless_p1 = 0
print('Output from first three iterations in training:\n')
check = 0
for path, image_id, tag_label in paths1: # dtypes = ('str', 'str', int)
    if tag_label == 1:
        count_dog_p1 += 1
    elif tag_label == 0:
        count_dogless_p1 += 1
    if check < 1:
        print('path: ', path)
        print('image_id: ', image_id)
        print('tag_label: ', tag_label)
        print()
        check += 1
print(f'Properly Labeled Dog Pics (out of 1995): {count_dog_p1}\nProperly Labeled Dogless Pics (out of 62,179):  {count_dogless_p1}')

Output from first three iterations in training:

path:  ../../visual_genome_part1/VG_100K_1/10.jpg
image_id:  10
tag_label:  0

Properly Labeled Dog Pics (out of 1995): 0
Properly Labeled Dogless Pics (out of 62,179):  64346


#### Function to prepare data for input to VGG-16 Network:

In [35]:
# resizing and converting to RGB
def load_and_preprocess_image(path):
    image = cv2.imread(path)
    image = cv2.resize(image, (224,224))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return image

#### The code in the cell below will first check for an existing X_valid array of processed data stored in .txt files
> * If the try results in a FinalNotFoundError, X-images and y-labels arrays will be created

In [36]:
%%time

# Load target labels for training data set
try:
    with open('../../assets/y_training_labels.txt', 'r') as trainlabels:
        # Store training images as variable for loading into model for scoring
        y_train_loaded = np.array([train_image_label for train_image_label in trainlabels])

    # Load feature matrix of training images from text file
    with open('../../assets/X_training_image_data.txt', 'r') as trainimages:
        # Store training images as variable for loading into model for scoring
        X_train_loaded = np.array([processed_image for processed_image in trainimages])

    # Check that all is copacetic
    len(X_train), type(dog_data_part1), dog_data_part1[0], type(dog_data_part1[0])
    # No. of target labels/tags equal to one
    print('Final No. of Dog labels in training set: ', count_dogs) 
    # Length of list of images
    print('Final Length X_train: ', len(X_train)) 
# Just in case the data can be loaded from text, we want to know!
except FileNotFoundError:
    print('HEADS UP: Could not load taining images/labels from .txt files...\n\
                \t...hold on for ~10-15 mins while X and y are prepared')
    print()
    # Build a feature matrix 
    X_train = []
    y_train = []
    check = 0
    count_dogs = 0
    # Recall that paths1 and paths2 have 3-tuple info for each image in dataset
    for  path, image_id, tag_label in paths1:  # dtypes = ('str', 'str', int)
        #for image_path in os.listdir(path):
        image = load_and_preprocess_image(path)
         # X[image_index] image corresponds to image_id = paths1[image_index][1]
        X_train.append(image)
        # y[image_index] label corresponds to image_id = paths1[image_index][1]
        y_train.append(tag_label) 
        # Just a simple counter of the number of ones to cofirm with known number of dog images
        if tag_label == 1:
            count_dogs += 1
        check += 1
        # Only display load status a few times throughout the entire pre-process
        if check % 10000 == 0:
            print('path: ', path)
            print('image_id: ', image_id)
            print('No. of Dog labels so far: ', count_dogs) # No. of target labels/tags equal to one
            print('Length X_train: ', len(X_train)) # Length of list of images
            print()
    # Display final outcome of pre-processing
    print('Total No. of Dog labels: ', count_dogs) # No. of target labels/tags equal to one
    print('Final Length X_train: ', len(X_train)) # Length of list of images
    # Save X_train and y_train arrays as txt files in assets folder (just outside of Github repo) to load in Notebook 3
    with open('../../assets/X_training_image_data.txt', 'w') as trainimages:
        trainimages.writelines("%s\n" % img for img in X_train)
    with open('../../assets/y_training_labels.txt', 'w') as trainimages:
        trainimages.writelines("%s\n" % img for img in y_train)

HEADS UP: Could not load taining images/labels from .txt files...
                	...hold on for ~10-15 mins while X and y are prepared

path:  ../../visual_genome_part1/VG_100K_1/2322741.jpg
image_id:  2322741
No. of Dog labels so far:  0
Length X_train:  10000

path:  ../../visual_genome_part1/VG_100K_1/2333207.jpg
image_id:  2333207
No. of Dog labels so far:  0
Length X_train:  20000

path:  ../../visual_genome_part1/VG_100K_1/2343671.jpg
image_id:  2343671
No. of Dog labels so far:  0
Length X_train:  30000

path:  ../../visual_genome_part1/VG_100K_1/2354151.jpg
image_id:  2354151
No. of Dog labels so far:  0
Length X_train:  40000

path:  ../../visual_genome_part1/VG_100K_1/2364610.jpg
image_id:  2364610
No. of Dog labels so far:  0
Length X_train:  50000

path:  ../../visual_genome_part1/VG_100K_1/2375066.jpg
image_id:  2375066
No. of Dog labels so far:  0
Length X_train:  60000

Total No. of Dog labels:  0
Final Length X_train:  64346
Wall time: 24min 31s


In [None]:
# Check that all is copacetic with train set
print('X_train:')
print(f'\tLength: {len(X_train)}\n\
        DType: {type(X_train)}\n\
        Element Shapes: {X_train[0].shape}\n\
        Element DTypes: {type(X_train[0])}\n')

# Show pic
plt.figure(figsize = (8, 8)); 
plt.imshow(X_train[2766], aspect='auto');

#### How to find the correct image(the ith element of X_valid) associated with the known Image ID:

In [None]:
# 43,733 images with IDs beyond range(43_733) --> image_data = X_valid[i]
image_id = 2378691
for i in range(len(X_valid)):
    # 2nd element in each paths1 or paths2 tuple is an 'image_id'
    # X & y created from paths1 or paths2 --> image_id = int(paths2[i][1])image_id
    if int(paths2[i][1]) == image_id:
        print(f'Image Index: {i}')

In [None]:
# How to find the correct image associated with the known image index in X_valid or y_valid
image_index = 2761
print(f'Image ID: {int(paths2[image_index][1])}')