# Import Images

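Batch script that imports images from a directory tree (one sub-folder per class), packs the pixel data and the matching one-hot class labels into arrays, and saves them to an HDF5 file for data analysis.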

In [23]:
import h5py
from PIL import Image
import numpy as np
import os
import glob

### Development REPL

In [24]:
# Grab the first entry under Images/ so there is a sample taxonomy to experiment with.
taxonomies = glob.glob("Images/*")
for i, taxonomy in enumerate(taxonomies):
    # Only the first element is needed; stop right after binding it.
    break
taxonomy

'Images/NotBird'

In [25]:
# Bind `image` to the first path glob returns inside the sample taxonomy.
sample_pattern = os.path.join(taxonomy, "*")
for image in glob.glob(sample_pattern):
    break

In [26]:
# Show the sample file path bound by the loop in the previous cell.
image

'Images/NotBird/NotBird_19997.jpg'

In [27]:
# Scratch accumulators for the experiment: pixel arrays and their labels.
image_list = []
classification_list = []

In [28]:
# Open the sample image path with PIL; `image_` is the resulting PIL image object.
image_ = Image.open(image)

In [29]:
# Convert the PIL image to a NumPy array and store it in the scratch list.
image_list.append(np.asarray(image_))

In [30]:
# Display the sample image's pixel array to inspect its layout and value range.
np.asarray(image_)

array([[[244, 242, 243],
        [229, 227, 228],
        [240, 238, 239],
        ..., 
        [254, 252, 253],
        [254, 252, 255],
        [250, 248, 251]],

       [[255, 254, 255],
        [248, 246, 247],
        [254, 252, 253],
        ..., 
        [253, 251, 252],
        [255, 254, 255],
        [255, 254, 255]],

       [[251, 250, 248],
        [245, 244, 242],
        [255, 255, 253],
        ..., 
        [250, 250, 248],
        [255, 255, 255],
        [255, 255, 255]],

       ..., 
       [[ 39,  52,  68],
        [ 41,  54,  70],
        [ 43,  55,  69],
        ..., 
        [106, 105, 110],
        [102,  99, 106],
        [ 94,  93,  99]],

       [[ 44,  58,  71],
        [ 46,  60,  73],
        [ 48,  60,  74],
        ..., 
        [100, 100, 108],
        [ 95,  93, 104],
        [ 86,  86,  96]],

       [[ 49,  63,  76],
        [ 51,  65,  78],
        [ 54,  66,  80],
        ..., 
        [101, 101, 109],
        [ 89,  89, 101],
        [ 78,  78,

## Batch Import

In [12]:
# Accumulators: one pixel array and one one-hot label per imported image.
image_list = list()
classification_list = list()

# glob returns entries in arbitrary, filesystem-dependent order. Sorting makes
# the class -> one-hot index mapping reproducible between runs and machines.
# Only directories are kept so that a stray file directly under Images/
# does not turn into an extra, empty class.
taxonomies = sorted(
    path for path in glob.glob("Images/*") if os.path.isdir(path)
)
for i, taxonomy in enumerate(taxonomies):
    # One-hot label for this taxonomy: 1.0 at its index, 0.0 elsewhere.
    # The same array object is shared by every image of the taxonomy.
    classification = np.zeros(len(taxonomies))
    classification[i] = 1.0

    # Sorted for a reproducible sample order inside each class.
    for image in sorted(glob.glob(os.path.join(taxonomy, "*.jpg"))):
        # The context manager closes the underlying file once the pixels
        # have been copied into the NumPy array.
        with Image.open(image) as image_:
            # Divide by a float so the result is true division (values scaled
            # to [0, 1]) even where `/` on integer arrays means floor division.
            image_list.append(np.asarray(image_) / 255.0)
        classification_list.append(classification)

In [13]:
# Sanity check: there must be exactly one label per imported image.
assert len(classification_list) == len(image_list)

In [14]:
# Turn both Python lists into NumPy arrays (the names are rebound in place
# because the HDF5 export cell reads them under these names).
image_list, classification_list = (
    np.asarray(image_list),
    np.asarray(classification_list),
)

In [15]:
# Destination file; opened in "w" mode below, so an existing file is overwritten.
hdf_dataset = "imported_dataset.hdf5"

# Dataset creation options shared by both datasets: gzip compression, level 9.
opts = {"compression": "gzip", "compression_opts": 9}

with h5py.File(hdf_dataset, "w") as fid:
    # Pixel data and labels are stored as two separate datasets.
    fid.create_dataset("images", data=image_list, **opts)
    fid.create_dataset("classifications", data=classification_list, **opts)