[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/codeteme/img_clf_data_sprint_51/blob/main/scripts/imgdata_to_nparray.ipynb)

In [None]:
# Upgrade pip, then install the Kaggle command-line client that a later cell
# uses to download the dataset.
!pip install --upgrade pip
!pip install kaggle

In [None]:
# Restrict the Kaggle API token file to owner read/write only.
# NOTE(review): the token lives in ../kaggle rather than the user's home
# directory; presumably the Kaggle client has to be pointed at that folder
# (e.g. through its config-dir environment variable) -- confirm.
!chmod 600 ../kaggle/kaggle.json

In [None]:
# Download the dataset archive from Kaggle into ../data/raw.
!kaggle datasets download temesgentewolde/animal-dataset-intermediate -p ../data/raw

In [None]:
# Extract the downloaded archive into ../data/raw.
!unzip ../data/raw/animal-dataset-intermediate.zip -d ../data/raw

In [None]:
# Delete the archive after extraction; only the extracted files are used below.
!rm ../data/raw/animal-dataset-intermediate.zip

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import pathlib
import os
import PIL
import PIL.Image

In [None]:
# Location of the extracted training images, kept as a Path object so the
# cells below can glob and join against it.
data_dir = pathlib.Path("../data/raw/animal_dataset_intermediate/train")
print(data_dir)

In [None]:
# Delete every file in the class folders whose leading bytes do not contain
# the "JFIF" marker; such files are treated as corrupted images.
num_skipped = 0
for folder_name in ("elefante", "farfalla", "mucca", "pecora", "scoiattolo"):
    folder_path = os.path.join(data_dir, folder_name)
    for fname in os.listdir(folder_path):
        fpath = os.path.join(folder_path, fname)
        # Use a context manager: the handle is always closed, and if open()
        # itself fails the original error propagates instead of a cleanup
        # step touching an unbound (or stale, already-closed) file object.
        with open(fpath, "rb") as fobj:
            # peek() does not advance the stream and may hand back more than
            # the requested 10 bytes, so the marker is searched in whatever
            # the buffered reader returns.
            is_jfif = b"JFIF" in fobj.peek(10)

        if not is_jfif:
            num_skipped += 1
            # Delete corrupted image
            os.remove(fpath)

print("Deleted %d images" % num_skipped)

In [None]:

# Count the images one level below data_dir, i.e. inside the class folders,
# that carry a .jpg or .jpeg extension.
jpg_paths = list(data_dir.glob('*/*.jpg'))
jpeg_paths = list(data_dir.glob('*/*.jpeg'))
image_count = len(jpg_paths) + len(jpeg_paths)
print("Imported image_count: ", image_count)

In [None]:
# Collect every entry of the "elefante" class folder and open the first one
# returned. The trailing expression is left bare so the notebook renders it.
elephant = list(data_dir.glob('elefante/*'))
first_elephant_path = str(elephant[0])
PIL.Image.open(first_elephant_path)

In [None]:
# Number of images per batch and the size every image is resized to on load.
batch_size = 32
img_height = 256
img_width = 256

# Arguments shared by both subsets. The same split fraction and seed are used
# for the two calls: 80% of the images go to training, the remaining 20% to
# validation.
_split_options = dict(
    validation_split=0.2,
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, subset="training", **_split_options)

val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, subset="validation", **_split_options)

Let's check if the OS and PIL libraries are correctly reading and displaying the images.

In [None]:
# Take the label names from the dataset itself instead of hard-coding them, so
# that position i in this list always corresponds to integer label i produced
# by train_ds, whatever class folders are present under data_dir.
class_names = train_ds.class_names

In [None]:
import matplotlib.pyplot as plt

# Show the first nine images of the first training batch in a 3x3 grid, each
# titled with its class name.
plt.figure(figsize=(10, 10))
for images, labels in train_ds.take(1):
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        # Cast to uint8 before handing the pixel values to imshow.
        pixels = images[i].numpy().astype("uint8")
        plt.imshow(pixels)
        label_name = class_names[labels[i]]
        plt.title(label_name)
        plt.axis("off")
plt.show()


In [None]:
def get_image_label(ds):
    """Collect all images (x) and labels (y) of a batched dataset.

    Args:
        ds: Object exposing ``as_numpy_iterator()`` whose elements are
            indexable pairs: images at position 0, labels at position 1.

    Returns:
        A tuple ``(images, labels)`` where each member is the concatenation,
        along the first axis, of the corresponding per-batch arrays.
    """
    batches = list(ds.as_numpy_iterator())
    images = np.concatenate([batch[0] for batch in batches])
    labels = np.concatenate([batch[1] for batch in batches])
    return (images, labels)

# Materialize both subsets as in-memory arrays.
x_train, y_train = get_image_label(train_ds)
x_val, y_val = get_image_label(val_ds)

# Report the container types once, then shape and rank for each subset.
print(type(x_train), type(y_train))
for _x, _y in ((x_train, y_train), (x_val, y_val)):
    print(_x.shape, _y.shape, _x.ndim)

In [None]:
# Persist the four arrays back-to-back in one file. Each np.save call appends
# to the open handle, so reading them back takes one np.load call per array on
# a single open file, in this same order: x_train, y_train, x_val, y_val.
output_path = '../data/interim/img_as_array.npy'
# open() does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as f:
    for array in (x_train, y_train, x_val, y_val):
        np.save(f, array)