## Input Preparation - Chips
In this notebook, we take directories of sea lion image chips and convert them to a format ready for machine learning training.

In [75]:
import cv2
import pathlib
import pickle
import random

##### Use pathlib to inspect the contents of the bbox_chips directory, then glob together one list of all the images.
*** 
Over multiple runs the pkl files grew too large, so the code was edited to take at most 2000 chips of each class.

In [76]:
# Names of the chip classes; each is used below as a sub-folder name when globbing.
classes = [
    'adult_females',
    'adult_males',
    'juveniles',
    'pups',
    'subadult_males',
]

In [77]:
# Root folder that holds the chip sub-folders.
images_root = pathlib.Path('../../results/bbox_chips')

# List whatever sits directly inside the root as a quick sanity check.
print('Chip Folders: ', '\n', '-' * 50)
for entry in images_root.iterdir():
    print(entry)

# Globbing every chip at once was abandoned because the pkl files grew too big:
#all_image_paths = list(images_root.glob('*/*'))
#all_image_paths = [str(path) for path in all_image_paths]

Chip Folders:  
 --------------------------------------------------
..\..\results\bbox_chips\adult_females
..\..\results\bbox_chips\adult_males
..\..\results\bbox_chips\juveniles
..\..\results\bbox_chips\pups
..\..\results\bbox_chips\subadult_males


In [78]:
# Upper bound on chips taken from each class; without a cap the pkl files grew too large.
MAX_CHIPS_PER_CLASS = 2000

all_image_paths = []

for category in classes:
    # Sort before slicing: glob yields entries in filesystem-dependent order, so an
    # unsorted slice could select a different subset on another machine and make
    # the seeded shuffle below non-reproducible. Non-files (e.g. nested folders)
    # are dropped because they cannot be read as images later.
    image_paths = sorted(
        path for path in images_root.glob(category + '/*') if path.is_file()
    )[:MAX_CHIPS_PER_CLASS]
    print(len(image_paths))
    all_image_paths.extend(image_paths)

random.seed(42)
random.shuffle(all_image_paths)  # shuffle now for train test split later

print('-' * 50)
print(f'Total image count is {len(all_image_paths)}')

2000
2000
2000
2000
2000
--------------------------------------------------
Total image count is 10000


##### Create label index codes. Get names from the folder directories.

In [79]:
# Class names are the sub-folder names under the root, in sorted order.
label_names = sorted(
    folder.name for folder in images_root.glob('*/') if folder.is_dir()
)
# Each name maps to its position in that sorted list.
label_to_index = {name: code for code, name in enumerate(label_names)}
label_to_index

{'adult_females': 0,
 'adult_males': 1,
 'juveniles': 2,
 'pups': 3,
 'subadult_males': 4}

##### Create labels for all chips

In [80]:
# A chip's label is the index code of the folder it lives in.
all_image_labels = [
    label_to_index[folder_name]
    for folder_name in (pathlib.Path(chip).parent.name for chip in all_image_paths)
]
all_image_labels[0:10] # sample of the labels

[1, 3, 3, 0, 1, 0, 3, 1, 1, 4]

##### Read in each chip filename as an image array with OpenCV.

In [81]:
#len(all_image_arrays),len(all_image_labels)

In [82]:
#all_image_arrays, bad_images = [], []

In [83]:
def img_to_array(img_path, size=(80, 80)):
    """Load one image chip as a flattened grayscale array.

    Parameters
    ----------
    img_path : str or path-like
        Location of the image file; converted with ``str`` before reading.
    size : tuple of int, optional
        Target size handed to ``cv2.resize``. Defaults to ``(80, 80)``,
        the size this notebook has always used.

    Returns
    -------
    array
        The resized image converted from BGR to grayscale and flattened
        to one dimension with ``ravel()``.

    Raises
    ------
    ValueError
        If ``cv2.imread`` returns ``None`` for ``img_path``.
    """
    img = cv2.imread(str(img_path))
    if img is None:
        # cv2.imread reports an unreadable file by returning None instead of
        # raising; fail here with the offending path rather than letting
        # cv2.resize blow up with an opaque assertion error.
        raise ValueError(f'could not read image: {img_path}')
    img = cv2.resize(img, size)
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).ravel()

In [84]:
# all_image_arrays, bad_images = [], []
# for i in all_image_paths:
#     try:
#         image_array = img_to_array(i)
#         all_image_arrays.append(image_array)
#     except:
#         print('bad image = ',str(i))
#         bad_images.append(str(i))
#         bad_img_index = all_image_paths.index(i)
#         del all_image_labels[bad_img_index] # drop image label out of list for the bad image
#         continue

In [85]:
#bad_images

In [86]:
## List comprehension wasn't catching errors so try traditional method
#for i in all_image_paths:
#    cv2.imread(str(i))
    

In [87]:
# Convert the chips one at a time with img_to_array (read, resize to 80x80,
# grayscale, flatten) so that only the small flattened arrays are kept in memory,
# instead of first holding every full-size colour image in a list.
# Chips that cannot be converted are skipped and reported rather than aborting
# the whole run on the first bad file.
all_image_arrays, readable_paths, readable_labels, bad_images = [], [], [], []

for path, label in zip(all_image_paths, all_image_labels):
    try:
        image_array = img_to_array(path)
    except (cv2.error, ValueError):
        bad_images.append(str(path))
        continue
    all_image_arrays.append(image_array)
    readable_paths.append(path)
    readable_labels.append(label)

# Rebind paths and labels together so both stay aligned with the arrays,
# including when this cell is run again.
all_image_paths, all_image_labels = readable_paths, readable_labels

if bad_images:
    print(f'Skipped {len(bad_images)} unreadable image(s):', *bad_images, sep='\n')

assert len(all_image_arrays) == len(all_image_labels), 'arrays and labels are misaligned'

##### Save image labels and image arrays to pickles for next step - machine learning model training and testing!

In [88]:
# Write the labels first, then the arrays, each to its own pickle file.
for pkl_name, payload in (
    ('image_labels.pkl', all_image_labels),
    ('image_arrays.pkl', all_image_arrays),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(payload, f)

## Move on to the next notebook - we will load these pkl files to retrieve our labels and arrays.