In [1]:
import numpy as np
import PIL
import keras
from keras.preprocessing import image
import os
import pickle
import h5py

In [2]:
# zero pad every file to 5 digits because Google Drive sorts things differently
def rename_file(image):
    """Rename ``<prefix>_<index>.<ext>`` to ``train_<index padded to 5 digits>.jpg``.

    The renamed file stays in the same directory as the source file.

    Parameters
    ----------
    image : str
        Path (or bare file name) of the file to rename. The index is taken
        from the text between the first underscore and the following dot of
        the file name, e.g. ``train_7.jpg`` -> ``train_00007.jpg``.

    Raises
    ------
    IndexError
        If the file name contains no underscore.
    ValueError
        If the text after the first underscore is not a decimal index; the
        file is left untouched in that case.
    """
    # Parse only the file name, so underscores or dots in parent directories
    # cannot be mistaken for the index separator.
    directory, filename = os.path.split(image)
    num = filename.split('_')[1].split('.')[0]
    if not num.isdigit():
        # Refuse to rename unrelated files (e.g. "notes_final.txt") instead of
        # silently turning them into "train_final.jpg".
        raise ValueError(f'no numeric index found in file name: {filename!r}')
    new = num.zfill(5)
    # Re-attach the directory so the file is renamed in place rather than
    # moved into the current working directory.
    os.rename(image, os.path.join(directory, f'train_{new}.jpg'))

In [3]:
# for file in os.listdir():
#     rename_file(file)

In [4]:
# Read every .jpg in the training image directory, resize it to 224x224
# (the input size stated for the ResNet50 and VGG 16 models) and turn it into
# a 3D array scaled by 1/255.
# NOTE(review): the recorded `tensor.shape` output and the 'tensors_32.h5'
# file name both point at 32x32 images, not 224x224 — confirm which size is
# intended before re-running.

# The relative chdir changes the working directory for all following cells.
os.chdir('../data/train-jpg2')

# sorted() fixes the order in which images are loaded; presumably this has to
# line up with the row order of the tags CSV used for the labels — verify.
jpg_files = [name for name in sorted(os.listdir()) if name.endswith('.jpg')]

arr_list = []
for file in jpg_files:
    img = image.load_img(file, target_size=(224,224))
    # map the 0-255 RGB values onto the 0-1 range
    arr = image.img_to_array(img) * (1.0/255.0)
    arr_list.append(arr)

In [5]:
# Combine the per-image 3D arrays along a new leading axis, giving one 4D array
# with one entry per image.
tensor = np.stack(arr_list)

In [6]:
# np.stack copied the images into `tensor`, so drop the list to release the
# reference to the per-image arrays.
del arr_list

In [7]:
# Sanity-check the dimensions of the stacked image array.
# NOTE(review): the output recorded under this cell reports 32x32 images, while
# the loading cell requests target_size=(224,224) — re-run to confirm which
# size is current.
tensor.shape

(40479, 32, 32, 3)

In [8]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Load the CSV whose `tags` column is vectorized into the label matrix in the
# following cells. The path is relative to the working directory set by the
# earlier os.chdir call.
labels_csv = '../../data/train_v2.csv'
df = pd.read_csv(labels_csv)

In [10]:
# Count how often each tag token appears in every row of the `tags` column,
# then convert the sparse result into a dense array (one column per tag).
cv = CountVectorizer()
y = cv.fit_transform(df['tags']).toarray()

In [11]:
# Binarize the label matrix in place: a tag counted more than once in a row
# collapses to 1, so every entry is a 0/1 indicator. Using `> 1` rather than
# `== 2` also covers a tag repeated three or more times.
y[y > 1] = 1

In [12]:
# Hold out 25% of the images (and their label rows) as a test set; the fixed
# random_state makes the split repeatable. The arrays are written to an HDF5
# file in the next cell.
X_train, X_test, y_train, y_test = train_test_split(
    tensor,
    y,
    test_size=0.25,
    random_state=23,
)

In [13]:
# Step one directory up from the current working directory (the image folder
# entered by the loading cell) so the HDF5 file is written next to it.
os.chdir('..')

# Write the four split arrays as separate datasets. The context manager closes
# the file even if one of the writes fails, so a failed run does not leave the
# file open and locked for the next attempt. A later explicit f.close() is a
# harmless no-op because h5py tolerates closing an already-closed file.
# NOTE(review): the file name says "32" while the loading cell requests
# 224x224 images — confirm the intended size/name.
with h5py.File('tensors_32.h5','w') as f:
    f.create_dataset('X_train',data=X_train)
    f.create_dataset('X_test',data=X_test)
    f.create_dataset('y_train',data=y_train)
    f.create_dataset('y_test',data=y_test)

<HDF5 dataset "y_test": shape (10120, 17), type "<i8">

In [14]:
# Close the HDF5 file handle so everything written to it is flushed to disk.
f.close()