In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import MaxPooling2D

import tensorflow.keras.backend as K
import tensorflow as tf

import cv2
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
import os
import tensorflow as tf
import keras as keras
from PIL import Image
import warnings
warnings.filterwarnings('ignore')
import src.proprietary_functions as src

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Lambda
from tensorflow.keras.datasets import mnist



In [2]:
image_file = './sample_data/Img/'
annotation_file = './sample_data/Anno/'
export_file = './export'

identity_file = annotation_file + 'identity_CelebA.txt'
bbox_file = annotation_file + 'list_bbox_celeba.txt'

# Train/test split variables
random_seed = 123
test_size = 0.2
validation_size = 0.2


image_id_col = 'image_id'
bbox_col_names = {
    'x_start' : 'x_1',
    'y_start' : 'y_1',
    'width' : 'width',
    'height' : 'height',
    'x_end' : '',
    'y_end' : ''}

identity_file = annotation_file + 'identity_CelebA.txt'
bbox_file = annotation_file + 'list_bbox_celeba.txt'



# Loading dataset metadata
identity = pd.read_csv(identity_file, sep=" ", header = None,names=['image', 'image_id'])
bbox = pd.read_csv(bbox_file, delim_whitespace=True)


#%% Filtering faces that appear at least 20 times
labels_annot = pd.DataFrame(identity.image_id.value_counts(ascending=True)).query('image_id > 20').index.tolist()
identity_filtered = identity[identity['image_id'].isin(labels_annot)]

#%% [SPRINT 2] Train/test split of the annotations
imgs = identity_filtered['image']
labels = identity_filtered['image_id']

temp_imgs, test_imgs, _, test_labels = train_test_split(imgs, labels,
                                               test_size = test_size,
                                               random_state = random_seed,        
                                               stratify = labels)
train_imgs, valid_imgs, train_labels, valid_labels = train_test_split(temp_imgs, _,
                                               test_size = validation_size/(1-test_size),
                                               random_state = random_seed,        
                                               stratify = _)

#%% 
# Safe train/test split

if not os.path.exists(export_file):
    os.makedirs(export_file)

if not os.path.exists(export_file + '/setting'):
    os.makedirs(export_file + '/setting')

if export_file != '':
    train_imgs.to_csv(export_file + '/setting/train_imgs.csv', index = False)
    valid_imgs.to_csv(export_file + '/setting/valid_imgs.csv', index = False)
    test_imgs.to_csv(export_file + '/setting/test_imgs.csv', index = False)

In [None]:
#Filtering training set - both images and labels as data frame.
training_set = identity_filtered[identity_filtered['image'].isin(train_imgs)]

In [None]:
#Extracting 100 labels having the most pictures - we gonna use only part of the training set.
labs = list(train_labels.value_counts().head(1000).index)

In [None]:
#extracting randomly 6 pictures per each label.
random.seed(12496)
pics = {i:[random.choices(list(training_set.loc[training_set['image_id'] == i,'image']), k = 10)] for i in labs}

In [None]:
#combining both filtered pictures and labels into a data frame.
subset_df = pd.DataFrame(pics).transpose()
subset_df.index = labs
subset_df.columns = ['pics']
subset_df[[f'pic_{i}' for i  in range(1,11)]] = pd.DataFrame(subset_df.pics.tolist(), index= subset_df.index)
subset_df = subset_df.drop('pics',axis = 1)
subset_df = subset_df.stack().reset_index().drop('level_1', axis=1).rename(columns = {'level_0':'label', 0: 'image'})
subset_df

Unnamed: 0,label,image
0,2820,001553.jpg
1,2820,003568.jpg
2,2820,008286.jpg
3,2820,034769.jpg
4,2820,043941.jpg
...,...,...
9995,7233,009730.jpg
9996,7233,093995.jpg
9997,7233,118662.jpg
9998,7233,019452.jpg


In [None]:
#Extracting the filtered labels and pictures.
imgs_pn, labels_pn = subset_df['image'], subset_df['label']

In [None]:
#From the filtered subset of training set, we split this subset into training, validation and test set.
temp_X, test_X, temp_Y, test_Y = train_test_split(imgs_pn, labels_pn,
                    test_size = 0.2,random_state = random_seed, stratify = labels_pn)

train_X, valid_X, train_Y, valid_Y = train_test_split(temp_X, temp_Y,
                    test_size = 0.25,random_state = random_seed, stratify = temp_Y)

In [None]:
#Creating paths for saving cropped images from the filtered subsets.
if not os.path.exists('./cropped/train/'):
    os.makedirs('./cropped/train/')

if not os.path.exists('./cropped/valid/'):
    os.makedirs('./cropped/valid/')

if not os.path.exists('./cropped/test/'):
    os.makedirs('./cropped/test/')

In [None]:
def cropping(imgs, bboxes, sample):
    
    def face_crop(image_name, bbox_df):
        # Loading Image
        image_path = './data/Img/img_celeba/' + image_name
        img = cv2.imread(image_path)

    # Setting bounding box coordinates
        startX = bbox_df[bbox_df['image_id'] == image_name]['x_1'].values[0]
        startY = bbox_df[bbox_df['image_id'] == image_name]['y_1'].values[0]
        endX = startX + bbox_df[bbox_df['image_id'] == image_name]['width'].values[0]
        endY = startY + bbox_df[bbox_df['image_id'] == image_name]['height'].values[0]
    
    #Cropping and saving boounding box
        crop_img = img[startY:endY, startX:endX]
        output_img = crop_img

        output_img = cv2.resize(crop_img, (224, 224))

        return output_img

    cropped_pics = []
    pic_names = []

    for pic in imgs:
        crop_img = face_crop(pic, bboxes)
        cropped_pics.append(crop_img)
        pic_names.append(pic)

        cv2.imwrite(f'./cropped/{sample.lower()}/{pic}', crop_img)

    return cropped_pics, pic_names
        

In [None]:
cropped_train_X, train_X_names = cropping(train_X, bbox, 'train')
cropped_valid_X, valid_X_names = cropping(valid_X, bbox, 'valid')
cropped_test_X, test_X_names = cropping(test_X, bbox, 'test')

In [None]:
if not os.path.exists('./cropped_numpys/'):
    os.makedirs('./cropped_numpys/')

In [None]:
print('Number of training instances:',len(cropped_train_X))
print('Number of validation instances:',len(cropped_valid_X))
print('Number of test instances:',len(cropped_test_X))

Number of training instances: 6000
Number of validation instances: 2000
Number of test instances: 2000


In [None]:
#converting list of images into arrays
arr_train_X = np.array(cropped_train_X)
arr_valid_X = np.array(cropped_valid_X)
arr_test_X = np.array(cropped_test_X)

In [None]:
#converting list of labels into arrays
arr_train_Y = np.array(train_Y)
arr_valid_Y= np.array(valid_Y)
arr_test_Y = np.array(test_Y)

In [None]:
for npy ,sample in zip([arr_train_X, arr_valid_X, arr_test_X],  ['train','valid','test']):
    with open(f'./cropped_numpys/cropped_{sample}_X.npy', 'wb') as f:
        np.save(f, npy)

In [None]:
for lst ,sample in zip([train_X_names, valid_X_names, test_X_names],  ['train','valid','test']):
    with open(f'./cropped_numpys/cropped_{sample}_X_names.npy', 'wb') as f:
        np.save(f, np.array(lst))

In [None]:
for lab ,sample in zip([arr_train_Y, arr_valid_Y, arr_test_Y],  ['train','valid','test']):
    with open(f'./cropped_numpys/cropped_{sample}_Y.npy', 'wb') as f:
        np.save(f, lab)