### CelebA Binary Classification

- Reduces the CelebA dataset to two classes: face images with eye-wear (class=1) and without (class=0) 
- Balances the classes by randomly drawing the same number of images from each class within every partition
- Stores the indices of images in the train and test sets in .pickle files so that the full dataset can be sub-sampled with torchvision's dataloader interface. 

#### Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import random

from pathlib import Path

#### Load and display data

In [2]:
# Root directory that holds the CSV files read below; it must end with a
# path separator because file names are appended to it directly.
main_folder = './'
# Image directory, located directly under the root folder.
images_folder = f'{main_folder}img_align_celeba'

In [3]:
# Load the attribute table and recode every -1 in it as 0.
df = (
    pd.read_csv(main_folder + 'list_attr_celeba.csv')
    .replace(to_replace=-1, value=0)
)
df.head()

Unnamed: 0,image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,000001.jpg,0,1,1,0,0,0,0,0,0,...,0,1,1,0,1,0,1,0,0,1
1,000002.jpg,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
2,000003.jpg,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
3,000004.jpg,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,1,1,0,1
4,000005.jpg,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1


#### Isolate class of interest: wearing eyeglasses or not

In [4]:
# Load the table that assigns every image to a partition (0, 1 or 2).
# The original cell also evaluated `value_counts()` here, but the result was
# discarded because only a cell's last expression is displayed; the
# per-partition counts are shown in a dedicated cell after the join.
df_partition = pd.read_csv(main_folder + 'list_eval_partition.csv')
df_partition.head()

Unnamed: 0,image_id,partition
0,000001.jpg,0
1,000002.jpg,0
2,000003.jpg,0
3,000004.jpg,0
4,000005.jpg,0


In [5]:
# Eyeglasses labels as a plain Python list, in the row order of `df`.
y = df['Eyeglasses'].tolist()

In [6]:
# Attach the Eyeglasses column to the partition table. The join is done on
# the row index, so it assumes both CSV files list the images in the same
# order -- NOTE(review): confirm if either file is ever re-sorted.
df_par_attr = df_partition.join(other=df['Eyeglasses'], how='inner')
df_par_attr.head()

Unnamed: 0,image_id,partition,Eyeglasses
0,000001.jpg,0,0
1,000002.jpg,0,0
2,000003.jpg,0,0
3,000004.jpg,0,0
4,000005.jpg,0,0


In [7]:
# Number of images assigned to each partition.
df_par_attr.loc[:, 'partition'].value_counts()

0    162770
2     19962
1     19867
Name: partition, dtype: int64

#### Get the size of the smallest class for each partition

In [8]:
# Count the distinct images in every (partition, Eyeglasses) cell and lay the
# two classes out as columns, filling a class that is absent with 0.
class_counts = (
    df_par_attr
    .groupby(['partition', 'Eyeglasses'])
    .image_id.nunique()
    .unstack('Eyeglasses', fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)

# The balanced sample size of a partition is the size of its smaller class.
# Partitions are selected by label (0, 1, 2) rather than by row position so
# the three names cannot be silently mismatched.
train_size, val_size, test_size = (
    class_counts.min(axis=1).loc[[0, 1, 2]].tolist()
)

print(train_size, val_size + test_size)

10521 2672


#### Get and store indices for training and test sets

In [9]:
attr = 'Eyeglasses'

def get_image_ids_and_labels(training_df, partitions, sizes, ys, seed=0, label_col=None):
    """Draw a class-balanced sample of row indices from the given partitions.

    For every ``(partition, size)`` pair, ``size`` rows whose label is 1 and
    ``size`` rows whose label is 0 are sampled from that partition with
    ``DataFrame.sample`` (pandas samples without replacement by default).

    Parameters
    ----------
    training_df : pandas.DataFrame
        Table with a ``'partition'`` column and the binary label column.
    partitions : sequence
        Partition values to sample from.
    sizes : sequence of int
        Number of rows to draw per class, one entry per partition.
    ys : sequence
        Labels, looked up with the sampled index values of ``training_df``.
    seed : int, default 0
        ``random_state`` passed to every ``sample`` call.
    label_col : str, optional
        Name of the binary label column. Defaults to the module-level
        ``attr`` so existing callers keep their behaviour.

    Returns
    -------
    dict
        ``'idx'``: all sampled positive indices followed by all sampled
        negative indices; ``'y'``: the entry of ``ys`` for each index.

    Raises
    ------
    ValueError
        If ``partitions`` and ``sizes`` differ in length. pandas also raises
        ``ValueError`` when a class has fewer rows than requested.
    """
    if len(partitions) != len(sizes):
        raise ValueError(
            'partitions and sizes must have the same length, got '
            f'{len(partitions)} and {len(sizes)}'
        )

    if label_col is None:
        label_col = attr

    pos_sample, neg_sample = [], []

    for partition, size in zip(partitions, sizes):
        in_partition = training_df['partition'] == partition

        # Same sampling recipe for both classes; positives first so the
        # random draws match the original positive-then-negative order.
        for label, bucket in ((1, pos_sample), (0, neg_sample)):
            candidates = training_df[in_partition & (training_df[label_col] == label)]
            bucket.extend(
                candidates.sample(size, random_state=seed).index.tolist()
            )

    sample_xs = pos_sample + neg_sample

    # Index labels double as positions in `ys`.
    sample_ys = [ys[y_idx] for y_idx in sample_xs]

    return {'idx': sample_xs, 'y': sample_ys}

In [10]:
# Training indices come from partition 0 only.
train_dict = get_image_ids_and_labels(
    training_df=df_par_attr,
    partitions=[0],
    sizes=[train_size],
    ys=y,
)
# The test indices pool partitions 1 and 2, each balanced separately.
test_dict = get_image_ids_and_labels(
    training_df=df_par_attr,
    partitions=[1, 2],
    sizes=[val_size, test_size],
    ys=y,
)

In [11]:
# Build the output paths with pathlib instead of string concatenation:
# `main_folder + '/...'` produced './/...' and would point at the filesystem
# root if `main_folder` were ever empty.
output_dir = Path(main_folder)

# Write each index dictionary to its own pickle file.
for filename, index_dict in (
    (output_dir / 'subsample_train_indices.pickle', train_dict),
    (output_dir / 'subsample_test_indices.pickle', test_dict),
):
    with open(filename, 'wb') as f:
        pickle.dump(index_dict, f)