# FairFace statistics
* age

(array(['0-2', '10-19', '20-29', '3-9', '30-39', '40-49', '50-59', '60-69',
       'more than 70'], dtype=object), array([ 1792,  9103, 25598, 10408, 19250, 10744,  6228,  2779,   842]))
       
(array(['0-2', '10-19', '20-29', '3-9', '30-39', '40-49', '50-59', '60-69',
       'more than 70'], dtype=object), array([ 199, 1181, 3300, 1356, 2330, 1353,  796,  321,  118]))
       
* gender

(array(['Female', 'Male'], dtype=object), array([40758, 45986]))

(array(['Female', 'Male'], dtype=object), array([5162, 5792]))

* race

(array(['Black', 'East Asian', 'Indian', 'Latino_Hispanic',
       'Middle Eastern', 'Southeast Asian', 'White'], dtype=object), array([12233, 12287, 12319, 13367,  9216, 10795, 16527]))
       
(array(['Black', 'East Asian', 'Indian', 'Latino_Hispanic',
       'Middle Eastern', 'Southeast Asian', 'White'], dtype=object), array([1556, 1550, 1516, 1623, 1209, 1415, 2085]))

These statistics can be obtained with code such as the following (uncomment the lines you need):
```
# print(np.unique(utk_train_label[:, 0], return_counts=True))
# print(np.unique(utk_train_label[:, 2], return_counts=True)[0])
# print(np.unique(utk_train_label[:, 2], return_counts=True)[1] / len(utk_train_label))

# print(np.unique(fair_train_label[:, 0], return_counts=True)[0])

# print(np.unique(fair_train_label[:, 2], return_counts=True)[0])
# print(np.unique(fair_train_label[:, 2], return_counts=True)[1] / len(fair_train_label))



# print(np.unique(utk_train_label[:, 0], return_counts=True)[0])
# print(np.unique(utk_train_label[:, 0], return_counts=True)[1] / len(utk_train_label))

# print(np.unique(fair_train_label[:, 0], return_counts=True)[0])
# print(np.unique(fair_train_label[:, 0], return_counts=True)[1] / len(fair_train_label))
```

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.utils.data as data
from PIL import Image
import pickle
from tqdm import tqdm

In [2]:
# Load the pickled FairFace train/val pair and the UTKFace train pair, then
# convert every image/label container to a NumPy array.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("raw/fairface_train_val_pair.pkl", "rb") as f:
    fair_train_img, fair_train_label, fair_val_img, fair_val_label = pickle.load(f)
fair_train_img, fair_train_label = np.array(fair_train_img), np.array(fair_train_label)
fair_val_img, fair_val_label = np.array(fair_val_img), np.array(fair_val_label)

with open("raw/utkface_train_pair.pkl", "rb") as f:
    utk_train_img, utk_train_label = pickle.load(f)
utk_train_img, utk_train_label = np.array(utk_train_img), np.array(utk_train_label)

# Unify labels

In [3]:
def unify_race_label(race):
    """Map a fine-grained race label onto the coarser, unified label set.

    Any label containing the substring 'Asian' becomes 'Asian';
    'Latino_Hispanic' and 'Middle Eastern' become 'Others'; every other
    label is returned unchanged.
    """
    if 'Asian' in race:
        return 'Asian'
    if race in ('Latino_Hispanic', 'Middle Eastern'):
        return 'Others'
    return race

In [4]:
def unify_age_label(age):
    """Bucket an age-range label into 'Young' or 'Old'.

    The ranges '0-2', '3-9', '10-19' and '20-29' count as 'Young';
    any other value is reported as 'Old'.
    """
    young_ranges = ('0-2', '3-9', '10-19', '20-29')
    return 'Young' if age in young_ranges else 'Old'

In [5]:
# Overwrite the age column (column 0) of every label array with its
# Young/Old bucket. The arrays are modified in place.
# NOTE: this cell is not idempotent - running it a second time feeds the
# already-unified values back into unify_age_label, which maps them to 'Old'.
for labels in (fair_train_label, utk_train_label, fair_val_label):
    labels[:, 0] = np.array([unify_age_label(age) for age in labels[:, 0]])

In [6]:
# Overwrite the race column (column 2) of the FairFace train and validation
# labels with the unified race label. The arrays are modified in place; the
# UTKFace labels are not touched here.
for labels in (fair_train_label, fair_val_label):
    labels[:, 2] = np.array([unify_race_label(race) for race in labels[:, 2]])

# Randomly select equal-sized subsets

In [7]:
# Shuffle the FairFace training indices with a fixed seed, then carve out
# subsets whose size matches the UTKFace training set.
np.random.seed(2022)
permute_idx = np.random.permutation(len(fair_train_img))

utk_size = len(utk_train_img)
set1_idx = permute_idx[:utk_size]
set2_idx = permute_idx[utk_size:2 * utk_size]
# NOTE(review): "rest" starts right after set1, so it also contains every
# index of set2 - confirm this overlap is intended (vs. starting at 2 * utk_size).
rest_idx = permute_idx[utk_size:]

fair_train_img_set1, fair_train_label_set1 = fair_train_img[set1_idx], fair_train_label[set1_idx]
fair_train_img_set2, fair_train_label_set2 = fair_train_img[set2_idx], fair_train_label[set2_idx]
fair_train_img_set_rest, fair_train_label_set_rest = fair_train_img[rest_idx], fair_train_label[rest_idx]

# np.random.seed(2022)
# fair_subset_idx = np.random.choice(len(fair_train_img), len(utk_train_img) * 2, replace=False)
# fair_train_img_set1, fair_train_label_set1 = fair_train_img[fair_subset_idx[:len(utk_train_img)]], fair_train_label[fair_subset_idx[:len(utk_train_img)]]
# fair_train_img_set2, fair_train_label_set2 = fair_train_img[fair_subset_idx[len(utk_train_img):]], fair_train_label[fair_subset_idx[len(utk_train_img):]]

# Resize to 128 * 128

In [8]:
im_size = 128
# Pass an explicit (height, width): with a bare int, Resize only rescales the
# shorter edge and keeps the aspect ratio, so a non-square input would not come
# out as im_size x im_size and the later torch.stack would fail. For square
# inputs the result is unchanged.
transform = transforms.Compose([transforms.Resize((im_size, im_size)), transforms.ToTensor()])
# to_pil = transforms.ToPILImage()

In [9]:
def _images_to_tensor(images):
    """Convert a sequence of image arrays into one stacked tensor.

    Each element is wrapped with ``Image.fromarray``, passed through the
    notebook-level ``transform``, and the per-image tensors are stacked along
    a new leading dimension.
    """
    return torch.stack([transform(Image.fromarray(img)) for img in images])


# The same conversion is applied to every subset; one helper replaces five copies of the loop.
fair_train_img_set1_tensor = _images_to_tensor(fair_train_img_set1)
fair_train_img_set2_tensor = _images_to_tensor(fair_train_img_set2)
fair_train_img_set_rest_tensor = _images_to_tensor(fair_train_img_set_rest)
fair_val_img_tensor = _images_to_tensor(fair_val_img)
utk_train_img_tensor = _images_to_tensor(utk_train_img)

In [10]:
# Display the shapes of the second subset and of the "rest" subset.
(fair_train_img_set2_tensor.shape, fair_train_img_set_rest_tensor.shape)

torch.Size([23705, 3, 128, 128])

# Encoding labels

In [11]:
# Collapse every label row into a single "Age_Gender_Race" string
# (e.g. 'Old_Female_Asian'); the race field is kept in the joined label.
# NOTE: this cell is not idempotent - re-running it on the already-joined
# strings would join their individual characters with '_'.
def _join_label_rows(rows):
    """Join the fields of each label row with '_' and return a NumPy array of strings."""
    return np.array(['_'.join(row) for row in rows])


fair_train_label_set1 = _join_label_rows(fair_train_label_set1)
fair_train_label_set2 = _join_label_rows(fair_train_label_set2)
fair_train_label_set_rest = _join_label_rows(fair_train_label_set_rest)
fair_val_label = _join_label_rows(fair_val_label)
utk_train_label = _join_label_rows(utk_train_label)

# Every split must expose exactly the same classes, not merely the same number
# of classes: the encoder is fitted on set1 only and is then applied to all
# the other splits.
_class_sets = [
    set(np.unique(labels))
    for labels in (
        fair_train_label_set1,
        fair_train_label_set2,
        fair_train_label_set_rest,
        fair_val_label,
        utk_train_label,
    )
]
assert all(classes == _class_sets[0] for classes in _class_sets), \
    "label classes differ between the splits"
print(np.unique(fair_train_label_set1, return_counts=True))

(array(['Old_Female_Asian', 'Old_Female_Black', 'Old_Female_Indian',
       'Old_Female_Others', 'Old_Female_White', 'Old_Male_Asian',
       'Old_Male_Black', 'Old_Male_Indian', 'Old_Male_Others',
       'Old_Male_White', 'Young_Female_Asian', 'Young_Female_Black',
       'Young_Female_Indian', 'Young_Female_Others', 'Young_Female_White',
       'Young_Male_Asian', 'Young_Male_Black', 'Young_Male_Indian',
       'Young_Male_Others', 'Young_Male_White'], dtype='<U19'), array([ 958,  700,  710, 1135,  921, 1314,  719,  929, 2153, 1378, 2136,
        938,  916, 1492, 1222, 1908,  965,  808, 1429,  974]))


In [12]:
from sklearn import preprocessing

# Fit the encoder on the set1 labels only; the same fitted encoder is then
# used to turn every split's string labels into integer class ids.
le = preprocessing.LabelEncoder().fit(fair_train_label_set1)
print(le.classes_.shape)


def _encode(labels):
    """Encode string labels with the fitted ``le`` and wrap the result in a torch tensor."""
    return torch.tensor(le.transform(labels))


fair_train_label_set1 = _encode(fair_train_label_set1)
fair_train_label_set2 = _encode(fair_train_label_set2)
fair_train_label_set_rest = _encode(fair_train_label_set_rest)
fair_val_label = _encode(fair_val_label)
utk_train_label = _encode(utk_train_label)

(20,)


In [13]:
# Persist every (images, labels) pair to its own pickle file. Each file is
# opened in a `with` block so the handle is flushed and closed deterministically.
for path, payload in (
    ("fairface_set1_tensor.pkl", (fair_train_img_set1_tensor, fair_train_label_set1)),
    ("fairface_set2_tensor.pkl", (fair_train_img_set2_tensor, fair_train_label_set2)),
    ("fairface_set_rest_tensor.pkl", (fair_train_img_set_rest_tensor, fair_train_label_set_rest)),
    ("fairface_val_tensor.pkl", (fair_val_img_tensor, fair_val_label)),
    ("utk_tensor.pkl", (utk_train_img_tensor, utk_train_label)),
):
    with open(path, "wb") as f:
        pickle.dump(payload, f)

# Similarity datasets

In [14]:
# Build datasets that share a given proportion of their samples with set1:
# the last `shift` samples of set1 are kept and the remainder is filled with
# the first samples of set2, so every dataset has set1_num samples.
set1_num = len(fair_train_img_set1_tensor)
# open() does not create missing directories, so make sure the target exists.
os.makedirs("fairface_similarity", exist_ok=True)
for intersect_proportion in [1.0, 0.8, 0.6, 0.4, 0.2, 0.0]:
    shift = int(intersect_proportion * set1_num)
    print(f"Sample {set1_num - shift} to {set1_num} from set1, 0 to {set1_num - shift} from set2.")
    X_tensor = torch.cat([fair_train_img_set1_tensor[set1_num - shift:], fair_train_img_set2_tensor[:set1_num - shift]])
    y_tensor = torch.cat([fair_train_label_set1[set1_num - shift:], fair_train_label_set2[:set1_num - shift]])
    # Close the file deterministically instead of leaking the handle.
    with open(f"fairface_similarity/intersect_{intersect_proportion}.pkl", "wb") as f:
        pickle.dump((X_tensor, y_tensor), f)


Sample 0 to 23705 from set1, 0 to 0 from set2.
Sample 4741 to 23705 from set1, 0 to 4741 from set2.
Sample 9482 to 23705 from set1, 0 to 9482 from set2.
Sample 14223 to 23705 from set1, 0 to 14223 from set2.
Sample 18964 to 23705 from set1, 0 to 18964 from set2.
Sample 23705 to 23705 from set1, 0 to 23705 from set2.


In [15]:
# Build FairFace/UTKFace mixtures: the last `shift` samples of FairFace set1
# are kept and the remainder is filled with the first UTKFace samples, so
# every dataset has set1_num samples.
set1_num = len(fair_train_img_set1_tensor)
# open() does not create missing directories, so make sure the target exists.
os.makedirs("fairface_utk_mix", exist_ok=True)
for intersect_proportion in [1.0, 0.8, 0.6, 0.4, 0.2, 0.0]:
    shift = int(intersect_proportion * set1_num)
    # NOTE: the message says "set2", but the second slice in this cell is taken
    # from the UTKFace tensors; the text is left unchanged.
    print(f"Sample {set1_num - shift} to {set1_num} from set1, 0 to {set1_num - shift} from set2.")
    X_tensor = torch.cat([fair_train_img_set1_tensor[set1_num - shift:], utk_train_img_tensor[:set1_num - shift]])
    y_tensor = torch.cat([fair_train_label_set1[set1_num - shift:], utk_train_label[:set1_num - shift]])
    # Close the file deterministically instead of leaking the handle.
    with open(f"fairface_utk_mix/intersect_{intersect_proportion}.pkl", "wb") as f:
        pickle.dump((X_tensor, y_tensor), f)


Sample 0 to 23705 from set1, 0 to 0 from set2.
Sample 4741 to 23705 from set1, 0 to 4741 from set2.
Sample 9482 to 23705 from set1, 0 to 9482 from set2.
Sample 14223 to 23705 from set1, 0 to 14223 from set2.
Sample 18964 to 23705 from set1, 0 to 18964 from set2.
Sample 23705 to 23705 from set1, 0 to 23705 from set2.
