In [None]:
%load_ext autoreload
%autoreload 2
import sys
import numpy as np
import PIL.Image
import matplotlib.pyplot as plt
from os.path import join as oj
import pandas as pd
import pickle as pkl
import models
import util
import os
import config
from config import ATTR_TO_INDEX
import viz
import scipy.stats
from tqdm import tqdm
import figs
import matplotlib.image as mpimg
import sklearn.decomposition
import sklearn.manifold
import seaborn as sns
import data
import transects
import face_recognition
import sklearn.metrics
from matching import *
from matplotlib.image import BboxImage
from matplotlib.transforms import Bbox, TransformedBbox


df = data.load_all_labs()
df = df.set_index('fname_id')

DIR_ORIG = '../data/celeba-hq/ims/'
reg = 0.1
DIR_GEN = oj(f'../data_processed/celeba-hq/generated_images_{reg}')

# get fnames
fname_nps = [f for f in sorted(os.listdir(DIR_GEN)) if 'npy' in f] # these start at 00001
fname_ids = np.array([f[:-4] for f in fname_nps])
idxs_calculated = np.array([int(x) - 1 for x in fname_ids]) # this starts at 0

# trim df to only have the relevant ids
df = df.loc[fname_ids]

In [4]:
# load the linear model in latent space
coefs, intercepts = transects.get_directions()
coefs = np.array(coefs).squeeze()
intercepts = np.array(intercepts)


# load latents and calculate dists
print('loading latents...')
latents = np.array([np.load(oj(DIR_GEN, f)) for f in fname_nps])
lats = get_lat(latents)
preds = latents.mean(axis=1) @ coefs.T + intercepts.T
weights = np.zeros(preds.shape[1])
# print(ATTR_TO_INDEX)
# weights[ATTR_TO_INDEX['skin-color']] = 1e2
vecs = join_vecs(preds, lats, weights)

print('calculating dists...')
dists_gan = get_dists(vecs)
print('done!')

# load pairwise facial dicts
print('loading facial rec dists...')
dists_facial = np.load(open('processed/13_facial_dists_pairwise.npy', 'rb')) # pkl.load()
dists_facial = dists_facial[idxs_calculated, :][:, idxs_calculated]
print('done!')



loading latents...
calculating dists...
done!
loading facial rec dists...
done!


# do matching

### find matching for an im

In [None]:
# img 38 is a good gender example
for im_idx in range(10):# range(38, 45):
    
    # select subset of indices to use for matching
    idxs = np.ones(df.shape[0]).astype(bool)
#     idxs = (df['gender'] > 0).values
#     idxs = (df['race'] == 'Black').values
#     idxs = (df['Eyeglasses'] > 0).values
    dists_im = dists_gan[im_idx][idxs] # first select row, then select vals
#     dists_im = dists_facial[im_idx][idxs]
    fname_ids_for_matching = fname_ids[idxs]    
    
    closest_match_vals, closest_matches_fnames = calc_matches(dists_im, fname_ids_for_matching)
    # print(closest_matches_fnames)
    
    # load images
    N_MATCHES_TO_PLOT = 5
    fname_id = fname_ids[im_idx]
    im_orig = mpimg.imread(oj(DIR_ORIG, f'{fname_id}.jpg'))
    im_rec = mpimg.imread(oj(DIR_GEN, f'{fname_id}.png'))
    im_matches = [mpimg.imread(oj(DIR_GEN, f'{fname_match}.png'))
                  for fname_match in closest_matches_fnames[:N_MATCHES_TO_PLOT]]
    
    # plt images
    util.plot_row([im_orig, im_rec] + im_matches,
                  annot_list=['orig', 'rec'] + closest_match_vals[:N_MATCHES_TO_PLOT].round(3).tolist(), dpi=150)
    plt.show()
    # print(closest_matches, closest_matches_fnames)
# show_matches(dists_gan, DIR_ORIG, DIR_GEN, im_nums=range(5, 10))

**simple dim reduction**

In [None]:
def plot_image(xs, ys, im):
    '''Note: should normalize x/y coords to 0-1 before plotting
    '''
    for idx, (x, y) in tqdm(enumerate(zip(xs, ys))):
        bb = Bbox.from_bounds(x, y, IM_SIZE, IM_SIZE)  
        bb2 = TransformedBbox(bb, ax.transData)
        bbox_image = BboxImage(bb2, origin=None, clip_on=False)
        bbox_image.set_data(im[idx])
#          bbox_image.set_alpha(1.0)
        ax.add_artist(bbox_image)
    return ax

# plot
N_IMS = 1000
IM_SIZE = 0.025
N_PLOT = 1000

fig = plt.figure(figsize=(20, 20), dpi=100)
ax = fig.add_subplot(111)
ims = [mpimg.imread(oj(DIR_ORIG, f))[::2, ::2] for f in df.fname_final[:N_IMS]]
l = sklearn.decomposition.PCA(n_components=2).fit_transform(lats[:N_IMS])
# l = sklearn.manifold.TSNE().fit_transform(lats[:N_IMS])
l = (l - l.min(axis=0)) / (l.max(axis=0) - l.min(axis=0))
ax = plot_image(l[:N_PLOT, 0], l[:N_PLOT, 1], ims)
plt.xlabel('Dim 1')
plt.ylabel('Dim 2')
# plt.xlim((0.5, 1))
# plt.ylim((0, 0.5))
plt.show()

# plots for specific ids/matches

**make some plots for a specific match**

In [None]:
im_idx = 38 # img 38 is a good gender example
fname_id2 = '02638' # this is the match
# if all images are calculated, then this is just 5-char string of im_idx + 1
fname_id = fname_ids[im_idx] 


# idxs
dists_im = dists_gan[im_idx]

im = mpimg.imread(oj(DIR_ORIG, f'{fname_id}.jpg'))
im2 = mpimg.imread(oj(DIR_ORIG, f'{fname_id2}.jpg'))

d1 = df[df.id == df.loc[fname_id].id]
d2 = df[df.id == df.loc[fname_id2].id]



util.plot_row([im, im2])
util.plot_row([mpimg.imread(oj(DIR_ORIG, f)) for f in d1.fname_final])
util.plot_row([mpimg.imread(oj(DIR_ORIG, f)) for f in d2.fname_final])

**look at images / reconstructions of the same person**

In [None]:
d = df[df['count_with_this_id'] >= df['count_with_this_id'].max() - 1]
dd = d[d.id == d.iloc[0].id]
util.plot_row([mpimg.imread(oj(DIR_ORIG, f)) for f in dd.fname_final][:5])
util.plot_row([mpimg.imread(oj(DIR_GEN, f'{index}.png')) for index in dd.index][:5])

# evaluate matching with metrics

**how often do we return the same id?**

In [5]:
dists_match_names = ['facial', 'gan', 'gan_constrained']
d = df[df['count_with_this_id'] > 1]
d = d[[k for k in d.keys()
       if not ('md5' in k or 'file' in k or 'idx' in k or 'fname' in k or 'prob' in k)]] # filter some keys we don't really care about

N_IMS = 100
# print(d.shape)
suffs = ['', '_diff']
accs_keys = ['acc_top1', 'acc_top5', 'acc_top10']
numerical_keys = ['yaw', 'pitch', 'roll', 'background_mean', 'background_std', 'quality']
attr_keys = [kk for kk in d.keys() if not 'scores' in kk and not kk in numerical_keys]

In [6]:
r = {
    k: [] for k in accs_keys + attr_keys + numerical_keys \
        + [kk + '_diff' for kk in attr_keys + numerical_keys]
}

for dists_match_name in dists_match_names:
    if dists_match_name == 'facial':
        dists_match = dists_facial
    elif dists_match_name == 'gan':
        dists_match = dists_gan
    elif dists_match_name == 'gan_constrained':
        dists_match = dists_gan + (dists_facial > 0.6) * 1e3 # constraint for missclassificaiton
    lists = {
        k: [] for k in r.keys()
    }
    print('calculating', dists_match_name)
    for im_idx in tqdm(range(N_IMS)):
        orig = d.iloc[im_idx]
        id_orig = orig.id

        # id retention
        dists_im = dists_match[im_idx]
        matched_idxs = np.argsort(dists_im)
        matched_ids = df.iloc[matched_idxs].id.values # note - this needs to be df not d to get the proper indices from dists
        
        # preserving id
        lists['acc_top1'].append(id_orig in matched_ids[:1])
        lists['acc_top5'].append(id_orig in matched_ids[:5])
        lists['acc_top10'].append(id_orig in matched_ids[:10])
        
        # 2 types of matching
        d_full = d.iloc[matched_idxs[:10]]
        matched_diff_idxs = matched_idxs[matched_ids[matched_ids != id_orig]]
        d_diff = d.iloc[matched_diff_idxs[:10]]
        
        for dd, suff in zip([d_full, d_diff], suffs):
            # binary feats
            for k in attr_keys:
                lists[k + suff].append(np.mean(dd[k] == orig[k]))
            
            # numerical feats
            for k in numerical_keys:
                lists[k + suff].append(np.mean(np.square(dd[k] - orig[k])))
            
    for k in lists.keys():
        r[k].append(np.mean(lists[k]))
r = pd.DataFrame.from_dict(r)
r.to_pickle('processed/13_dist_stats.pkl')

100%|██████████| 100/100 [00:08<00:00, 12.27it/s]
100%|██████████| 100/100 [00:07<00:00, 13.24it/s]
100%|██████████| 100/100 [00:08<00:00, 11.61it/s]


# load and view the matches

In [12]:
r = pd.read_pickle('processed/13_dist_stats.pkl').round(3)
for k in r.keys():
    if not k in numerical_keys and not k.replace('_diff', '') in numerical_keys: # or k == 'quality':
        r[k] *= 100
r.index = dists_match_names
for k in r.keys():
    if r.loc['gan', k] > r.loc['facial', k]:
        print(k, f"{r.loc['gan', k] - r.loc['facial', k]:0.1f}")
# r #.round(3).style.background_gradient()

skin-color 1.6
Bangs 7.8
Blond_Hair 3.2
Eyeglasses 1.6
Pale_Skin 1.6
Wavy_Hair 3.4
Wearing_Earrings 0.8
Wearing_Hat 3.3
Wearing_Necklace 0.5
background_std 55.1
quality 0.0
count_with_this_id_diff 1.6
skin-color_diff 3.2
Bald_diff 0.6
Bangs_diff 6.0
Blond_Hair_diff 2.2
Blurry_diff 0.3
Eyeglasses_diff 1.8
Mouth_Slightly_Open_diff 0.9
Pale_Skin_diff 3.2
Wavy_Hair_diff 1.6
Wearing_Hat_diff 1.2
Wearing_Necklace_diff 1.7
background_mean_diff 451.3
background_std_diff 25.1


In [13]:
r[[k for k in r.keys() if '_diff' not in k]]

Unnamed: 0,acc_top1,acc_top5,acc_top10,id,count_with_this_id,gender,hair-length,facial-hair,makeup,skin-color,...,race4_pred,gender_pred,age_pred,img_names_pred,yaw,pitch,roll,background_mean,background_std,quality
facial,36.0,37.0,38.0,19.7,24.9,78.3,83.5,88.1,51.5,90.7,...,84.5,75.4,65.8,0.0,324.044,119.617,10.571,3002.201,583.441,0.009
gan,7.0,7.0,11.0,1.8,6.6,67.7,81.5,80.8,48.8,92.3,...,79.0,67.9,65.6,0.0,206.811,79.961,7.135,2779.667,638.497,0.011
gan_constrained,22.0,31.0,33.0,12.7,17.9,78.3,83.2,88.1,51.3,91.4,...,82.9,74.7,65.9,0.0,246.399,95.566,8.47,2904.885,534.008,0.009


In [14]:
r[[k for k in r.keys() if '_diff' in k]]

Unnamed: 0,id_diff,count_with_this_id_diff,gender_diff,hair-length_diff,facial-hair_diff,makeup_diff,skin-color_diff,age_diff,5_o_Clock_Shadow_diff,Arched_Eyebrows_diff,...,race4_pred_diff,gender_pred_diff,age_pred_diff,img_names_pred_diff,yaw_diff,pitch_diff,roll_diff,background_mean_diff,background_std_diff,quality_diff
facial,0.0,3.9,78.6,82.3,87.8,54.9,90.4,73.9,86.2,51.7,...,76.7,73.5,61.1,0.0,272.214,115.61,10.305,3158.699,623.192,0.013
gan,0.0,5.5,64.5,81.6,77.9,49.4,93.6,72.2,76.9,51.5,...,74.7,63.2,60.6,0.0,262.385,103.177,8.579,3609.999,648.291,0.012
gan_constrained,0.0,4.1,66.8,81.8,78.1,51.7,93.1,73.0,79.1,48.9,...,71.3,62.5,58.2,0.0,274.481,102.511,8.423,3478.368,550.666,0.011


In [15]:
rename = {
    'acc_top1': 'ID (top1)',
    'race_pred': 'Race',
    'background_mean': 'Background Mean',
}
for k in ['gender', 'quality', 'yaw', 'pitch', 'roll']:
    rename[k] = k.capitalize()
r.index = ['Facial-rec dist', 'GAN dist', 'Combined']
id_attributes = ['acc_top1', 'gender', 'race_pred']
id_correlated_attributes = ['Mustache', 'Eyeglasses', 'Bangs', 'Wearing_Hat']
image_attributes = ['Blurry', 'quality', 'background_mean', 'yaw', 'pitch', 'roll']
attrs = id_attributes + id_correlated_attributes + image_attributes
r2 = r[attrs].rename(columns=rename)

In [16]:
# print(r2.to_latex())
r2

Unnamed: 0,ID (top1),Gender,Race,Mustache,Eyeglasses,Bangs,Wearing_Hat,Blurry,Quality,Background Mean,Yaw,Pitch,Roll
Facial-rec dist,36.0,78.3,75.5,92.2,96.6,77.2,93.7,97.8,0.009,3002.201,324.044,119.617,10.571
GAN dist,7.0,67.7,61.4,92.2,98.2,85.0,97.0,97.8,0.011,2779.667,206.811,79.961,7.135
Combined,22.0,78.3,71.7,92.7,97.4,80.2,96.1,97.9,0.009,2904.885,246.399,95.566,8.47


In [17]:
r_diff = r[[a + '_diff' for a in attrs if f'{a}_diff' in r.keys()]]
r_diff

Unnamed: 0,gender_diff,race_pred_diff,Mustache_diff,Eyeglasses_diff,Bangs_diff,Wearing_Hat_diff,Blurry_diff,quality_diff,background_mean_diff,yaw_diff,pitch_diff,roll_diff
Facial-rec dist,78.6,59.8,91.8,95.0,69.5,95.1,97.5,0.013,3158.699,272.214,115.61,10.305
GAN dist,64.5,58.4,89.6,96.8,75.5,96.3,97.8,0.012,3609.999,262.385,103.177,8.579
Combined,66.8,57.9,88.8,95.8,76.3,94.6,97.9,0.011,3478.368,274.481,102.511,8.423
