In [12]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from matplotlib.patches import Rectangle
import cv2
from skimage import color
from skimage.io import imread
import pandas as pd
import os
from tqdm import tqdm_notebook as tqdm
from tqdm.auto import tqdm as tqdm_nn
import random
import requests
from requests.exceptions import ConnectionError
from requests.exceptions import ReadTimeout

In [2]:
def clip_image(url, bbox):
    image = imread(url)
    
    x1 = int(bbox[0])
    y1 = int(bbox[1])
    x2 = int(bbox[2])
    y2 = int(bbox[3])
    
    w=x2-x1
    h=y2-y1
    
    crop_img = image[y1:y1+h, x1:x1+w]
    new_size = 224,224
    crop_img = cv2.resize(crop_img, new_size, interpolation=cv2.INTER_CUBIC)
    plt.figure()
    plt.imshow(crop_img)

In [3]:
def bbox_on_img(url, bbox):
    image = imread(url)
    plt.figure()
    plt.imshow(image)
    x1 = bbox[0]
    y1 = bbox[1]
    x2 = bbox[2]
    y2 = bbox[3]
    
    w=x2-x1
    h=y2-y1
    rect = Rectangle((x1, y1), w, h, fill=False, color='r')
    plt.axes().add_patch(rect)
    plt.show()   

In [4]:
def process_celeb(name, path):
    # format is 
    # Each line represents:
    # id url left top right bottom pose detection_score curation
    # id: Integer id for an image 
    # url: The weblink for the image. 
    # [left top right bottom] the bounding box for an image. 
    # pose: frontal/profile (pose>2 signifies a frontal face while
    # pose<=2 represents left and right profile detection).
    # detection score: Score of a DPM detector.
    # curation: Whether this image was a part of final curated dataset
    
    df = pd.read_csv(path, sep=' ', names=['ID', 'URL', 'LEFT', 'TOP', 'RIGHT', 'BOTTOM', 'POSE', 'DETECTION', 'CURATION'])
    celeb_list = [name] * len(df)
    df['CELEB'] = celeb_list
    return df
        

In [5]:
def get_celeb_box(name, img_num):
    the_box_row = df.loc[df['CELEB']==name]
    the_box_row = the_box_row.loc[img_num]
    return the_box_row[['LEFT', 'TOP', 'RIGHT', 'BOTTOM']].values

In [6]:
def get_celeb_url(name, img_num):
    the_url_row = df.loc[df['CELEB']==name].loc[img_num]['URL']
    return the_url_row

In [22]:
def url_ok(url):
    try:
        r = requests.head(url, timeout=timeout)
        return r.status_code == 200
    except (ConnectionError, ReadTimeout)  as e:
        #print("URL connection error", url)
        return False

In [8]:
path = 'vgg_face_dataset/files'
for (dirpath, dirnames, filenames) in os.walk(path):
    df = None
    for filename in tqdm(filenames):
        if filename.endswith('.txt'): 
            celeb_name = os.path.splitext(filename)[0]
            full_path = os.sep.join([dirpath, filename])
            #print("Processing: ["+celeb_name+"]: full path: ["+full_path+"]")
            if df is not None:
                df = df.append(process_celeb(celeb_name,  full_path))
            else:
                df = process_celeb(celeb_name, full_path)


HBox(children=(IntProgress(value=0, max=2623), HTML(value='')))




In [24]:
df.to_csv('vgg_face_full.csv')

In [9]:
print('Num Celeb Images', str(len(df)))

Num Celeb Images 2604849


In [23]:
tqdm_nn.pandas()
df['VALID_URL'] = df['URL'].progress_apply(url_ok)
len(df[df.VALID_URL==True])

HBox(children=(IntProgress(value=0, max=2604849), HTML(value='')))

RecursionError: maximum recursion depth exceeded in comparison

In [None]:
celeb_names = df.CELEB.unique()
print(celeb_names)

In [None]:
rand_celeb = random.randint(0, len(celeb_names))
rand_pic_num = random.randint(0, 1000) # 1000 pics
celeb_name = celeb_names[rand_celeb]
c_num = rand_pic_num

c_box =  get_celeb_box(celeb_name, c_num)
print(c_box)
c_url= get_celeb_url(celeb_name, c_num)
print(c_url)
print('Random Celeb', celeb_name)
print('Random Pic Num', str(c_num))
bbox_on_img(c_url, c_box)
clip_image(c_url, c_box)