In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# This file contains all the main external libs we'll use
from fastai.imports import *

In [3]:
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

import urllib
from PIL import Image

In [4]:
PATH = 'data/tawkify/'
image_folder = f'{PATH}train/'
data_csv = PATH+'tw_dem_images.csv'
image_csv = PATH+'image_csv.csv'

### Load Data

In [25]:
# df = pd.read_csv(PATH+'Photo_Income_Age_2.8.18.csv')
df = pd.read_csv(data_csv, dtype={'local_photo': 'object', 'image_type': 'object'})

In [26]:
uid = 55801
row = df.loc[df.userID == uid].iloc[0]

In [27]:
row

Unnamed: 0                                                       93747
userID                                                           55801
gender                                                            male
attractedToGender                                               female
fromState                                                           NY
metro_name                                                  NYC Queens
User_Age                                                            44
User_Photo           https://s3-us-west-1.amazonaws.com/tawkifyfile...
User_Income                                                     150000
local_photo                           data/tawkify/corrupted/55801.jpg
image_type                                                   corrupted
Name: 93747, dtype: object

In [18]:
df.loc[row.name]
df.at[row.name, 'image_type'] = 'corrupted'

In [15]:

#     img = img_df.at[idx, 'local_photo']
#     if img == 'corrupted':
#         return
#     print(img)
#     if len(plt.imread(img).shape) >= 3:
#         print('Not bad file:', img)
#         return

corrupted_ids = [55801]
for uid in corrupted_ids:
    row = df.loc[df.userID == uid].iloc[0]
    
    file = row.local_photo
    shape = plt.imread(file).shape
#     if len(shape) == 3:
#         continue
    print(file)
    corrupted_file = file.replace('train', 'corrupted')
    df.at[row.name, 'local_photo'] = corrupted_file
    df.at[row.name, 'image_type'] = 'corrupted'
    try:
        os.rename(file, corrupted_file)
    except Exception as e:
        pass

data/tawkify/train/55801.jpg


### Explore Data

In [None]:
df.columns

In [None]:
df['userID'].size

In [19]:
df.head()

Unnamed: 0.1,Unnamed: 0,userID,gender,attractedToGender,fromState,metro_name,User_Age,User_Photo,User_Income,local_photo,image_type
0,0,996294,female,male,CA,SF Peninsula,42.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,80000.0,data/tawkify/train/996294.jpg,corrupted
1,1,995253,female,male,CA,SF South Bay,45.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,0.0,data/tawkify/train/995253.jpg,corrupted
2,2,996305,male,female,CA,SF Peninsula,28.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,60000.0,data/tawkify/train/996305.jpg,corrupted
3,3,994861,female,male,CA,SF Peninsula,46.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,150000.0,data/tawkify/train/994861.jpg,corrupted
4,4,996334,male,female,CA,SF City,53.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,500000.0,data/tawkify/train/996334.jpg,corrupted


In [None]:
df.loc[df.userID == 922512].image_type.isna()

### Download images

In [None]:
from concurrent.futures import ThreadPoolExecutor, wait, as_completed
from time import sleep

a = df.loc[(df.local_photo == '') | df.local_photo.isna()]
print(df.shape)
print(a.shape)

# TEST
a = a[:10000]
chunk_size = 1000
futures = []
# b = a.copy()

In [None]:
def download_images(df):
    downloaded_images = {}
    failed_images = []
#     sleep(10)
#     return {}, []
    for index, row in df.iterrows():
        if df.index.name == 'userID':
            user_id = index
        else:
            user_id = row['userID']
        image_url = row['User_Photo']
        image_path = f'{image_folder}{user_id}.jpg'
        if os.path.exists(image_path):
            downloaded_images[user_id] = image_path
            continue
        try:
            urllib.request.urlretrieve(image_url, image_path)
            downloaded_images[user_id] = image_path
#                 df['local_photo'][index] = image_path
        except Exception as e:
#             print('Could not download image for user:', user_id)
            failed_images.append(user_id)
            continue
    return downloaded_images, failed_images


In [None]:
with ThreadPoolExecutor(5) as executor:
    for chunk in range(chunk_size, a.shape[0], chunk_size):
        idx_end = max(a.shape[0], chunk+chunk_size)
        futures.append(executor.submit(download_images, a[chunk:idx_end]))

    kwargs = {
        'total': len(futures),
        'unit': 'nap',
        'unit_scale': True,
        'leave': True,
        'disable': False
    }
    for x in tqdm(as_completed(futures), **kwargs):
        dl_imgs, failed = x.result()
        update_df = pd.DataFrame({'userID': list(dl_imgs.keys()), 'local_photo': list(dl_imgs.values())})
        if update_df.index.name != 'userID':
            update_df.set_index('userID', inplace=True)
        if df.index.name != 'userID':
            df.set_index('userID', inplace=True)
        df.update(update_df)
        
        failed_df = pd.DataFrame({'userID': list(failed), 'local_photo': 'corrupted', 'image_type': 'corrupted'})
        if failed_df.index.name != 'userID':
            failed_df.set_index('userID', inplace=True)
        df.update(failed_df)
    df.reset_index() # do we need this?

In [20]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [21]:
df.to_csv(PATH+'tw_dem_images.csv')

### Create Image Frame with only JPEGS

In [22]:
img_df = df.loc[(df.local_photo != '') 
                & (df.local_photo != 'corrupted') 
                & (df.image_type == 'jpeg') 
                & ~df.local_photo.isna()
               ]


In [29]:
img_df = pd.read_csv(image_csv)

In [None]:
img_df.shape

In [24]:
img_df.to_csv(image_csv, index=False)

In [None]:
img_df.local_photo.iloc[np.random.randint(10)]

In [None]:
plt.imshow(plt.imread(img_df.local_photo.iloc[np.random.randint(10)]))

### Update dataframe with image size

Unnamed: 0.1,Unnamed: 0,userID,gender,attractedToGender,fromState,metro_name,User_Age,User_Photo,User_Income,local_photo,image_type
0,0,996294,female,male,CA,SF Peninsula,42.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,80000.0,data/tawkify/train/996294.jpg,corrupted
1,1,995253,female,male,CA,SF South Bay,45.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,0.0,data/tawkify/train/995253.jpg,corrupted
2,2,996305,male,female,CA,SF Peninsula,28.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,60000.0,data/tawkify/train/996305.jpg,corrupted
3,3,994861,female,male,CA,SF Peninsula,46.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,150000.0,data/tawkify/train/994861.jpg,corrupted
4,4,996334,male,female,CA,SF City,53.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,500000.0,data/tawkify/train/996334.jpg,corrupted


In [10]:
df['img_w'] = np.nan
df['img_h'] = np.nan
df['img_c'] = np.nan

In [15]:
from concurrent.futures import ThreadPoolExecutor, wait, as_completed
from time import sleep

a = df.loc[(df.local_photo != '') 
         & (df.local_photo != 'corrupted') 
         & (df.image_type == 'jpeg') 
         & ~df.local_photo.isna()
         & df.img_w.isna()
            ]

print(df.shape)
print(a.shape)

# TEST
a = a[:10000]
chunk_size = 50
# b = a.copy()

(102865, 12)
(89193, 12)


In [18]:
def get_size(df):
    downloaded_images = {}
    failed_images = []
#     sleep(10)
#     return {}, []
    for index, row in df.iterrows():
        if df.index.name == 'userID':
            user_id = index
        else:
            user_id = row['userID']
        image_path = f'{image_folder}{user_id}.jpg'
        img_size = plt.imread(image_path).shape
        if len(img_size) < 3:
            failed_images.append(user_id)
            continue
        downloaded_images[user_id] = img_size
    return downloaded_images, failed_images


In [19]:
futures = []
with ThreadPoolExecutor(8) as executor:
    for chunk in range(chunk_size, a.shape[0], chunk_size):
        idx_end = max(a.shape[0], chunk+chunk_size)
        futures.append(executor.submit(get_size, a[chunk:idx_end].copy()))

    kwargs = {
        'total': len(futures),
        'unit': 'nap',
        'unit_scale': True,
        'leave': True,
        'disable': False
    }
    for x in tqdm(as_completed(futures), **kwargs):
        dl_imgs, failed = x.result()
        img_sizes = list(dl_imgs.values())
        img_w = [x[1] for x in img_sizes]
        img_h = [x[0] for x in img_sizes]
        img_c = [x[2] for x in img_sizes]
        df_data = {'userID': list(dl_imgs.keys()), 
                   'img_c': img_c,
                   'img_w': img_w,
                   'img_h': img_h}
        update_df = pd.DataFrame(df_data)
        if update_df.index.name != 'userID':
            update_df.set_index('userID', inplace=True)
        if df.index.name != 'userID':
            df.set_index('userID', inplace=True)
        df.update(update_df, overwrite=True)
        
#         failed_df = pd.DataFrame({'userID': list(failed), 'local_photo': 'corrupted', 'image_type': 'corrupted'})
#         if failed_df.index.name != 'userID':
#             failed_df.set_index('userID', inplace=True)
#         df.update(failed_df)
    df.reset_index() # do we need this?


  0%|          | 0.00/199 [00:00<?, ?nap/s]

KeyboardInterrupt: 

In [None]:
df.head()

In [None]:
def remove_image(idx):
    img = img_df.at[idx, 'local_photo']
    if img == 'corrupted':
        return
    print(img)
    if len(plt.imread(img).shape) >= 3:
        print('Not bad file:', img)
        return
    img_df.at[idx, 'local_photo'] = 'corrupted'
    img_df.at[idx, 'image_type'] = 'corrupted'
    try:
        os.rename(file, file.replace('train', 'corrupted'))
    except Exception as e:
        pass

In [None]:
plt.imread('data/tawkify/train/52426.jpg').shape

In [None]:
plt.hist(w)

In [None]:
w = np.array(w)
plt.hist(w[w<200])

### Check for bad images through image header

In [None]:
# filter_images = df.loc[(df.local_photo == 'data/tawkify/train/922512.jpg')]

In [None]:
filter_images = df.loc[(df.local_photo != '') 
                       & ~df.local_photo.isna() 
                       & (df.image_type.isna() | (df.image_type == ''))]
filter_images.reset_index(inplace=True)

In [None]:
import imghdr
corrupted = []
for idx, row in tqdm(filter_images.iterrows(), total=filter_images.shape[0]):
    file = row.local_photo
    image_type = None
    try:
        image_type = imghdr.what(file)
    except Exception as e:
        image_type = 'corrupted'
    if image_type is None:
        image_type = 'corrupted'
    df.at[idx, 'image_type'] = image_type
    
    if image_type is not 'jpeg':
        try:
            os.rename(file, file.replace('train', 'corrupted'))
            corrupted.append(file)
        except Exception as e:
            # file already removed
            pass
    if idx % 10000 == 0:
        print(f'Found {len(corrupted)} files')

In [None]:
df.loc[df.userID == 922512]

In [None]:
filter_images.head()

In [None]:
file = 'data/tawkify/train/922512.jpg'
print(imghdr.what(file))

## Removing images by warnings

In [None]:

import warnings
warnings.filterwarnings("error")
filter_images = df.loc[(df.image_type == 'jpeg')]
# filter_images.reset_index(inplace=True)

import imghdr
corrupted = []
for idx, row in tqdm(filter_images.iterrows(), total=filter_images.shape[0]):
    file = row.local_photo
    image_type = None
    try:
        a = Image.open(file)
    except (UserWarning, ResourceWarning):
        corrupted.append((idx, file))

In [None]:
corrupted

In [None]:
df.iloc[3291]

### Remove corrupted images

In [None]:
def remove_image(idx, file):
    assert(df.at[idx, 'local_photo'] == file)
    df.at[idx, 'local_photo'] = 'corrupted'
    df.at[idx, 'image_type'] = 'corrupted'
    os.rename(file, file.replace('train', 'corrupted'))

In [None]:
df.at[corrupted[0][0], 'local_photo']

In [None]:

for idx, file in corrupted[1:]:
    remove_image(idx, file)

In [None]:
Image.open('data/tawkify/train/989651.jpg')