In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# This file contains all the main external libs we'll use
from fastai.imports import *

In [3]:
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

import urllib
from PIL import Image

In [4]:
PATH = 'data/tawkify/'
image_folder = f'{PATH}train/'
data_csv = PATH+'tw_dem_images.csv'
image_csv = PATH+'image_csv.csv'

In [5]:
sz=220
# sz=400

### Load Data

In [None]:
# df = pd.read_csv(PATH+'Photo_Income_Age_2.8.18.csv')

In [6]:
df = pd.read_csv(data_csv, dtype={'local_photo': 'object', 'image_type': 'object'})

In [None]:
df.columns

In [None]:
df['userID'].size

In [None]:
# df.drop('Unnamed: 0.1', axis=1, inplace=True)

In [None]:
df.head()

In [None]:
df.loc[df.userID == 922512].image_type.isna()

### In case we need to delete bad images from the dataframe

In [None]:
df.loc[df.local_photo == 'data/tawkify/train/921661.jpg']

In [None]:
df.at[42785, 'local_photo'] = 'corrupted'
df.at[42785, 'image_type'] = 'corrupted'

### Check for bad images instead

In [None]:
df.loc[df.userID == 937505]

In [None]:
# If you want to replace nan files in local_photo
# df.local_photo.replace(np.nan, '', inplace=True)

In [None]:
# img_df = df.loc[(df.local_photo == 'data/tawkify/train/922512.jpg')]

In [None]:
img_df = df.loc[(df.local_photo != '') & ~df.local_photo.isna()]

In [None]:
import imghdr
for idx, row in img_df.iterrows():
    file = row.local_photo
    image_type = None
    try:
        image_type = imghdr.what(file)
    except Exception as e:
        image_type = 'corrupted'
    if image_type is None:
        image_type = 'corrupted'
    df.at[idx, 'image_type'] = image_type
    
    if image_type is not 'jpeg':
        try:
            os.rename(file, file.replace('train', 'corrupted'))
        except Exception as e:
            # file already removed
            pass

In [None]:
df.loc[df.userID == 922512]

In [None]:
img_df.head()

In [None]:
file = 'data/tawkify/train/922512.jpg'
print(imghdr.what(file))

### Download images

In [None]:
df.userID[0]

In [None]:
df.shape

In [None]:
# OLD SYNCHRONOUS WAY
# for index, row in tqdm(df.iterrows(), total=df.shape[0]):
#     user_id = row['userID']
#     image_url = row['User_Photo']
#     image_path = f'{image_folder}{user_id}.jpg'
#     if not os.path.exists(image_path):
#         try:
#             urllib.request.urlretrieve(image_url, image_path)
#             df['local_photo'][index] = image_path
#         except Exception as e:
# #             print('Could not download image for user:', user_id)
#             pass
#     else:
#         df['local_photo'][index] = image_path

In [10]:
df.head()

Unnamed: 0_level_0,gender,attractedToGender,fromState,metro_name,User_Age,User_Photo,User_Income,local_photo,image_type
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
996294,female,male,CA,SF Peninsula,42.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,80000.0,data/tawkify/train/996294.jpg,corrupted
995253,female,male,CA,SF South Bay,45.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,0.0,data/tawkify/train/995253.jpg,corrupted
996305,male,female,CA,SF Peninsula,28.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,60000.0,data/tawkify/train/996305.jpg,corrupted
994861,female,male,CA,SF Peninsula,46.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,150000.0,data/tawkify/train/994861.jpg,corrupted
996334,male,female,CA,SF City,53.0,https://s3-us-west-1.amazonaws.com/tawkifyfile...,500000.0,data/tawkify/train/996334.jpg,corrupted


In [None]:
# df.drop('Unnamed: 0', 1, inplace=True)

In [22]:
df.to_csv(PATH+'tw_dem_images.csv')

In [18]:
from concurrent.futures import ThreadPoolExecutor, wait, as_completed
from time import sleep

a = df.loc[(df.local_photo == '') | df.local_photo.isna()]

# TEST
a = a[:10000]
chunk_size = 1000
futures = []
# b = a.copy()

In [19]:
def download_images(df):
    downloaded_images = {}
    failed_images = []
#     sleep(10)
#     return {}, []
    for index, row in df.iterrows():
        if df.index.name == 'userID':
            user_id = index
        else:
            user_id = row['userID']
        image_url = row['User_Photo']
        image_path = f'{image_folder}{user_id}.jpg'
        if os.path.exists(image_path):
            downloaded_images[user_id] = image_path
            continue
        try:
            urllib.request.urlretrieve(image_url, image_path)
            downloaded_images[user_id] = image_path
#                 df['local_photo'][index] = image_path
        except Exception as e:
#             print('Could not download image for user:', user_id)
            failed_images.append(user_id)
            continue
    return downloaded_images, failed_images
    
# urllib.request.urlretrieve('https://www.blog.google/static/blog/images/google-200x200.7714256da16f.png', f'{PATH}00000001.jpg')

In [20]:
with ThreadPoolExecutor(5) as executor:
    for chunk in range(chunk_size, a.shape[0], chunk_size):
        idx_end = max(a.shape[0], chunk+chunk_size)
        futures.append(executor.submit(download_images, a[chunk:idx_end]))

    kwargs = {
        'total': len(futures),
        'unit': 'nap',
        'unit_scale': True,
        'leave': True,
        'disable': False
    }
    for x in tqdm(as_completed(futures), **kwargs):
        dl_imgs, failed = x.result()
        update_df = pd.DataFrame({'userID': list(dl_imgs.keys()), 'local_photo': list(dl_imgs.values())})
        if update_df.index.name != 'userID':
            update_df.set_index('userID', inplace=True)
        if df.index.name != 'userID':
            df.set_index('userID', inplace=True)
        df.update(update_df)
        
        failed_df = pd.DataFrame({'userID': list(failed), 'local_photo': 'corrupted', 'image_type': 'corrupted'})
        if failed_df.index.name != 'userID':
            failed_df.set_index('userID', inplace=True)
        df.update(failed_df)
    df.reset_index() # do we need this?

100%|██████████| 9.00/9.00 [07:59<00:00, 53.3s/nap] 


In [None]:
# test
def wait_a(x):
    sleep(2)
    return x

futures = []
with ThreadPoolExecutor(5) as executor:
    
    for chunk in range(10):
        futures.append(executor.submit(wait_a, chunk))

    kwargs = {
        'total': len(futures),
        'unit': 'nap',
        'unit_scale': True,
        'leave': True,
        'disable': False
    }
    for x in tqdm(as_completed(futures), **kwargs):
        print('Finished:', x.result())

In [None]:
b.head()

In [None]:
a[chunk:idx_end]

### Read images

In [None]:
img_df = df.loc[(df.local_photo != '') 
                & (df.local_photo != 'corrupted') 
                & (df.image_type == 'jpeg') 
                & ~df.local_photo.isna()
               ]


In [None]:
img_df = pd.read_csv(image_csv)

In [None]:
img_df.to_csv(image_csv)

In [None]:
img_df.local_photo.iloc[np.random.randint(10)]

In [None]:
plt.imshow(plt.imread(img_df.local_photo.iloc[np.random.randint(10)]))

In [None]:
size_map = { idx:plt.imread(k.local_photo).shape for idx,k in img_df.iterrows()}

In [None]:
size_map

In [None]:
h, w, ch = list(zip(*size_map.values()))

In [None]:
plt.hist(w)

In [None]:
w = np.array(w)
plt.hist(w[w<600])

### Create training csv

In [23]:
model_csv = f'{PATH}current_model_age.csv'
# model_csv = f'{PATH}sample_model.csv'

In [None]:
img_df.head()

In [None]:
temp_df = img_df.loc[~img_df.User_Age.isna() & (img_df.User_Age < 70) & (img_df.User_Age > 20)]
temp_df = pd.DataFrame(data={'id': temp_df.userID, 'age': temp_df.User_Age}, columns=['id', 'age'])
temp_df.to_csv(model_csv, index=False)

In [None]:
temp_df.sort_values('age')

### Architecture

In [None]:
# temp_df.sort_values('id').head()

In [None]:
# !rm -rf {PATH}tmp

In [31]:
temp_df = pd.read_csv(model_csv)

In [None]:
temp_df.age.mean()

In [25]:
arch=resnet50

In [26]:
arch()

ResNet(
  (conv1): Conv2d (3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
  (relu): ReLU(inplace)
  (maxpool): MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), dilation=(1, 1))
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d (64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      (conv2): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      (conv3): Conv2d (64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
      (relu): ReLU(inplace)
      (downsample): Sequential(
        (0): Conv2d (64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
      )
    )
    (1):

In [27]:
??tfms_from_model

In [28]:
??ConvLearner

In [32]:
tfms = tfms_from_model(arch, sz, aug_tfms=transforms_side_on, max_zoom=1.1)
n = temp_df.shape[0]-1
val_idxs = get_cv_idxs(n)
# val_idxs = list(temp_df.iloc[val_idxs].id)
data = ImageClassifierData.from_csv(path=PATH, folder='train', csv_fname=model_csv, tfms=tfms, suffix='.jpg', val_idxs=val_idxs, continuous=True)
# data = ImageClassifierData.from_csv(path=PATH, folder='train', csv_fname=image_csv, test_name='test', tfms=tfms, suffix='.jpg', val_idxs=val_idxs)
# new_data = data.resize(sz, 'tmp')

In [None]:
# fnames,y,classes = csv_source('train', model_csv, True, '.jpg', continuous=False)

In [None]:
data.trn_ds[0][0].shape

In [33]:
learn = ConvLearner.pretrained(arch, data=data, precompute=True)

 73%|███████▎  | 326/448 [01:16<00:28,  4.24it/s]

KeyboardInterrupt: 

In [None]:
learn.lr_find()

In [None]:
learn.sched.plot()

In [None]:
learn.fit(lrs=1e-2, n_cycle=2, )

In [None]:
learn.precompute=False

In [None]:
learn.fit(lrs=1e-2, n_cycle=3)

In [None]:
learn.unfreeze()

In [None]:
learn.lr_find()

In [None]:
learn.sched.plot()

In [None]:
learn.fit(lrs=[1e-4,1e-3,1e-2], n_cycle=3, cycle_len=1)

In [None]:
learn.save('tw_age_unfreeze1')

In [None]:
learn.load('tw_age_unfreeze1')

In [None]:
learn.fit(lrs=[1e-4,1e-3,1e-2], n_cycle=3, cycle_mult=2)

In [None]:
log_preds, y = learn.TTA()
probs = np.exp(log_preds)
accuracy(log_preds,y), metrics.log_loss(y, probs)

In [None]:
preds = np.argmax(log_preds, axis=1)
# np.array(preds==y).mean()
preds
# accuracy(log_preds,y)

### Test image

In [None]:
test_image_url = 'https://images-na.ssl-images-amazon.com/images/M/MV5BMTk4MDM0MDUzM15BMl5BanBnXkFtZTcwOTI4MzU1Mw@@._V1_UY317_CR7,0,214,317_AL_.jpg'

In [None]:
# test_image_file = f'{PATH}test_images/test1.jpg'
# test_image_file = f'{PATH}test_images/kenneth_s.jpg'
# test_image_file = f'{PATH}test_images/age_img/IMG_2776.jpg'
# test_image_file = f'{PATH}test_images/age_img/IMG_8274.jpg'
# test_image_file = f'{PATH}test_images/age_img/IMG_3135.jpg'
# test_image_file = f'{PATH}test_images/age_img/IMG_8819.jpg'
# test_image_file = f'{PATH}test_images/age_img/IMG_9370.jpg'
# test_image_file = f'{PATH}test_images/age_img/IMG_2793.jpg'
# test_image_file = f'{PATH}test_images/grandma_test1.jpg'
# test_image_file = f'{PATH}test_images/lzl_test1.jpg'
# test_image_file = f'{PATH}test_images/age_img/IMG_9625.jpg'

test_image_file = f'{PATH}test_images/jackie_test1.jpg'

In [None]:
urllib.request.urlretrieve(test_image_url, test_image_file)

In [None]:
plt.imshow(plt.imread(test_image_file))

In [None]:
val_tfms = tfms[1]

In [None]:
# test_image = val_tfms(plt.imread(test_image_file))
test_image = val_tfms(open_image(test_image_file))


In [None]:
test_image.shape

In [None]:
# learn.TTA()

In [None]:
age = learn.predict_array([test_image])[0]

In [None]:
print(f'Age:', age[0])

In [None]:
data.classes