In [None]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# This file contains all the main external libs we'll use
from fastai.imports import *

In [None]:
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

import urllib

In [None]:
# Root folder of the dataset; every other location below is built from it.
PATH = 'data/tawkify/'
# Directory prefix under which the per-user .jpg files are stored.
image_folder = PATH + 'train/'
# CSV that is loaded into `df`.
data_csv = f'{PATH}tw_dem_images.csv'
# CSV that `img_df` is read from and written to.
image_csv = f'{PATH}image_csv.csv'

In [None]:
# Image size handed to tfms_from_model() and data.resize() in the Architecture section.
sz=220

### Load Data

In [None]:
# Earlier input file, kept for reference only (not loaded).
# df = pd.read_csv(PATH+'Photo_Income_Age_2.8.18.csv')

In [None]:
# Load the main table from data_csv; `local_photo` is forced to object dtype.
df = pd.read_csv(data_csv, dtype={'local_photo': 'object'})

In [None]:
# Column names of the loaded table.
df.columns

In [None]:
# Number of entries in the userID column.
df['userID'].size

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Display one image from the train folder (hard-coded file name).
plt.imshow(plt.imread(f'{PATH}train/933e245251bc667ce6ff74f12dc998fe.jpg'))

### Download images

In [None]:
# Look at a single userID value (index label 0).
df.userID[0]

In [None]:
# (rows, columns) of the table.
df.shape

In [None]:
# OLD SYNCHRONOUS WAY
# (disabled; the threaded download_images() defined further down replaces it)
# for index, row in tqdm(df.iterrows(), total=df.shape[0]):
#     user_id = row['userID']
#     image_url = row['User_Photo']
#     image_path = f'{image_folder}{user_id}.jpg'
#     if not os.path.exists(image_path):
#         try:
#             urllib.request.urlretrieve(image_url, image_path)
#             df['local_photo'][index] = image_path
#         except Exception as e:
# #             print('Could not download image for user:', user_id)
#             pass
#     else:
#         df['local_photo'][index] = image_path

In [None]:
# Preview the first rows again.
df.head()

In [None]:
# Persist the table back to the CSV it was loaded from (data_csv).
# Only a *named* index (e.g. userID after set_index) is written; the default
# unnamed index is skipped, otherwise every save/load round trip adds another
# "Unnamed: 0" column to the file.
df.to_csv(data_csv, index=df.index.name is not None)

In [None]:
plt.imshow(plt.imread(df.local_photo[np.random.randint(10)]))

In [None]:
a = df.loc[df.local_photo == '']

In [None]:
a.shape

In [None]:
from concurrent.futures import ThreadPoolExecutor, wait, as_completed
from time import sleep

# Rows that still have no local photo. Empty CSV fields are read back as NaN,
# so treat NaN and '' alike instead of comparing against '' only.
a = df.loc[df.local_photo.fillna('') == '']

# TEST: cap the amount of work while experimenting.
a = a[:10000]
# Number of rows handed to each download task.
chunk_size = 1000
# Futures of the submitted download tasks.
futures = []
# b = a.copy()

In [None]:
def download_images(df, folder=None):
    """Download the photo of every row in ``df`` that is not on disk yet.

    Each row's ``User_Photo`` URL is saved as ``<folder><userID>.jpg``. The user
    id is taken from the index when the index is named ``userID``, otherwise
    from the ``userID`` column. Files that already exist are not fetched again.

    Args:
        df: DataFrame with a ``User_Photo`` column and the user id either as
            an index named ``userID`` or as a ``userID`` column.
        folder: Target directory prefix, including the trailing separator.
            Defaults to the notebook-level ``image_folder``.

    Returns:
        ``(downloaded_images, failed_images)``: a dict mapping user id to the
        local image path (already present or newly downloaded), and a list of
        the user ids whose download raised an exception.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import os
    import urllib.request

    target_dir = image_folder if folder is None else folder
    # Loop-invariant: decide once where the user id lives.
    id_in_index = df.index.name == 'userID'

    downloaded_images = {}
    failed_images = []
    for index, row in df.iterrows():
        user_id = index if id_in_index else row['userID']
        image_url = row['User_Photo']
        image_path = f'{target_dir}{user_id}.jpg'
        if os.path.exists(image_path):
            downloaded_images[user_id] = image_path
            continue
        try:
            urllib.request.urlretrieve(image_url, image_path)
        except Exception:
            # Best effort: record the failure and carry on with the next row.
            failed_images.append(user_id)
            # A transfer that died half-way can leave a truncated file behind;
            # remove it so a later run does not mistake it for a finished download.
            if os.path.exists(image_path):
                try:
                    os.remove(image_path)
                except OSError:
                    pass
            continue
        downloaded_images[user_id] = image_path
    return downloaded_images, failed_images
    
# urllib.request.urlretrieve('https://www.blog.google/static/blog/images/google-200x200.7714256da16f.png', f'{PATH}00000001.jpg')

In [None]:
# Start from an empty list so that re-running this cell does not pick up
# futures left over from a previous run.
futures = []
with ThreadPoolExecutor(5) as executor:
    # Walk `a` in consecutive, non-overlapping slices of `chunk_size` rows.
    # The range starts at 0 so the first chunk is included, and the slice end
    # is clamped with min() so each task gets exactly one chunk (max() made
    # every slice run to the end of `a`, downloading the same rows repeatedly).
    for chunk in range(0, a.shape[0], chunk_size):
        idx_end = min(a.shape[0], chunk+chunk_size)
        futures.append(executor.submit(download_images, a[chunk:idx_end]))

    # Progress-bar options: one tick per finished chunk.
    kwargs = {
        'total': len(futures),
        'unit': 'nap',
        'unit_scale': True,
        'leave': True,
        'disable': False
    }
    for x in tqdm(as_completed(futures), **kwargs):
        dl_imgs, failed = x.result()
    #     print(f'Downloaded: {dl_imgs}, Failed: {failed}')
        # Turn the {userID: path} mapping into a frame indexed by userID so
        # that df.update() can align it with `df` on the same key.
        new_df = pd.DataFrame({'userID': list(dl_imgs.keys()), 'local_photo': list(dl_imgs.values())})
        new_df.set_index('userID', inplace=True)
        if df.index.name != 'userID':
            df.set_index('userID', inplace=True)
        df.update(new_df)
#     df.reset_index() # do we need this?

In [None]:
# test
def wait_a(x):
    """Sleep for two seconds, then hand back ``x`` unchanged."""
    sleep(2)
    return x

# Smoke test for the executor + progress-bar pattern: ten short tasks, five workers.
futures = []
with ThreadPoolExecutor(5) as executor:

    # Kept as a plain loop so `chunk` stays defined after the cell has run.
    for chunk in range(10):
        futures += [executor.submit(wait_a, chunk)]

    kwargs = dict(
        total=len(futures),
        unit='nap',
        unit_scale=True,
        leave=True,
        disable=False,
    )
    # Results are reported in completion order, not submission order.
    for x in tqdm(as_completed(futures), **kwargs):
        print('Finished:', x.result())

In [None]:
b.head()

In [None]:
a[chunk:idx_end]

### Read images

In [None]:
# Keep only the rows that have a local photo path. A missing path may be ''
# or NaN (read_csv turns empty fields into NaN, and NaN != '' is True), so
# normalise NaN to '' before comparing.
img_df = df.loc[df.local_photo.fillna('') != '']

In [None]:
# Load the image table from image_csv; this rebinds img_df.
img_df = pd.read_csv(image_csv)

In [None]:
# Save the image table to image_csv. The index is written only when it is a
# named one (e.g. userID); writing the default unnamed index would add another
# "Unnamed: 0" column on every save/load round trip.
img_df.to_csv(image_csv, index=img_df.index.name is not None)

In [None]:
# Path of a randomly chosen photo among the first ten rows (positional lookup).
img_df.local_photo.iloc[np.random.randint(10)]

In [None]:
# Display a randomly chosen photo among the first ten rows.
plt.imshow(plt.imread(img_df.local_photo.iloc[np.random.randint(10)]))

In [None]:
# Map every row label of img_df to the array shape of its image on disk.
size_map = {
    label: plt.imread(photo_path).shape
    for label, photo_path in zip(img_df.index, img_df.local_photo)
}

In [None]:
size_map

In [None]:
h, w, ch = list(zip(*size_map.values()))

In [None]:
plt.hist(w)

In [None]:
w = np.array(w)
plt.hist(w[w<600])

### Architecture

In [None]:
# !rm -rf {PATH}tmp

In [None]:
# CSV holding the id/gender table used for modelling.
model_csv = f'{PATH}current_model.csv'

In [None]:
# NOTE(review): model_csv is read here but only written in a later cell, so on
# a fresh run the file presumably has to exist already — confirm.
temp_df = pd.read_csv(model_csv)

In [None]:
# CSV holding a small sample of the modelling table.
sample_csv = f'{PATH}sample_model.csv'

In [None]:
# Write the first 100 rows (without the index) as the sample file.
temp_df[:100].to_csv(sample_csv, index=False)

In [None]:
# Build the two-column label table (id, gender) from img_df and store it,
# without the index, as the modelling CSV.
label_columns = ['id', 'gender']
label_data = {'id': img_df.userID, 'gender': img_df.gender}
temp_df = pd.DataFrame(label_data, columns=label_columns)
temp_df.to_csv(model_csv, index=False)

In [None]:
arch=resnet50

In [None]:
tfms = tfms_from_model(arch, sz, aug_tfms=transforms_side_on, max_zoom=1.1)
n = temp_df[:100].shape[0]-1
val_idxs = get_cv_idxs(n)
data = ImageClassifierData.from_csv(path=PATH, folder='train', csv_fname=sample_csv, tfms=tfms, suffix='.jpg', val_idxs=val_idxs)
# data = ImageClassifierData.from_csv(path=PATH, folder='train', csv_fname=image_csv, test_name='test', tfms=tfms, suffix='.jpg', val_idxs=val_idxs)
new_data = data.resize(sz, 'tmp')

In [None]:
data.trn_ds[0][0].shape

In [None]:
# Build a learner for `arch` on the resized data, with precompute switched on.
learn = ConvLearner.pretrained(arch, data=new_data, precompute=True)

In [None]:
# Run the learning-rate finder, then plot its schedule in the next cell.
learn.lr_find()

In [None]:
learn.sched.plot()

In [None]:
# Two cycles at learning rate 1e-1.
learn.fit(lrs=1e-1, n_cycle=2)

In [None]:
# Two more cycles at the lower rate 1e-2.
learn.fit(lrs=1e-2, n_cycle=2)

In [None]:
# Switch precompute off before continuing training.
learn.precompute=False

In [None]:
learn.fit(lrs=1e-2, n_cycle=3)

In [None]:
# Unfreeze the learner, then look for a learning rate again.
learn.unfreeze()

In [None]:
learn.lr_find()

In [None]:
learn.sched.plot()

In [None]:
# Three learning rates are passed from here on — presumably one per layer group; confirm.
learn.fit(lrs=[1e-4,1e-3,1e-2], n_cycle=3, cycle_len=1)

In [None]:
# NOTE(review): cycle_mult is passed without cycle_len here, unlike the
# previous cell — confirm this is intended.
learn.fit(lrs=[1e-4,1e-3,1e-2], n_cycle=3, cycle_mult=2)

In [None]:
# Predictions from learn.TTA() together with the targets `y`.
# NOTE(review): the predictions are treated as log-probabilities (exponentiated
# below) and as a 2-D array (argmax over axis 1 in the next cell) — confirm
# against the shape TTA() returns in the installed fastai version.
log_preds, y = learn.TTA()
probs = np.exp(log_preds)
# Shown as a pair: accuracy and log loss.
accuracy(log_preds,y), metrics.log_loss(y, probs)

In [None]:
# Index of the highest-scoring entry along axis 1 for each row.
preds = np.argmax(log_preds, axis=1)
# np.array(preds==y).mean()
preds
# accuracy(log_preds,y)