In [2]:
%matplotlib inline
from pathlib import Path

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from torchvision import datasets, models, transforms
from torch.optim import lr_scheduler
import PIL

from fastai import *
from fastai.vision import *
import fastai; fastai.__version__

import torch
import torch.nn as nn
import torch.optim as optim
import time
import copy

In [3]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
def is_utkface_filename(name):
    """Return True when `name` looks like a fully labelled UTKFace file name.

    A name qualifies when it splits into exactly four '_'-separated fields and
    contains no doubled underscore (which would mean an empty field).
    """
    return len(name.split('_')) == 4 and '__' not in name


# NOTE(review): machine-specific absolute path; consider a configurable data directory.
pdata = Path('D://hung//UTKFace//')
# Sort so the index -> file mapping does not depend on directory listing order.
fns = sorted(pdata.glob('*.jpg'))
# Test the file name itself (Path.name) instead of splitting str(fn) on '/',
# which does not isolate the file name when the OS separator is a backslash.
fns = [fn for fn in fns if is_utkface_filename(fn.name)]

In [5]:
 # Re-applies the file-name filter from the previous cell (a no-op once that cell has run).
 # The check is done on Path.name so it does not depend on the OS path separator.
 fns = [fn for fn in fns if len(fn.name.split('_')) == 4 and '__' not in fn.name]

In [6]:
def show_dct(dct, n=3):
    """Return a new dict made of the first `n` items of `dct` (insertion order)."""
    leading_items = list(dct.items())[:n]
    return {key: value for key, value in leading_items}

In [7]:
# Index <-> file lookups: i2fn maps a position to its path, fn2i maps a path back to its position.
i2fn = fns
fn2i = {fn: i for i, fn in enumerate(i2fn)}
# Only the file names are needed to parse labels; fns holds pathlib.Path objects
# (they come from Path.glob), so .name gives the base name directly.
bs_fns = [fn.name for fn in fns]
bs_fns[:3]

['100_0_0_20170112213500903.jpg.chip.jpg',
 '100_0_0_20170112215240346.jpg.chip.jpg',
 '100_1_0_20170110183726390.jpg.chip.jpg']

In [8]:
# The first three '_'-separated fields of every file name are read as age, gender and race;
# transpose the per-file triples into three parallel tuples.
i2age, i2gender, i2race = zip(*(name.split('_', 3)[:3] for name in bs_fns))

In [9]:
# Turn the raw string fields into typed / readable labels.
i2age = list(map(int, i2age))

# Gender codes -> short labels.
o2gender = {'0': 'm', '1': 'f'}
i2gender = [o2gender[code] for code in i2gender]

# Race codes are integer positions into this tuple.
o2race = dict(enumerate(('White', 'Black', 'Asian', 'Indian', 'Others')))
i2race = [o2race[int(code)] for code in i2race]

# Sample counts for every (race, gender) combination, sorted by key.
sorted(Counter(zip(i2race, i2gender)).items())

[(('Asian', 'f'), 1859),
 (('Asian', 'm'), 1575),
 (('Black', 'f'), 2208),
 (('Black', 'm'), 2318),
 (('Indian', 'f'), 1714),
 (('Indian', 'm'), 2261),
 (('Others', 'f'), 932),
 (('Others', 'm'), 760),
 (('White', 'f'), 4601),
 (('White', 'm'), 5477)]

In [10]:
# One row per image: its path plus the three labels parsed from the file name.
df = pd.DataFrame(dict(img_name=i2fn, age=i2age, gender=i2gender, race=i2race))
# To persist the table: df.to_csv('img2targets.csv', index=False)

In [40]:
# Fixed seed so the random split (and everything trained on it) is reproducible
# across kernel restarts and cell re-runs.
SPLIT_SEED = 42

# Despite its name, `is_train` is a three-way bucket id: the values 0 / 1 / 2 are drawn
# with probabilities 10% / 10% / 80%. The dataset cell further down uses bucket 1 for
# training and bucket 0 for validation.
# NOTE(review): bucket 2 (about 80% of the rows) is not used by any visible cell —
# confirm that training on the 10% bucket is intended.
df['is_train'] = np.random.RandomState(SPLIT_SEED).choice(3, size=len(df), p=[0.1, 0.1, 0.8])
df.head()

Unnamed: 0,img_name,age,gender,race,is_train
0,D:\hung\UTKFace\100_0_0_20170112213500903.jpg....,100,m,White,2
1,D:\hung\UTKFace\100_0_0_20170112215240346.jpg....,100,m,White,2
2,D:\hung\UTKFace\100_1_0_20170110183726390.jpg....,100,f,White,2
3,D:\hung\UTKFace\100_1_0_20170112213001988.jpg....,100,f,White,1
4,D:\hung\UTKFace\100_1_0_20170112213303693.jpg....,100,f,White,2


In [41]:
# Deterministic preprocessing used for every dataset built by get_dataset (training and
# validation alike): resize, center-crop to 224, convert to a tensor, normalize per channel.
# No random augmentation is applied.
d_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


In [42]:
class MultitaskDataset(torch.utils.data.dataset.Dataset):
    """`Dataset` yielding an image together with its gender and race class indices.

    Each item is ``(transform(image), target)`` where ``target`` is the tensor
    ``[gender_idx, race_idx]``; an index is the position of the sample's label in
    ``classes_gender`` / ``classes_race``.

    Args:
        fns: image paths, one per sample.
        labels_gender: gender label of every sample, in the same order as ``fns``.
        labels_race: race label of every sample, in the same order as ``fns``.
        classes_gender: ordered gender classes; the position defines the class index.
        classes_race: ordered race classes; the position defines the class index.
        transform: callable applied to the opened (RGB) PIL image.

    Raises:
        KeyError: if a label does not appear in the corresponding class list.
    """
    def __init__(self,
                 fns,
                 labels_gender,
                 labels_race,
                 classes_gender,
                 classes_race,
                 transform):

        self.x = np.array(fns)

        # Gender task: class list, label -> index lookup, encoded targets, class count.
        self.classes_gender = classes_gender
        self.class2idx_gender = {v: k for k, v in enumerate(self.classes_gender)}
        self.y_gender = np.array([self.class2idx_gender[o] for o in labels_gender], dtype=np.int64)
        self.c_gender = len(classes_gender)

        # Race task: same layout as the gender task.
        self.classes_race = classes_race
        self.class2idx_race = {v: k for k, v in enumerate(self.classes_race)}
        self.y_race = np.array([self.class2idx_race[o] for o in labels_race], dtype=np.int64)
        self.c_race = len(classes_race)

        self.transform = transform

    def __len__(self): return len(self.x)

    def __getitem__(self, i: int):
        """Return ``(transformed_image, tensor([gender_idx, race_idx]))`` for sample ``i``."""
        # The context manager closes the underlying file once the pixels are read;
        # convert('RGB') guarantees three channels for the downstream transform.
        with PIL.Image.open(self.x[i]) as img:
            image = self.transform(img.convert('RGB'))
        target = torch.tensor([self.y_gender[i], self.y_race[i]])
        return image, target

    def __repr__(self): return f'{type(self).__name__} of len {len(self)}'

In [43]:
def get_dataset(df, classes_gender=None, classes_race=None, transform=None):
    """Build a `MultitaskDataset` from a frame with `img_name`, `gender` and `race` columns.

    Args:
        df: rows to include in the dataset.
        classes_gender: optional ordered list of gender classes. When omitted it is
            the sorted set of gender labels present in ``df``.
        classes_race: optional ordered list of race classes. When omitted it is
            the sorted set of race labels present in ``df``.
        transform: image transform; defaults to the module-level ``d_transforms``.

    Returns:
        A `MultitaskDataset` over the rows of ``df``.

    Note:
        With the defaults, the label -> index mapping depends on which labels occur
        in ``df``. Two subsets share a mapping only if each contains every class;
        pass explicit class lists to guarantee it.
    """
    if classes_gender is None:
        classes_gender = sorted(set(df.gender))
    if classes_race is None:
        classes_race = sorted(set(df.race))
    if transform is None:
        transform = d_transforms
    return MultitaskDataset(df.img_name,
                            labels_gender=df.gender,
                            labels_race=df.race,
                            classes_gender=classes_gender,
                            classes_race=classes_race,
                            transform=transform)

In [44]:
# Number of distinct classes per task; these give the widths of the gender and race
# slices of the model output.
N_g, N_r = (len(set(df[col])) for col in ('gender', 'race'))

In [45]:
# Rows with is_train == 1 form the training set, rows with is_train == 0 the validation set.
train_ds = get_dataset(df[df.is_train == 1])
valid_ds = get_dataset(df[df.is_train == 0])

# Each dataset gets its class lists from get_dataset separately, so verify that both
# ended up with the same label -> index mapping before training on one and scoring the other.
assert train_ds.classes_gender == valid_ds.classes_gender, "train/valid gender classes differ"
assert train_ds.classes_race == valid_ds.classes_race, "train/valid race classes differ"

In [46]:
# Mini-batch size shared by both loaders.
BATCH_SIZE = 8

dataset_sizes = {'train': len(train_ds), 'val': len(valid_ds)}

train_loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
# Validation metrics are accumulated over the whole set, so sample order is irrelevant;
# keep the loader unshuffled for a deterministic pass.
valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False)

dataloaders = {'train': train_loader, 'val': valid_loader}
dataset_sizes

{'train': 2340, 'val': 2380}

In [47]:
def singletask_loss(input, target):
    """Cross-entropy of `input` against the first column of `target`.

    Args:
        input: logits with one row per sample.
        target: 2-D tensor of class indices; only column 0 is used.

    Returns:
        The scalar cross-entropy loss (mean over the batch).
    """
    # Call torch.nn.functional explicitly (as multitask_loss does) rather than relying
    # on an `F` alias that only exists through a wildcard import.
    first_task_target = target[:, 0]
    return torch.nn.functional.cross_entropy(input, first_task_target.long())

In [48]:
def multitask_loss(input, target, n_gender=None, n_race=None, weights=(0.5, 0.5)):
    """Weighted sum of the gender and race cross-entropy losses.

    Args:
        input: logits; the first `n_gender` columns are the gender logits and the
            last `n_race` columns are the race logits.
        target: 2-D tensor of class indices; column 0 is gender, column 1 is race.
        n_gender: number of gender logits; defaults to the module-level `N_g`.
        n_race: number of race logits; defaults to the module-level `N_r`.
        weights: `(gender_weight, race_weight)`; defaults to an equal 0.5 / 0.5 mix.

    Returns:
        The scalar combined loss.
    """
    # Resolve the defaults at call time so the notebook-level class counts are used
    # unless the caller overrides them.
    if n_gender is None:
        n_gender = N_g
    if n_race is None:
        n_race = N_r
    weight_g, weight_r = weights

    logits_g = input[:, :n_gender]
    logits_r = input[:, -n_race:]

    loss_g = torch.nn.functional.cross_entropy(logits_g, target[:, 0].long())
    loss_r = torch.nn.functional.cross_entropy(logits_r, target[:, 1].long())
    return weight_g * loss_g + weight_r * loss_r


## Model

In [54]:
# Number of gender classes (computed the same way as N_g).
N_gender = len(set(df.gender))

# ResNet-18 with pretrained weights; every layer stays trainable.
# (To freeze the backbone instead, set requires_grad = False on model_ft.parameters()
# before the classifier is replaced.)
model_ft = models.resnet18(pretrained=True)

# Replace the classifier with a single linear layer producing N_g + N_r logits:
# the loss reads the first N_g as gender logits and the last N_r as race logits.
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(in_features=num_ftrs, out_features=N_g + N_r)
model_ft = model_ft.to(device)

# SGD with momentum; StepLR multiplies the learning rate by 0.1 every 7 scheduler steps.
optimizer_ft = optim.SGD(params=model_ft.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer_ft, step_size=7, gamma=0.1)

# Passed to train_model, which computes its loss with multitask_loss instead.
criterion = nn.CrossEntropyLoss()

In [55]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train `model` on the multitask objective and return it with its best weights loaded.

    Every epoch runs one training pass over ``dataloaders['train']`` followed by one
    evaluation pass over ``dataloaders['val']``, printing loss and gender / race
    accuracy for each. The weights with the highest validation gender accuracy
    are kept and restored at the end.

    Relies on the module-level ``dataloaders``, ``dataset_sizes``, ``device``,
    ``N_g``, ``N_r`` and ``multitask_loss``.

    Args:
        model: network whose output holds the gender logits followed by the race logits.
        criterion: unused; the loss is always ``multitask_loss``. Kept so existing
            calls keep working.
        optimizer: optimizer stepped after every training batch.
        scheduler: learning-rate scheduler stepped once per epoch, after the training pass.
        num_epochs: number of epochs to run.

    Returns:
        ``model`` with the best-scoring weights loaded.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc_g = 0.0
    report = '{} Loss: {:.4f} Acc_gender: {:.4f} Acc_race: {:.4f}'

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Training pass (updates the weights), then one scheduler step per epoch.
        train_stats = _run_epoch(model, dataloaders['train'], dataset_sizes['train'], optimizer)
        scheduler.step()
        print(report.format('train', *train_stats))

        # Evaluation pass (no weight updates, no autograd bookkeeping).
        val_loss, val_acc_g, val_acc_r = _run_epoch(model, dataloaders['val'], dataset_sizes['val'])
        print(report.format('val', val_loss, val_acc_g, val_acc_r))

        # Model selection is driven by validation gender accuracy only.
        if val_acc_g > best_acc_g:
            best_acc_g = val_acc_g
            best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc_g))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model


def _run_epoch(model, loader, n_samples, optimizer=None):
    """Run one pass over `loader` and return ``(mean_loss, acc_gender, acc_race)``.

    With an ``optimizer`` the model is set to training mode and updated after every
    batch. Without one the model is set to eval mode and gradients are disabled.
    All three statistics are divided by ``n_samples``.
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    correct_g = 0
    correct_r = 0

    # Gradients are only tracked while training; evaluation skips graph construction.
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = multitask_loss(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            # loss is a batch mean; weight it by the batch size to sum per-sample losses.
            total_loss += loss.item() * inputs.size(0)
            preds_g = outputs[:, :N_g].argmax(dim=1)
            preds_r = outputs[:, -N_r:].argmax(dim=1)
            correct_g += (preds_g == labels[:, 0]).sum().item()
            correct_r += (preds_r == labels[:, 1]).sum().item()

    return total_loss / n_samples, correct_g / n_samples, correct_r / n_samples



In [56]:
# Train for 20 epochs and keep the model returned by train_model.
model_ft = train_model(
    model=model_ft,
    criterion=criterion,
    optimizer=optimizer_ft,
    scheduler=exp_lr_scheduler,
    num_epochs=20,
)

Epoch 0/19
----------
train Loss: 0.8703 Acc_gender: 0.7338 Acc_race: 0.5355
val Loss: 0.6776 Acc_gender: 0.8408 Acc_race: 0.6273

Epoch 1/19
----------
train Loss: 0.6026 Acc_gender: 0.8363 Acc_race: 0.6957
val Loss: 0.5491 Acc_gender: 0.8618 Acc_race: 0.7256

Epoch 2/19
----------
train Loss: 0.4505 Acc_gender: 0.8803 Acc_race: 0.7786
val Loss: 0.5461 Acc_gender: 0.8714 Acc_race: 0.7294

Epoch 3/19
----------
train Loss: 0.3452 Acc_gender: 0.8991 Acc_race: 0.8453
val Loss: 0.5749 Acc_gender: 0.8655 Acc_race: 0.7277

Epoch 4/19
----------
train Loss: 0.2525 Acc_gender: 0.9282 Acc_race: 0.8923
val Loss: 0.5592 Acc_gender: 0.8710 Acc_race: 0.7441

Epoch 5/19
----------
train Loss: 0.1730 Acc_gender: 0.9620 Acc_race: 0.9265
val Loss: 0.6359 Acc_gender: 0.8748 Acc_race: 0.6996

Epoch 6/19
----------
train Loss: 0.1418 Acc_gender: 0.9603 Acc_race: 0.9440
val Loss: 0.6079 Acc_gender: 0.8706 Acc_race: 0.7399

Epoch 7/19
----------
train Loss: 0.0910 Acc_gender: 0.9697 Acc_race: 0.9761
val Lo

In [52]:

# Show how many samples the validation dataset contains.
dataset_sizes['val']

2380