In [2]:
import torch

In [3]:

'''
For the fine-tuning of the VGG-Face network for the emotion
recognition task, we investigated various options in our preliminary
analysis. We found that combining weight decay and dropout for regularization
gives the best results on the FER validation set. We carry
out a multi-stage fine-tuning. In the first stage, we fine-tune on the
FER public test set, and run weight updates for five epochs. In the second
stage, we update the upper layers (higher than layer 27) using'''

''' We then fine-tune the VGG-face model on FER 2013
dataset, using both the training and the public test set; during
training we use data augmentation by jittering the scale, flipping
and rotating the faces. The aim is to make the network more robust
to small misalignment of the faces. We also apply a strong dropout
on the last layer of the VGG (keeping only 5% of the nodes) to
prevent over-fitting. We achieve a performance of 71.2% on the
FER private test set, which is slightly higher than the previously
published results '''

'''During the training of the deep networks, we oversample the training
images by rotating them around their center by a random angle between 
−15° and 15°, and by circularly shifting the images in the horizontal and 
vertical directions by an amount no more than 20% of the image size. 
This approach helps our network to be more robust against alignment errors. 
In Fig. 2, we show the training curves of two stages of fine-tuning of the 
network with the FER dataset, where we set the learning rate and weight 
decay to 0.0005, momentum to 0.9, and dropout probability to 0.8 [25].'''


'During the training of the deep networks, we oversample the training\nimages by rotating them around their center by a random angle between \n\xe2\x88\x9215\xc2\xb0 and 15\xc2\xb0, and by circularly shifting the images in the horizontal and \nvertical directions by an amount no more than 20% of the image size. \nThis approach helps our network to be more robust against alignment errors. \nIn Fig. 2, we show the training curves of two stages of fine-tuning of the \nnetwork with the FER dataset, where we set the learning rate and weight \ndecay to 0.0005, momentum to 0.9, and dropout probability to 0.8 [25].'

### VGG16-Face model

In [20]:
import VGG_FACE
import torch
import torch.nn as nn
from torch.autograd import Variable
import torchvision.transforms as transforms

In [21]:
model = VGG_FACE.VGG_FACE

model.load_state_dict(torch.load('VGG_FACE.pth'))

#model.eval() #this will let you pass an input through the model and evaluate it? without training?


### Dataset: FERplus

In [6]:
import os
from skimage import io
import pandas as pd
import numpy as np
import argparse
import torch
from torch.utils.data import Dataset, DataLoader

In [7]:
df = pd.read_csv('all-data/FERPlus/fer2013new.csv')
df.dropna(axis=0, how='any')

Unnamed: 0,Usage,Image name,neutral,happiness,surprise,sadness,anger,disgust,fear,contempt,unknown,NF
0,Training,fer0000000.png,4,0,0,1,3,2,0,0,0,0
1,Training,fer0000001.png,6,0,1,1,0,0,0,0,2,0
2,Training,fer0000002.png,5,0,0,3,1,0,0,0,1,0
3,Training,fer0000003.png,4,0,0,4,1,0,0,0,1,0
4,Training,fer0000004.png,9,0,0,1,0,0,0,0,0,0
5,Training,fer0000005.png,6,0,0,1,0,0,1,1,1,0
6,Training,fer0000006.png,2,0,0,8,0,0,0,0,0,0
7,Training,fer0000007.png,0,10,0,0,0,0,0,0,0,0
8,Training,fer0000008.png,0,10,0,0,0,0,0,0,0,0
9,Training,fer0000009.png,0,0,6,0,0,0,4,0,0,0


In [8]:
class FaceEmotionsDataset(Dataset):

    def __init__(self, csv_file, root_dir, transform = None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.emotions_frame = pd.read_csv(csv_file)
        self.emotions_frame.dropna(axis=0, how='any', inplace=True)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.emotions_frame)

    def __getitem__(self, idx):
        img_name = self.emotions_frame.iloc[idx][1]
        
        img_path = os.path.join(self.root_dir, img_name)
        image = io.imread(img_path)
        #this takes the most highest ranked emotion. if two emotions have the same ranking, it just takes the first one
        emotion = np.argmax(self.emotions_frame.iloc[idx,2:].as_matrix())
        sample = {'image': image, 'emotion': emotion}
        
        if self.transform:
            sample = self.transform(sample)
        
        return sample
    
    
    

In [9]:
emotions_frame = pd.read_csv('all-data/FERPlus/fer2013new.csv')
emotions_frame.dropna(axis=0, how='any', inplace=True)

emotion = np.argmax(emotions_frame.iloc[10,2:].as_matrix())
print(emotion)

4


In [10]:
face_emotions = FaceEmotionsDataset(csv_file = 'all-data/FERPlus/fer2013new_training.csv', 
                                    root_dir = 'all-data/FERPlus/data/FER2013Train')

face_emotions[10]

{'emotion': 4, 'image': array([[ 30,  24,  21, ...,  37,  44,  37],
        [ 31,  22,  21, ...,  37,  35,  41],
        [ 27,  22,  19, ...,  33,  34,  40],
        ..., 
        [ 29,  29,  26, ..., 118, 132, 148],
        [ 30,  30,  27, ..., 154, 159, 166],
        [ 32,  29,  28, ..., 172, 173, 173]], dtype=uint8)}

In [12]:
from torchvision.datasets import ImageFolder
from skimage import transform
from PIL import Image
from skimage import io; io.use_plugin('matplotlib')

class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        image, emotion = sample['image'], sample['emotion']

        # swap color axis because
        # numpy image: H x W x C
        # torch image: C X H X W
        #print(image.shape)
        #image = image.transpose((2, 0, 1))
        image = np.expand_dims(image,0)
        z = np.zeros((2, 224, 224))
        image = np.concatenate((image,z), axis=0)
        return {'image': torch.from_numpy(image),
                'emotion': emotion}

class RandomCrop(object):
    """Crop randomly the image in a sample.

    Args:
        output_size (tuple or int): Desired output size. If int, square crop
            is made.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        if isinstance(output_size, int):
            self.output_size = (output_size, output_size)
        else:
            assert len(output_size) == 2
            self.output_size = output_size

    def __call__(self, sample):
        image, emotion = sample['image'], sample['emotion']

        h, w = image.shape[:2]
        new_h, new_w = self.output_size

        top = np.random.randint(0, h - new_h)
        left = np.random.randint(0, w - new_w)

        image = image[top: top + new_h,
                      left: left + new_w]

        #emotion = emotion - [left, top]

        return {'image': image, 'emotion': emotion}

class Rescale(object):
    """Rescale the image in a sample to a given size.

    Args:
        output_size (tuple or tuple): Desired output size. If tuple, output is
            matched to output_size. If int, smaller of image edges is matched
            to output_size keeping aspect ratio the same.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, sample):
        image, emotion = sample['image'], sample['emotion']

        h, w = image.shape[:2]
        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size

        new_h, new_w = int(new_h), int(new_w)

        img = transform.resize(image, (new_h, new_w))

        # h and w are swapped for landmarks because for images,
        # x and y axes are axis 1 and 0 respectively
        
        #emotion = emotion * [new_w / w, new_h / h]

        return {'image': img, 'emotion': emotion}


imgTransform = transforms.Compose([Rescale(256), #scale to 256x256
                                   #transforms.CenterCrop(224), #crops the image at center to 224x224
                                   RandomCrop(224),
                                   ToTensor()
                                   ])
                                   #, #turn the jpg/pil/wahtever image into a tensor
                                   #transforms.Normalize(mean = [0.485, 0.456, 0.406], #normalize with these vals
                                                        #std=[0.229, 0.224, 0.225])])
                                    ##HOW TO GET NORMALIZED VALUES?
                                    #to add: jitter/rotate data augmentation, flipping, 
                                   
#this doesn't work because the data is organized w a csv file w prob distrib of labels 
#instead of a single ground truth
#see this paper: https://arxiv.org/pdf/1608.01041.pdf
trainset = FaceEmotionsDataset(csv_file = 'all-data/FERPlus/fer2013new_training.csv', 
                                    root_dir = 'all-data/FERPlus/data/FER2013Train', transform = imgTransform)
valset = FaceEmotionsDataset(csv_file = 'all-data/FERPlus/fer2013new_validation.csv', 
                                    root_dir = 'all-data/FERPlus/data/FER2013Valid', transform = imgTransform)

trainLoader = torch.utils.data.DataLoader(trainset, batch_size = 64, 
                                          shuffle = True, num_workers = 0)
valLoader = torch.utils.data.DataLoader(valset, batch_size = 64, 
                                       shuffle = False, num_workers = 0)


In [14]:
for (i, sample) in enumerate(trainLoader):
    inputs = sample['image']
    labels = sample['emotion']
    print("{}, {}".format(inputs, labels))
    break

  warn("The default mode, 'constant', will be changed to 'reflect' in "



( 0 , 0 ,.,.) = 
  0.1121  0.1171  0.1220  ...   0.0394  0.0443  0.0499
  0.1149  0.1187  0.1226  ...   0.0324  0.0365  0.0414
  0.1176  0.1204  0.1231  ...   0.0253  0.0288  0.0330
           ...             ⋱             ...          
  0.1305  0.1248  0.1191  ...   0.0369  0.0358  0.0352
  0.1317  0.1259  0.1201  ...   0.0373  0.0360  0.0353
  0.1320  0.1261  0.1202  ...   0.0382  0.0368  0.0360

( 0 , 1 ,.,.) = 
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
           ...             ⋱             ...          
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000

( 0 , 2 ,.,.) = 
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
           

### Function to train the model

In [15]:
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm

In [18]:
t = tqdm(trainLoader)
for (i, (sample)) in enumerate(t):
    break
    inputs = Variable(sample['image'] ).float()
    labels = Variable(sample['emotion'])
    #outputs = model(inputs)




In [19]:
def train_model(network, criterion, optimizer, trainLoader, valLoader, n_epochs = 10, use_gpu = False):
    if use_gpu:
        network = network.cuda()
        criterion = criterion.cuda()
        
    t_loss, t_acc, v_loss, v_acc = (np.zeros(n_epochs) for i in range(4))    
    
    # Training loop.
    for epoch in range(0, n_epochs):
        correct = 0.0
        cum_loss = 0.0
        counter = 0

        # Make a pass over the training data.
        t = tqdm(trainLoader, desc = 'Training epoch %d' % epoch)
        network.train()  # This is important to call before training!
        for (i, (sample)) in enumerate(t):

            # Wrap inputs, and targets into torch.autograd.Variable types.
            inputs = Variable(sample['image'] ).float()
            labels = Variable(sample['emotion'])
            
            if use_gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()

            # Forward pass:
            outputs = network(inputs)
            loss = criterion(outputs, labels)

            # Backward pass:
            optimizer.zero_grad()
            # Loss is a variable, and calling backward on a Variable will
            # compute all the gradients that lead to that Variable taking on its
            # current value.
            loss.backward() 

            # Weight and bias updates.
            optimizer.step()

            # logging information.
            cum_loss += loss.data[0]
            max_scores, max_labels = outputs.data.max(1)
            correct += (max_labels == labels.data).sum()
            counter += inputs.size(0)
            t.set_postfix(loss = cum_loss / (1 + i), accuracy = 100 * correct / counter)
            
        t_loss[epoch] = (cum_loss/len(t))
        t_acc[epoch] = (100*correct/counter)

        # Make a pass over the validation data.
        correct = 0.0
        cum_loss = 0.0
        counter = 0
        t = tqdm(valLoader, desc = 'Validation epoch %d' % epoch)
        network.eval()  # This is important to call before evaluating!
        for (i, (inputs, labels)) in enumerate(t):
            print("on iter {}".format(i))
            # Wrap inputs, and targets into torch.autograd.Variable types.
            inputs = Variable(inputs).unsqueeze(0)
            labels = Variable(labels)
            
            if use_gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()

            # Forward pass:
            outputs = network(inputs)
            loss = criterion(outputs, labels)

            # logging information.
            cum_loss += loss.data[0]
            max_scores, max_labels = outputs.data.max(1)
            correct += (max_labels == labels.data).sum()
            counter += inputs.size(0)
            t.set_postfix(loss = cum_loss / (1 + i), accuracy = 100 * correct / counter)
            
        v_loss[epoch] = (cum_loss/len(t))
        v_acc[epoch] = (100*correct/counter)
        
                
    lab_utils.generate_plots(t_loss, v_loss, t_acc, v_acc, n_epochs)
            

### set learning rate, loss, optimizer, all variable stuff

In [91]:
print(model[38])
#commented out line here throws an error "'Sequential' object does not support item assignment"
#model[38] = nn.Sequential(VGG_FACE.Lambda(lambda x: x.view(1,-1) if 1==len(x.size()) else x ),nn.Linear(4096,7))

model._modules[38] = nn.Sequential(VGG_FACE.Lambda(lambda x: x.view(1,-1) if 1==len(x.size()) else x ),nn.Linear(4096,7))
print(model._modules[38])
#print(model._modules)
print(model._modules[38].parameters)
print(model[38].parameters)
### WHY THE FUCK ARE MODEL[38] AND MODEL._MODULES[38] DIFFERENT I DON'T UNDERSTAND PYTORCH ###
#https://github.com/pytorch/vision/pull/175/files
#https://github.com/pytorch/vision/issues/173#issuecomment-305737852

Sequential (
  (0): Lambda (
  )
  (1): Linear (4096 -> 2622)
)
Sequential (
  (0): Lambda (
  )
  (1): Linear (4096 -> 7)
)
<bound method Sequential.parameters of Sequential (
  (0): Lambda (
  )
  (1): Linear (4096 -> 7)
)>
<bound method Sequential.parameters of Sequential (
  (0): Lambda (
  )
  (1): Linear (4096 -> 2622)
)>


In [40]:

import torch.optim as optim


#"where we set the learning rate and weight decay to 0.0005, momentum to 0.9, and dropout probability to 0.8 [25]"
learningRate = 5e-4



# Definition of our network.
#how to change the last fc layer of the model to nn.linear(4096, 7) instead of (4096, 2622)?
#model.fc = nn.Linear(512, 2)


#Definition of our loss. #maybe need to change this?
criterion = nn.CrossEntropyLoss()

# Definition of optimization strategy. # maybe need to change this?
optimizer = optim.SGD(model.parameters(), lr = learningRate)

train_model(model, criterion, optimizer, trainLoader, valLoader, n_epochs = 1, use_gpu = False)

KeyboardInterrupt: 