# Notebook details

In [None]:
def setup_notebook(fix_python_path=True, reduce_margins=True, plot_inline=True):
    """Prepare the notebook environment.

    Args:
        fix_python_path: append the parent of the current working directory
            to ``sys.path`` (so the egosocial package can be imported).
        reduce_margins: inject CSS that stretches the notebook container to
            the full width.
        plot_inline: enable the matplotlib inline backend so that plots are
            rendered inside the cells.

    Side effects:
        Sets the module-level ``__file__`` to ``'Notebook'``.
    """
    if reduce_margins:
        # Reduce side margins of the notebook
        # (IPython.display is the public home of display/HTML; importing them
        # from IPython.core.display is deprecated)
        from IPython.display import display, HTML
        display(HTML("<style>.container { width:100% !important; }</style>"))

    if fix_python_path:
        # add egosocial to the python path
        import os
        import sys
        parent_dir = os.path.dirname(os.path.abspath('.'))
        # avoid piling up duplicates when the cell is executed again
        if parent_dir not in sys.path:
            sys.path.append(parent_dir)

    if plot_inline:
        # Plots inside cells. The ``%matplotlib inline`` magic syntax is only
        # valid after IPython's input transformation; calling the magic
        # through the shell object is the plain-Python equivalent.
        from IPython import get_ipython
        shell = get_ipython()
        if shell is not None:  # None when not running inside IPython
            shell.run_line_magic('matplotlib', 'inline')

    global __file__
    __file__ = 'Notebook'

# run the notebook setup with every option left at its default (all enabled)
setup_notebook()

# Imports and Constants Definition

In [None]:
# !/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
from collections import defaultdict
import logging
import matplotlib.pyplot as plt
import os
import sys

import numpy as np
import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.utils import compute_class_weight

import keras
from keras import backend as K
from keras.callbacks import ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from keras.layers import Input, Dense, Dropout
from keras.layers.noise import AlphaDropout
from keras.models import Model
from keras.regularizers import l1, l2
from keras.utils import to_categorical

import egosocial.config
from egosocial.core.attributes import AttributeSelector
from egosocial.core.types import relation_to_domain, relation_to_domain_vec
from egosocial.utils.keras.autolosses import AutoMultiLossWrapper
from egosocial.utils.logging import setup_logging
from egosocial.utils.keras.callbacks import PlotLearning
from egosocial.utils.keras.backend import limit_gpu_allocation_tensorflow
from egosocial.utils.io import load_features

# Constants

# string identifiers
DOMAIN = 'domain'
RELATION = 'relation'
END_TO_END = 'end_to_end'
ATTRIBUTES = 'attributes'

# number of classes for each prediction target
N_CLS_RELATION = 16
N_CLS_DOMAIN = 5

# Limit GPU memory allocation with Tensorflow

In [None]:
# NOTE(review): 0.25 is presumably the fraction of GPU memory TensorFlow may
# allocate; confirm against egosocial.utils.keras.backend
limit_gpu_allocation_tensorflow(0.25)

# Unused functions

In [None]:
def compute_stats(X, y, clf):
    """Evaluate a fitted classifier on a labelled set.

    Args:
        X: samples handed to ``clf.predict``.
        y: ground-truth labels for ``X``.
        clf: object exposing ``predict(X)``.

    Returns:
        Tuple ``(accuracy, confusion_matrix, report)`` computed with
        ``sklearn.metrics``.
    """
    # ``import sklearn`` does not guarantee that the ``metrics`` submodule is
    # loaded, so import it explicitly instead of relying on a side effect of
    # other imports.
    from sklearn import metrics

    y_predicted = clf.predict(X)
    acc = metrics.accuracy_score(y, y_predicted)
    confusion_matrix = metrics.confusion_matrix(y, y_predicted)
    report = metrics.classification_report(y, y_predicted)

    return acc, confusion_matrix, report


def print_statistics(val_stats=None, test_stats=None, fdesc=None):
    """Print the statistics of the validation and/or test set.

    Args:
        val_stats: ``(accuracy, confusion_matrix, report)`` tuple for the
            validation set, or None to skip it.
        test_stats: same tuple for the test set, or None to skip it.
        fdesc: writable text stream. Defaults to the *current*
            ``sys.stdout``; it is resolved at call time so that a redirected
            stdout is honoured.
    """
    if fdesc is None:
        fdesc = sys.stdout

    for description, stats in [('Validation set:', val_stats),
                               ('Test set:', test_stats)]:
        if stats is None:
            continue

        accuracy, confusion_matrix, report = stats
        print(description, file=fdesc)
        print('Confusion matrix:', file=fdesc)
        print(confusion_matrix, file=fdesc)
        print(file=fdesc)
        print(report, file=fdesc)
        print('SGD accuracy: {:.3f}'.format(accuracy), file=fdesc)
        print('------------------------------------------------',
              file=fdesc)

# Input arguments and fake main

In [None]:
class Configuration:
    """Experiment settings built from the parsed command-line arguments.

    The data type, architecture and layer are hard-coded; every directory is
    derived from ``args.project_dir`` and the run-time options are copied
    from ``args``.
    """

    def __init__(self, args):
        join = os.path.join

        # fixed experiment identifiers
        self.DATA_TYPE = RELATION
        self.ARCH = 'caffeNet'
        self.LAYER = 'fc7'
        self.CONFIG = '{}_{}_{}'.format(self.LAYER, self.DATA_TYPE, self.ARCH)

        # directory layout below the project root
        root = args.project_dir
        self.PROJECT_DIR = root
        self.BASE_MODELS_DIR = join(root, 'models/trained_models')
        self.ATTR_MODELS_DIR = join(self.BASE_MODELS_DIR, 'attribute_models')
        self.SVM_MODELS_DIR = join(root, 'models/svm_models')
        self.SPLITS_DIR = join(root, 'datasets/splits/annotator_consistency3')
        self.STATS_MODELS_DIR = join(self.SVM_MODELS_DIR, 'stats')

        # one label file per split
        label_file_fmt = 'single_body1_{}_16.txt'
        self.LABEL_FILES = {}
        for split in ('train', 'test', 'eval'):
            self.LABEL_FILES[split] = join(self.SPLITS_DIR,
                                           label_file_fmt.format(split))

        self.IS_END2END = False

        # locations of the extracted features
        self.BASE_FEATURES_DIR = join(root, 'extracted_features')
        self.FEATURES_DIR = join(self.BASE_FEATURES_DIR,
                                 'attribute_features', self.CONFIG)
        self.STORED_FEATURES_DIR = join(self.FEATURES_DIR,
                                        'all_splits_numpy_format')

        # options taken verbatim from the command line
        self.PROCESS_FEATURES = args.port_features
        self.EPOCHS = args.epochs
        self.BATCH_SIZE = args.batch_size
        self.REUSE_MODEL = args.reuse_model  # reuse precomputed model?
        self.SAVE_MODEL = args.save_model    # save model to disk?
        self.SAVE_STATS = args.save_stats    # save model statistics to disk?
        
def positive_int(value):
    """argparse ``type`` callable that accepts strictly positive integers.

    Returns ``int(value)`` when it is greater than zero; otherwise raises
    ``argparse.ArgumentTypeError``. A value ``int`` cannot convert propagates
    the exception raised by ``int``.
    """
    parsed = int(value)
    if parsed > 0:
        return parsed

    message = "%s is an invalid positive int value" % value
    raise argparse.ArgumentTypeError(message)

def main(*fake_args):
    """Parse the command-line options and build the experiment configuration.

    Args:
        *fake_args: forwarded positionally to ``ArgumentParser.parse_args``.
            In the notebook a single list of argument strings is passed;
            with no arguments ``parse_args`` falls back to ``sys.argv``.

    Returns:
        Configuration: settings derived from the parsed arguments.
    """
    setup_logging(egosocial.config.LOGGING_CONFIG)

    entry_msg = 'Reproduce experiments in Social Relation Recognition paper.'
    parser = argparse.ArgumentParser(description=entry_msg)

    parser.add_argument('--project_dir', required=True,
                        help='Base directory.')

    # the help text used to be split in two literals without a separating
    # space, which rendered as "...formats tonumpy."
    parser.add_argument('--port_features', action='store_true',
                        help='Whether to port features from other formats '
                             'to numpy.')

    parser.add_argument('--reuse_model', action='store_true',
                        help='Use precomputed model if available.')

    parser.add_argument('--save_model', action='store_true',
                        help='Save model to disk.')

    parser.add_argument('--save_stats', action='store_true',
                        help='Save statistics to disk.')

    parser.add_argument('--epochs', type=positive_int, default=100,
                        help='Max number of epochs.')

    parser.add_argument('--batch_size', type=positive_int, default=32,
                        help='Batch size.')

    # TODO: implement correctly
    args = parser.parse_args(*fake_args)
    # keep configuration
    conf = Configuration(args)

    return conf

# Helper functions and classes

In [None]:
def domain_to_relation_map():
    """Build the 0/1 matrix that links every relation class to its domain.

    Returns:
        Array of shape ``(N_CLS_RELATION, N_CLS_DOMAIN)``; entry
        ``[rel, dom]`` is 1 when ``relation_to_domain(rel)`` is ``dom`` and 0
        otherwise.
    """
    # one row per domain while accumulating, transposed on return
    membership = np.zeros((N_CLS_DOMAIN, N_CLS_RELATION))
    for relation_idx in range(N_CLS_RELATION):
        domain_row = membership[relation_to_domain(relation_idx)]
        # in-place add on the row view marks this relation in its domain
        domain_row += to_categorical(relation_idx, N_CLS_RELATION)
    return membership.T

def prepare_data_split_for_keras(data_split):
    """Arrange a data split as named Keras inputs and one-hot outputs.

    Args:
        data_split: sequence ``(x_train, x_val, x_test, y_train, y_val,
            y_test)`` where the ``y_*`` entries hold relation labels.

    Returns:
        Dict with keys 'train', 'val' and 'test'. Each value is a tuple
        ``(inputs, outputs)``: ``inputs`` maps 'attribute_features' to the
        features, ``outputs`` maps 'relation' and 'domain' to the one-hot
        encoded targets.
    """
    x_train, x_val, x_test, *labels = data_split
    y_train, y_val, y_test = labels

    def as_keras_pair(features, relation_labels):
        # domain targets are derived from the relation labels
        domain_labels = relation_to_domain_vec(relation_labels)
        inputs = {'attribute_features': features}
        outputs = {
            'relation': to_categorical(relation_labels, N_CLS_RELATION),
            'domain': to_categorical(domain_labels, N_CLS_DOMAIN),
        }
        return inputs, outputs

    return {
        'train': as_keras_pair(x_train, y_train),
        'val': as_keras_pair(x_val, y_val),
        'test': as_keras_pair(x_test, y_test),
    }

def compute_class_weight_relation_domain(y_train_rel):
    """Compute balanced class weights for the relation and domain targets.

    Args:
        y_train_rel: relation labels of the training set. The domain labels
            are derived from them with ``relation_to_domain_vec``.

    Returns:
        Dict with keys 'relation' and 'domain'; each value maps a class label
        found in the data to its 'balanced' weight.
    """
    y_train = {
        'relation': y_train_rel,
        'domain': relation_to_domain_vec(y_train_rel),
    }

    class_weight = {}
    for y_type, y_data in y_train.items():
        # np.unique already returns the labels sorted, as an ndarray
        classes = np.unique(y_data)
        # keyword arguments: recent scikit-learn releases reject positional
        # ``classes`` / ``y``
        weights = compute_class_weight('balanced', classes=classes, y=y_data)
        class_weight[y_type] = dict(zip(classes, weights))

    return class_weight

# Model definitions

In [None]:
def create_model_top_down(n_features):
    """Build the "top-down" model: domain first, then relation.

    The domain softmax is computed from a shared hidden layer; its output is
    concatenated with the hidden features and fed to the relation softmax.

    Args:
        n_features: size of the input feature vector.

    Returns:
        Uncompiled keras ``Model`` with input 'attribute_features' and
        outputs ``[domain, relation]``.
    """
    def l2_penalties():
        # fresh regularizer instances for every layer
        return dict(bias_regularizer=l2(0.1), kernel_regularizer=l2(0.1))

    features_in = Input(shape=[n_features],
                        name='attribute_features',
                        dtype='float')

    # input normalization + dropout
    normalized = keras.layers.BatchNormalization()(features_in)
    normalized = Dropout(0.25)(normalized)

    # shared hidden representation
    hidden = Dense(128, name='dense_1',
                   activation='elu',
                   bias_initializer='lecun_normal',
                   kernel_initializer='lecun_normal',
                   **l2_penalties())(normalized)
    hidden = keras.layers.BatchNormalization()(hidden)
    hidden = AlphaDropout(0.25)(hidden)

    domain = Dense(N_CLS_DOMAIN, name='domain',
                   activation='softmax',
                   **l2_penalties())(hidden)

    # the relation head sees the hidden features and the domain prediction
    hidden_and_domain = keras.layers.concatenate([hidden, domain])

    relation = Dense(N_CLS_RELATION, name='relation',
                     activation='softmax',
                     **l2_penalties())(hidden_and_domain)

    return Model(inputs=[features_in], outputs=[domain, relation])

def create_model_fix_domain(n_features):
    """Build the model whose domain output is a fixed function of relation.

    The relation softmax is learned; the domain output is a frozen, bias-free
    linear layer initialised with ``domain_to_relation_map()`` and applied to
    the relation output.

    Args:
        n_features: size of the input feature vector.

    Returns:
        Uncompiled keras ``Model`` with input 'attribute_features' and
        outputs ``[domain, relation]``.
    """
    def l2_penalties():
        # fresh regularizer instances for every layer
        return dict(bias_regularizer=l2(0.01), kernel_regularizer=l2(0.01))

    features_in = Input(shape=[n_features],
                        name='attribute_features',
                        dtype='float')

    # input normalization + dropout
    normalized = keras.layers.BatchNormalization()(features_in)
    normalized = Dropout(0.25)(normalized)

    hidden = Dense(128, name='dense_1',
                   activation='selu',
                   bias_initializer='lecun_normal',
                   kernel_initializer='lecun_normal',
                   **l2_penalties())(normalized)

    relation = Dense(N_CLS_RELATION, name='relation',
                     activation='softmax',
                     **l2_penalties())(hidden)

    # non-trainable projection of the relation output onto the domains
    domain = Dense(N_CLS_DOMAIN, name='domain',
                   activation='linear',
                   use_bias=False, trainable=False,
                   weights=[domain_to_relation_map()])(relation)

    return Model(inputs=[features_in], outputs=[domain, relation])

# Main Class

In [None]:
class SocialClassifierWithPreComputedFeatures:
    """Multi-output (domain / relation) classifier over precomputed features.

    Workflow used by this notebook: ``load_data`` -> ``get_data_split`` ->
    ``prepare_data`` -> ``init_model`` or ``set_custom_model`` -> ``fit`` ->
    ``evaluate``.
    """

    def __init__(self, features_dir, label_files):
        """Store the data locations and the default preprocessing settings.

        Args:
            features_dir: directory handed to ``load_features``.
            label_files: mapping split name ('train', 'test', 'eval') ->
                path of the label file of that split.
        """
        self._features_dir = features_dir
        self._label_files = label_files

        self._log = logging.getLogger(self.__class__.__name__)

        # parameters
        # enables dimensionality reduction
        self._dim_reduction = True
        # force recomputing PCA every time
        self._refit_pca = False

        # if any of these parameters change, PCA should be recomputed
        # features quantization (smaller Q promotes sparsity)
        self._quantization = False
        self._Q = 32
        # parameters for PCA search
        self._min_dim = 50  # min number of components
        self._max_dim = 200  # max number of components
        self._min_expl_var = 0.8  # min desired explained variance
        self._max_pca_retries = 3  # max number of retries
        # if the parameter is set in (0, 1] a full PCA is fitted, keeping the
        # number of components needed for that explained variance; otherwise
        # it is the number of components the search starts from
        self._pca_conf = self._min_dim

        # cache PCA instances
        self._precomputed_pca = {}
        # keep features
        self._attribute_features = None
        # keep labels
        self._labels = None
        # initialize when data split is configured
        self._n_features = None
        # initialize when model is configured
        self._model_wrapper = None
        self.model = None

    def load_data(self):
        """Load features and labels from disk and reset the derived state."""
        self._attribute_features = load_features(self._features_dir,
                                                 self._parse_filename)
        self._labels = self._load_labels(self._label_files)

        # needs _attribute_features already set
        attributes = self.list_attributes()
        self._log.info('Found {} attributes. List: '.format(len(attributes)))
        for attr in attributes:
            self._log.info('{}'.format(attr))

        # reset internal fields
        self._n_features = None
        self.model = None
        # pca gets reset only if refit_pca is enabled
        if self._refit_pca:
            self._precomputed_pca = {}

    def list_attributes(self):
        """Return the sorted attribute names of the 'train' split.

        Returns an empty list while no features are loaded.
        """
        if self._attribute_features:
            return sorted(self._attribute_features['train'].keys())
        return []

    def get_data_split(self, selected_attributes, preprocess=True):
        """Fuse the selected attributes into train / val / test matrices.

        Args:
            selected_attributes: attribute names to concatenate, in order.
            preprocess: when True every attribute goes through
                ``_preprocess_data`` before the concatenation.

        Returns:
            Tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.

        Side effects:
            Sets ``self._n_features`` to the width of the fused train matrix.
        """
        # splits (switch from caffe's split name convention to keras's
        # convention)
        _train, _val, _test = 'train', 'test', 'eval'
        splits = (_train, _val, _test)
        attribute_features = self._attribute_features
        labels = self._labels

        # data must have been loaded
        assert attribute_features
        assert labels

        if preprocess:
            features = defaultdict(dict)
            # preprocess each selected attribute individually
            for attr in selected_attributes:
                data = self._preprocess_data(attribute_features[_train][attr],
                                             attribute_features[_val][attr],
                                             attribute_features[_test][attr],
                                             data_id=attr)
                for split, split_data in zip(splits, data):
                    features[split][attr] = split_data
        else:
            features = attribute_features

        # concatenate attributes
        fused_features = {
            split: np.concatenate([features[split][attr]
                                   for attr in selected_attributes], axis=1)
            for split in splits
        }

        # init number of dimensions
        self._n_features = fused_features[_train].shape[1]

        result = [fused_features[split] for split in splits]
        result.extend(labels[split] for split in splits)
        return tuple(result)

    def prepare_data(self, data_split):
        """Convert a data split with ``prepare_data_split_for_keras``."""
        return prepare_data_split_for_keras(data_split)

    def init_model(self, model_type='top_down'):
        """Create and compile one of the predefined models.

        Args:
            model_type: 'top_down' or 'fix_domain'.

        Requires a previous call to ``get_data_split`` (the input dimension
        must be known).
        """
        assert model_type in ('top_down', 'fix_domain')
        assert self._n_features is not None
        if model_type == 'top_down':
            model = create_model_top_down(self._n_features)
        else:
            model = create_model_fix_domain(self._n_features)
        self._log.info('Initializing {} model'.format(model_type))
        # compile model with default values
        # predefined metrics: domain and relation accuracies
        self.set_custom_model(model, metrics=['accuracy'])

    def set_custom_model(self, model,
                         optimizer='adam',
                         loss='categorical_crossentropy',
                         loss_weights='auto',
                         **kwargs):
        """Wrap ``model`` in ``AutoMultiLossWrapper`` and compile it.

        Args:
            model: keras model whose first input is 2-D with
                ``self._n_features`` columns.
            optimizer, loss, loss_weights, **kwargs: forwarded to the
                wrapper's ``compile``.
        """
        assert self._n_features is not None
        # check number of features
        assert len(model.inputs[0].shape) == 2
        assert self._n_features == model.inputs[0].shape[1]

        # wrapper allows to train the loss weights
        self._model_wrapper = AutoMultiLossWrapper(model)
        self._model_wrapper.compile(optimizer=optimizer, loss=loss,
                                    loss_weights=loss_weights, **kwargs)

        self.model = self._model_wrapper.model
        # summary() returns None, so logging its return value only logged
        # "None"; route the summary lines to the logger instead
        self.model.summary(print_fn=self._log.info)

    def fit(self, train_data, validation_data, **kwargs):
        """Train the model.

        Args:
            train_data: ``(inputs, outputs)`` tuple, unpacked into
                ``model.fit``.
            validation_data: ``(inputs, outputs)`` tuple (mandatory).
            **kwargs: forwarded to ``model.fit``.

        Returns:
            Whatever ``model.fit`` returns.
        """
        self._log.info("Training model from scratch...")
        # validation data becomes mandatory
        return self.model.fit(*train_data,
                              validation_data=validation_data,
                              **kwargs)

    def evaluate(self, test_data, **kwargs):
        """Evaluate the model on an ``(inputs, outputs)`` tuple."""
        return self.model.evaluate(*test_data, **kwargs)

    def _parse_filename(self, numpy_file):
        """Split a feature file name into ``(split, attribute, extension)``.

        The file name is expected to look like ``<attribute>_<split><ext>``.
        A trailing ``_1`` / ``_2`` in the attribute part is dropped and the
        extension is lowercased.
        """
        # split the extension from the path and normalize it to lowercase.
        filename, ext = os.path.splitext(numpy_file)
        ext = ext.lower()

        # extract attribute name and split information
        attr_name, split = filename.rsplit('_', 1)
        # some attributes are split in two files (one for each person):
        # map both files to the same attribute name
        if attr_name.endswith(('_1', '_2')):
            attr_name = attr_name[:-2]

        return split, attr_name, ext

    def _load_labels(self, label_files):
        """Read the integer labels of every split.

        Each non-blank line must hold at least two whitespace-separated
        fields; the second one is the label.

        Returns:
            Dict split name -> 1-D integer array.
        """
        # splits (switch from caffe's split name convention to keras's
        # convention)
        _train, _val, _test = 'train', 'test', 'eval'

        labels = {}
        for split in (_train, _val, _test):
            with open(label_files[split]) as label_file:
                # blank lines (e.g. at the end of the file) carry no sample
                values = [line.split()[1] for line in label_file
                          if line.strip()]
            # builtin int: the np.int alias was removed from NumPy
            labels[split] = np.array(values, dtype=int)

        return labels

    def _preprocess_data(self, x_train, x_val, x_test, data_id='data'):
        """Normalize, optionally quantize and PCA-reduce one attribute.

        Every transformation is fitted on ``x_train`` only and then applied
        to the three splits.

        Args:
            x_train, x_val, x_test: 2-D feature matrices of the attribute.
            data_id: key used for logging and for the PCA cache.

        Returns:
            Tuple ``(x_train, x_val, x_test)`` with the transformed data.
        """
        self._log.debug('Preprocessing {}.'.format(data_id))
        data_split = [x_train, x_val, x_test]

        n_features = x_train.shape[1]

        # some sort of data normalization is always required for pca
        if self._quantization:
            scaler = Normalizer(norm='l2').fit(data_split[0])
        else:
            scaler = StandardScaler().fit(data_split[0])

        self._log.debug('Applying data normalization to {}.'.format(data_id))
        data_split = [scaler.transform(x) for x in data_split]

        assert self._min_dim >= 1

        if n_features < self._min_dim:
            self._log.debug("Skip Q-sparsity and dim reduction for {}. "
                            "Min number of dims: {}. Found: {}"
                            .format(data_id, self._min_dim, n_features))
            return tuple(data_split)

        if self._quantization:
            assert self._Q >= 1
            # small Q promotes sparsity
            self._log.debug('Applying Q-sparsity Q={} to {}'
                            .format(self._Q, data_id))
            data_split = [np.floor(self._Q * x) for x in data_split]

        if not self._dim_reduction:
            return tuple(data_split)

        if data_id in self._precomputed_pca and not self._refit_pca:
            # use precomputed model
            self._log.debug('Using precomputed PCA for {}'.format(data_id))
            pca = self._precomputed_pca[data_id]
        else:
            pca = self._fit_pca(data_split[0], data_id)
            # store pca coefficients for future use
            self._log.debug('Storing PCA fit for {}'.format(data_id))
            self._precomputed_pca[data_id] = pca

        explained_var = np.sum(pca.explained_variance_ratio_)
        n_components = pca.n_components_
        msg = 'Applying PCA with explained var {} dims {} to {}'
        self._log.debug(msg.format(explained_var, n_components, data_id))
        # pca transformation
        data_split = [pca.transform(x) for x in data_split]

        return tuple(data_split)

    def _fit_pca(self, x_train, data_id):
        """Fit a PCA on ``x_train`` according to the PCA settings.

        With ``_pca_conf`` in (0, 1] a full PCA is fitted with that value.
        Otherwise a randomized PCA is fitted starting from ``_pca_conf``
        components; the number is doubled (capped at ``_max_dim`` and at what
        the data allow) until ``_min_expl_var`` is reached or the retries are
        exhausted.
        """
        assert self._pca_conf > 0

        if 0 < self._pca_conf <= 1:
            # running pca with min explained variance takes much longer
            self._log.debug('Fitting full PCA for {}'.format(data_id))
            pca = PCA(self._pca_conf)
            pca.fit(x_train)
            return pca

        assert self._max_dim >= 1
        assert self._min_expl_var > 0

        # PCA cannot extract more than min(n_samples, n_features) components
        data_limit = min(x_train.shape)
        upper = min(self._max_dim, data_limit)
        # search starts in the given number of components
        n_components = min(self._pca_conf, data_limit)
        # search min number of components with required expl. var;
        # always fit at least once so that a PCA instance is returned
        for retry in range(max(1, self._max_pca_retries)):
            self._log.debug('Fitting fast PCA retry {} for {}'
                            .format(retry + 1, data_id))
            # running pca with number of components is much faster
            pca = PCA(n_components, svd_solver='randomized')
            pca.fit(x_train)

            expl_var = np.sum(pca.explained_variance_ratio_)
            # sometimes pca fails to compute the expl. var (is set to NaN)
            if (np.isnan(expl_var)
                    or expl_var >= self._min_expl_var
                    or n_components >= upper):
                # pca failed, the min expl. var is achieved or the number of
                # components cannot grow any further: stop trying
                break
            # exponential search, never beyond the allowed maximum
            n_components = min(n_components * 2, upper)

        return pca

# Fake call to main to process input arguments

In [None]:
# command-line style arguments handed to main() in place of sys.argv
# NOTE: --project_dir is a machine-specific absolute path
args = [
    "--project_dir", "/home/shared/Documents/final_proj",
    "--epochs", "30",
    "--batch_size", "256",
]

# parse the arguments and build the Configuration object
conf = main(args)

# Loading precomputed features and labels (may take some time)

In [None]:
# classifier helper reading the stored features and the per-split label files
helper = SocialClassifierWithPreComputedFeatures(conf.STORED_FEATURES_DIR, 
                                                 conf.LABEL_FILES)

# load features and labels
helper.load_data()

In [None]:
# show the sorted names of the loaded attributes
helper.list_attributes()

In [None]:
# configure dimensionality reduction
# (overrides the defaults set in SocialClassifierWithPreComputedFeatures.__init__)
# attributes with fewer features than _min_dim skip quantization and PCA
helper._min_dim = 25
# upper bound used by the PCA components search
helper._max_dim = 200
# max number of PCA fits during the search
helper._max_pca_retries = 3
# > 1: number of components the search starts from
helper._pca_conf = 200
# min desired explained variance
helper._min_expl_var = 0.95
# features quantization disabled (so _Q below is not used)
helper._quantization = False
helper._dim_reduction = True
helper._Q = 32
# reuse cached PCA fits instead of recomputing them
helper._refit_pca = False

# Select attributes (default all), prepare splits

In [None]:
attribute_selector = AttributeSelector(helper.list_attributes())

# all / face / body / or single attribute (accept name substring, e.g. activity)
attributes_query = 'all'
# expand all / face / body / single attribute
selected_attributes = attribute_selector.filter(attributes_query)
helper._log.info('Selected attribute(s): {}'.format(attributes_query))

# prepare splits for selected attributes
# (returns x_train, x_val, x_test, y_train, y_val, y_test)
data_split = helper.get_data_split(selected_attributes, preprocess=True)

# prepare data for keras (multiple outputs for domain/relation and one-hot encoding)
keras_data_split = helper.prepare_data(data_split)

# class_weight for keras (balance domain/relation instances)
# data_split[3] holds the training labels
class_weight = compute_class_weight_relation_domain(data_split[3])

# Initialize the model

In [None]:
# after preparing data (needs input dimensions)

# alternative: predefined model compiled with the default settings
# helper.init_model('top_down')

# allows more flexibility (custom optimizer and compile options)
helper.set_custom_model(
    create_model_top_down(helper._n_features),
    optimizer=keras.optimizers.Adam(0.0001, decay=1e-6),
    metrics=['accuracy'],
)

# Training

In [None]:
# training hyper-parameters taken from the parsed arguments
batch_size = conf.BATCH_SIZE
epochs = conf.EPOCHS

# FIXME: set directory correctly
# NOTE: checkpoint_path is only referenced by the ModelCheckpoint callback,
# which is currently commented out below
checkpoint_path = os.path.join(egosocial.config.MODELS_CACHE_DIR, 'multi_attribute',
                               'weights.{epoch:02d}-{val_loss:.2f}.h5')
callbacks = [
#            ModelCheckpoint(
#                filepath=checkpoint_path, 
#                monitor='val_loss', save_best_only=True
#            ),
    # multiply the learning rate by 0.1 after 10 epochs without val_loss
    # improvement, never going below 1e-5
    ReduceLROnPlateau(
        monitor='val_loss', 
        factor=0.1, patience=10, min_lr=0.00001
    ),
    PlotLearning(update_step=1),
]


# train on the 'train' split and validate on the 'val' split
# (class_weight is computed above but currently not passed to fit)
hist = helper.fit(
    keras_data_split['train'], 
    keras_data_split['val'],
    batch_size=batch_size, epochs=epochs,
    callbacks=callbacks,
#    class_weight=class_weight,
    verbose=0, 
)

# Evaluation

In [None]:
# evaluate the trained model on the held-out 'test' split
scores = helper.evaluate(
    keras_data_split['test'],
    batch_size=batch_size
)
# log every score next to the name of its metric
for score, metric_name in zip(scores, helper.model.metrics_names):
    helper._log.info("%s : %0.4f" % (metric_name, score))