In [533]:
import os
import sys
from __future__ import print_function

# Root of the project; every other location below is derived from it.
PROJECT_DIR = '/root/shared/Documents/final_proj'

# Trained networks and the per-attribute models stored beneath them.
BASE_MODELS_DIR = os.path.join(PROJECT_DIR, 'models/trained_models')
ATTR_MODELS_DIR = os.path.join(BASE_MODELS_DIR, 'attribute_models')

# Extracted features, SVM classifiers (and their statistics), label lists.
BASE_FEATURES_DIR = os.path.join(PROJECT_DIR, 'extracted_features')
SVM_MODELS_DIR = os.path.join(PROJECT_DIR, 'models/svm_models')
STATS_MODELS_DIR = os.path.join(SVM_MODELS_DIR, 'stats')
SPLITS_DIR = os.path.join(PROJECT_DIR, 'datasets/splits/annotator_consistency3')

# Network architecture and layer the features were taken from.
ARCH = 'caffeNet'
LAYER = 'fc7'

# Kind of labels being classified.
DOMAIN = 'domain'
RELATION = 'relation'
DATA_TYPE = DOMAIN

# Identifier shared by the feature folders, e.g. 'fc7_domain_caffeNet'.
CONFIG = '_'.join((LAYER, DATA_TYPE, ARCH))

# Label list template; the '{}' placeholder is later filled with the split name.
labels_path = os.path.join(
    SPLITS_DIR,
    'domain_single_body1_{}_5.txt' if DATA_TYPE == DOMAIN else 'single_body1_{}_16.txt')

# True: features of the end-to-end network; False: features of the attribute models.
end2end_model = True

FEATURES_DIR = os.path.join(
    BASE_FEATURES_DIR,
    'end_to_end_features' if end2end_model else 'attribute_features',
    CONFIG)

# Cache of the features converted to numpy format.
stored_features_dir = os.path.join(FEATURES_DIR, 'all_splits_numpy_format')

In [534]:
import caffe
from caffe.proto import caffe_pb2
import leveldb
import numpy as np
import scipy.io

def load_levelDB_as_array(db_path):
    """Read every Caffe ``Datum`` stored in a LevelDB into one 2-D array.

    Args:
        db_path: path of the LevelDB directory to read.

    Returns:
        numpy array with one row per database entry, in the order yielded by
        ``RangeIter()``. Each datum is flattened, so entries whose spatial
        dimensions are larger than 1x1 are supported as well.

    Raises:
        ValueError: if the database contains no entries.
    """
    db = leveldb.LevelDB(db_path)
    datum = caffe_pb2.Datum()

    rows = []
    for _, value in db.RangeIter():
        datum.ParseFromString(value)
        # flatten the decoded blob so that each entry becomes a single row
        rows.append(caffe.io.datum_to_array(datum).ravel())

    if not rows:
        # fail with an explicit message instead of an IndexError on rows[0]
        raise ValueError("No entries found in levelDB: {}".format(db_path))

    # vstack also rejects entries of inconsistent length
    return np.vstack(rows)

def preprocess_attributes(levelDB_dirs=None, raw_numpy_dirs=None, matlab_dirs=None):
    """Gather attribute features stored in levelDB, numpy and matlab formats.

    Args:
        levelDB_dirs: directories laid out as ``<dir>/<split>/<attribute>``,
            each leaf being a levelDB read with ``load_levelDB_as_array``.
        raw_numpy_dirs: directories holding ``.npy`` files. The directory name
            is used as attribute name and the split is the first of
            'train', 'test', 'eval' found in the file name.
        matlab_dirs: directories holding ``<attribute>_<split>.mat`` files whose
            features are stored under the ``<attribute>`` variable.

    Returns:
        dict ``{split: {attribute_name: features}}`` with the keys
        'train', 'test' and 'eval'.

    Files whose split cannot be determined are reported and skipped instead of
    aborting the whole conversion.
    """
    splits = ['train', 'test', 'eval']
    attribute_features = {split: {} for split in splits}

    ###########################################################################
    # features in levelDB format
    for directory in levelDB_dirs or []:
        for split in splits:
            split_dir = os.path.join(directory, split)

            for attr_name in os.listdir(split_dir):
                features = load_levelDB_as_array(os.path.join(split_dir, attr_name))
                attribute_features[split][attr_name] = features
                print("Convert from levelDB format {} dataset attribute {} with dim: {}".format(split, attr_name, features.shape))

    ###########################################################################
    # features in numpy format
    for directory in raw_numpy_dirs or []:
        # the folder name is the attribute name (normpath drops a trailing
        # separator, which would otherwise produce an empty name)
        attr_name = os.path.basename(os.path.normpath(directory))

        for numpy_file in os.listdir(directory):
            filename, ext = os.path.splitext(numpy_file)
            if ext.lower() != '.npy':
                continue

            numpy_path = os.path.join(directory, numpy_file)

            # find split: first known split name contained in the file name
            split = next((candidate for candidate in splits if candidate in filename), None)
            if split is None:
                print("Skipping {}: no split name found in file name".format(numpy_path))
                continue

            features = np.load(numpy_path)
            attribute_features[split][attr_name] = features
            print("Load numpy format {} dataset attribute {} with dim: {}".format(split, attr_name, features.shape))

    ###########################################################################
    # features in matlab format
    for directory in matlab_dirs or []:
        for matfile in os.listdir(directory):
            filename, ext = os.path.splitext(matfile)
            if ext.lower() != '.mat':
                continue

            mat_path = os.path.join(directory, matfile)

            # file names follow <attribute>_<split>
            attr_name, separator, split = filename.rpartition('_')
            if not separator or split not in attribute_features:
                print("Skipping {}: file name does not end with a known split".format(mat_path))
                continue

            # load matfile (dict format) and access the numpy field
            matfile_dict = scipy.io.loadmat(mat_path)
            features = matfile_dict[attr_name]
            attribute_features[split][attr_name] = features
            print("Convert from matlab format {} dataset attribute {} with dim: {}".format(split, attr_name, features.shape))

    ###########################################################################

    return attribute_features

def save_features(attribute_features, features_dir, compressed=True):
    """Write every feature array to ``features_dir`` as ``<attribute>_<split>``.

    Args:
        attribute_features: dict ``{split: {attribute_name: numpy array}}``.
        features_dir: output directory; created (with missing parents) when it
            does not exist.
        compressed: when True, arrays are cast to float16 (lossy) and written
            with ``np.savez_compressed`` (``.npz``); otherwise they are written
            unchanged with ``np.save`` (``.npy``).
    """
    if not os.path.isdir(features_dir):
        os.makedirs(features_dir)

    # numpy appends this extension to the path on its own
    extension = 'npz' if compressed else 'npy'

    for split, attributes in attribute_features.items():
        for attr_name, features in attributes.items():
            # format only the file name, so braces in features_dir are harmless
            features_path = os.path.join(features_dir, '{}_{}'.format(attr_name, split))

            # save file in compress format and float16
            if compressed:
                np.savez_compressed(features_path, features.astype(np.float16))
            else:
                np.save(features_path, features)

            print("Saved {}.{} ...".format(features_path, extension))

def load_features(features_dir):
    """Load the ``<attribute>_<split>.npz`` / ``.npy`` files of a directory.

    Args:
        features_dir: directory previously filled by ``save_features``.

    Returns:
        dict ``{split: {attribute_name: numpy array}}`` with the keys
        'train', 'test' and 'eval'.

    Files with another extension are ignored silently. Numpy files whose name
    does not end in ``_<split>`` are reported and skipped rather than raising.
    """
    attribute_features = {split: {} for split in ['train', 'test', 'eval']}

    # sorted for a deterministic loading order regardless of the file system
    for numpy_file in sorted(os.listdir(features_dir)):
        # Split the extension from the path and normalise it to lowercase.
        filename, ext = os.path.splitext(numpy_file)
        ext = ext.lower()
        if ext not in ('.npz', '.npy'):
            continue

        # path
        features_path = os.path.join(features_dir, numpy_file)

        attr_name, separator, split = filename.rpartition('_')
        if not separator or split not in attribute_features:
            print("Skipping {}: file name does not end with a known split".format(features_path))
            continue

        print("Loading {}...".format(features_path))

        if ext == '.npz':
            # 'arr_0' is the key numpy gives to the first positional array
            with np.load(features_path) as data:
                features = data['arr_0']
        else:
            features = np.load(features_path)

        attribute_features[split][attr_name] = features

    return attribute_features

In [535]:
# False: reuse the numpy cache; True: rebuild it from the original formats
process_features = False

if not process_features:
    # load features from disk
    attribute_features = load_features(stored_features_dir)

else:
    # preprocess features from original formats (leveldb, numpy, matlab)

    # LEVELDB DIRS (needed by both kinds of model)
    levelDB_dirs = [FEATURES_DIR]

    if end2end_model:
        attribute_features = preprocess_attributes(levelDB_dirs)
    else:
        # MATLAB DIRS
        matlab_dirs = [
            os.path.join(ATTR_MODELS_DIR, 'localation_scale_data(annotator_consistency3)'),
        ]
        # NUMPY DIRS
        numpy_dirs = [
            os.path.join(ATTR_MODELS_DIR, 'imsitu_body_activity(annotator_consistency3)'),
            os.path.join(ATTR_MODELS_DIR, 'body_immediacy(annotator_consistency3)'),
        ]

        attribute_features = preprocess_attributes(levelDB_dirs, numpy_dirs, matlab_dirs)

    # make sure the cache directory is there before writing to it
    if not os.path.isdir(stored_features_dir):
        os.mkdir(stored_features_dir)

    # save features to disk
    save_features(attribute_features, stored_features_dir, compressed=True)

Loading /root/shared/Documents/final_proj/extracted_features/end_to_end_features/fc7_domain_caffeNet/all_splits_numpy_format/domain_body_train.npz...
Loading /root/shared/Documents/final_proj/extracted_features/end_to_end_features/fc7_domain_caffeNet/all_splits_numpy_format/domain_body_eval.npz...
Loading /root/shared/Documents/final_proj/extracted_features/end_to_end_features/fc7_domain_caffeNet/all_splits_numpy_format/domain_body_test.npz...


In [536]:
# attribute selector

# every attribute available, taken from the 'test' split
all_attributes = sorted(attribute_features['test'].keys())


def get_attr_by_keyword(list_attrs, single_attr):
    """Return the names of ``list_attrs`` containing the substring ``single_attr``."""
    return [attr_name for attr_name in list_attrs if single_attr in attr_name]


# predefined groups; 'face' gathers the 'face' matches followed by the 'head' ones
selector = {
    'all': all_attributes,
    'body': get_attr_by_keyword(all_attributes, 'body'),
    'face': get_attr_by_keyword(all_attributes, 'face') + get_attr_by_keyword(all_attributes, 'head'),
}
body_attributes = selector['body']
face_attributes = selector['face']

In [537]:
def attribute_iter(attributes):
    """Yield the distinct attribute names in sorted order.

    A trailing '_1' or '_2' is removed from a name before duplicates are
    discarded, so e.g. 'x_1' and 'x_2' are both reported once as 'x'.
    """
    base_names = set()
    for name in attributes:
        if name.endswith(('_1', '_2')):
            name = name[:-2]
        base_names.add(name)

    for base_name in sorted(base_names):
        yield base_name

# With attribute models the group names ('all', 'body', 'face') are offered as
# queries too; the end-to-end model only iterates over its own attributes.
seq = attribute_iter(all_attributes if end2end_model else ['all', 'body', 'face'] + all_attributes)

In [538]:
# Advance the generator with the built-in next(): it works on Python 2 and 3,
# whereas the `.next()` method only exists on Python 2 generators.
query = next(seq)
query

'domain_body'

In [539]:
# select all/body/face or use single attribute (prefix is enough, e.g. body_gender)

query = 'all'
if query in selector:
    # one of the predefined groups
    selected_attributes = selector[query]
else:
    # otherwise every attribute whose name contains the query
    selected_attributes = get_attr_by_keyword(all_attributes, query)
selected_attributes

['domain_body']

In [540]:
# concatenate attributes

fused_features = {}
labels = {}

for split in ['train', 'test', 'eval']:
    # stack the selected attributes side by side (one block of columns each)
    selected_features = [attribute_features[split][attr_name] for attr_name in selected_attributes]
    fused_features[split] = np.concatenate(selected_features, axis=1)

    with open(labels_path.format(split)) as file_label_list:
        # The label is the second whitespace-separated field of each line;
        # blank lines are ignored. The builtin int replaces the np.int alias,
        # which was deprecated in NumPy 1.20 and removed in 1.24.
        labels[split] = np.array([file_label.split()[1] for file_label in file_label_list
                                  if file_label.strip()], dtype=int)

    # fail early, with a clear message, when features and labels are misaligned
    if fused_features[split].shape[0] != labels[split].shape[0]:
        raise ValueError("Split '{}': {} feature rows but {} labels".format(
            split, fused_features[split].shape[0], labels[split].shape[0]))

In [541]:
# define splits

# NOTE(review): the *_test variables are filled from the 'eval' split and the
# *_val variables from the 'test' split -- presumably intentional, confirm
# against how the split files were generated.
X_train, y_train = fused_features['train'], labels['train']
X_test, y_test = fused_features['eval'], labels['eval']
X_val, y_val = fused_features['test'], labels['test']

In [542]:
from sklearn.preprocessing import StandardScaler

# normalize data ?
normalize = False
if normalize:
    # the scaler is fitted on the training data only and then applied,
    # unchanged, to the three splits
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train, X_test, X_val = map(scaler.transform, (X_train, X_test, X_val))

In [543]:
# configurations

# max number of iterations
max_iter = 1000

# deal with unbalanced classes by using penalization weights
fix_unbalanced = False
class_weight = 'balanced' if fix_unbalanced else None

# Naming scheme of the stored classifiers and of their statistics:
#   prefix: as needed (e.g. query)
#   mtype:  end2end or attr
#   layer:  layer the features come from
#   dtype:  domain or relation
#   nnarch: e.g. caffeNet or VGG
#   clf:    description of the classifier
# The extension is appended separately (b for serialized binary objects,
# txt for text).
# HINT: ordering chosen to match FEATURES_DIR format
FILE_FORMAT = '{prefix}_{mtype}_{layer}_{dtype}_{nnarch}_{clf}'

# pickle dumps extension
CLF_DUMP_EXT = '.b'

# fill file format
load_model_prefix = query

if end2end_model:
    model_type = 'end2end'
else:
    model_type = 'attr'

if class_weight == 'balanced':
    balance_descr = 'balanced'
else:
    balance_descr = 'unbalanced'

clf_description = '{}_loss_{}_epochs_{}_{}'.format('sgd', 'squared_hinge', max_iter, balance_descr)

PREFILLED_FILE_FORMAT = FILE_FORMAT.format(
    prefix=load_model_prefix,
    mtype=model_type,
    layer=LAYER,
    dtype=DATA_TYPE,
    nnarch=ARCH,
    clf=clf_description,
)
pretrained_bin = PREFILLED_FILE_FORMAT + CLF_DUMP_EXT

# (pre)trained SVM path to load/save from/to disk
load_model_path = os.path.join(SVM_MODELS_DIR, pretrained_bin)

In [544]:
# train a svm model from scratch or load it from disk when possible 

import pickle
from sklearn.linear_model import SGDClassifier

# reuse precomputed model?
reuse_model = True

# automatically computed (don't modify): train unless a stored model was
# requested and actually exists on disk
compute_model = not (reuse_model and os.path.exists(load_model_path))

if not compute_model:
    print("Loading precomputed SGD (SVM loss) from disk...")
    # NOTE: unpickling can execute arbitrary code; only load model files
    # produced by this notebook.
    with open(load_model_path, 'rb') as f:
        clf = pickle.load(f)
elif reuse_model:
    # if model is not found, recompute from scratch
    print("File not found: {}".format(load_model_path))

if compute_model:
    clf = SGDClassifier(
        loss="squared_hinge",
        alpha=0.0001,
        max_iter=max_iter,
        tol=1e-3,
        average=True,
        class_weight=class_weight,
        n_jobs=-1,
    )

    print("Training SGD (SVM loss)  from scratch...")
    clf.fit(X_train, y_train)

File not found: /root/shared/Documents/final_proj/models/svm_models/all_end2end_fc7_domain_caffeNet_sgd_loss_squared_hinge_epochs_1000_unbalanced.b
Training SGD (SVM loss)  from scratch...


In [545]:
# show the parameters of the classifier in use (loaded from disk or just trained)
print(clf)

SGDClassifier(alpha=0.0001, average=True, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='squared_hinge', max_iter=1000,
       n_iter=None, n_jobs=-1, penalty='l2', power_t=0.5,
       random_state=None, shuffle=True, tol=0.001, verbose=0,
       warm_start=False)


In [546]:
# keep training

extra_iters = 200
extend_training = False

# for experimenting purposes, consider changing the file name
# before storing the model
if extend_training:
    print("Extending training of SGD for {} iterations...".format(extra_iters))
    # fit() is called again on the same classifier with warm_start enabled and
    # the iteration budget replaced by extra_iters
    clf.warm_start = True
    clf.max_iter = extra_iters
    clf.fit(X_train, y_train)

In [547]:
def print_statistics(val_stats=None, test_stats=None, fdesc=None):
    """Write a readable summary of the evaluation statistics.

    Args:
        val_stats: ``(accuracy, confusion_matrix, report)`` tuple for the
            validation set, or None to leave that section out.
        test_stats: same tuple for the test set, or None to leave it out.
        fdesc: writable file-like object receiving the text. Defaults to the
            ``sys.stdout`` in effect when the function is called.
    """
    if fdesc is None:
        # looked up per call (not at definition time) so that redirected or
        # captured standard output is honoured
        fdesc = sys.stdout

    sections = [('Validation set:', val_stats), ('Test set:', test_stats)]
    for description, stats in sections:
        if stats is None:
            continue

        accuracy, confusion_matrix, report = stats
        print(description, file=fdesc)
        print('Confusion matrix:', file=fdesc)
        print(confusion_matrix, file=fdesc)
        print(file=fdesc)
        print(report, file=fdesc)
        print('SGD accuracy: {:.3f}'.format(accuracy), file=fdesc)
        print('------------------------------------------------', file=fdesc)

In [548]:
import sklearn.metrics

def compute_stats(X, y, clf):
    """Evaluate ``clf`` on ``X`` against the reference labels ``y``.

    Returns:
        Tuple ``(accuracy, confusion_matrix, report)`` as produced by
        ``accuracy_score``, ``confusion_matrix`` and ``classification_report``
        of ``sklearn.metrics``.
    """
    predictions = clf.predict(X)
    metrics = sklearn.metrics

    return (
        metrics.accuracy_score(y, predictions),
        metrics.confusion_matrix(y, predictions),
        metrics.classification_report(y, predictions),
    )

# evaluate the classifier on the validation data first, then on the test data
val_stats = compute_stats(X_val, y_val, clf)
test_stats = compute_stats(X_test, y_test, clf)
print_statistics(val_stats=val_stats, test_stats=test_stats)

Validation set:
Confusion matrix:
[[  3  34  17   0  26]
 [  0 127   7   1 155]
 [  1  15   7   0  26]
 [  1  10   0   0  18]
 [  4 130   0   1 126]]

             precision    recall  f1-score   support

          0       0.33      0.04      0.07        80
          1       0.40      0.44      0.42       290
          2       0.23      0.14      0.18        49
          3       0.00      0.00      0.00        29
          4       0.36      0.48      0.41       261

avg / total       0.35      0.37      0.34       709

SGD accuracy: 0.371
------------------------------------------------
Test set:
Confusion matrix:
[[  82  216    4    0   82]
 [ 158 1280   36    3  600]
 [  23  114   24    0  152]
 [  16   41    4    2  129]
 [  37  323    3    0 1777]]

             precision    recall  f1-score   support

          0       0.26      0.21      0.23       384
          1       0.65      0.62      0.63      2077
          2       0.34      0.08      0.12       313
          3       0.40 

In [549]:
# store statistics in disk

STATS_EXT = '.txt'
save_stats = False

if save_stats:
    # create directory if it doesn't exist already; makedirs also creates the
    # parent folder, which is missing when no model has been stored yet
    if not os.path.isdir(STATS_MODELS_DIR):
        os.makedirs(STATS_MODELS_DIR)

    # same base name as the classifier dump, with a text extension
    stats_file = PREFILLED_FILE_FORMAT + STATS_EXT
    stats_path = os.path.join(STATS_MODELS_DIR, stats_file)

    with open(stats_path, 'wt') as f:
        print("Storing statistics in {}...".format(stats_path)) # to stdout
        print_statistics(val_stats=val_stats, test_stats=test_stats, fdesc=f) # to file

In [550]:
# save trained model in serialized binary format

save_model = False
if save_model:
    # the target folder is created on first use
    if not os.path.isdir(SVM_MODELS_DIR):
        os.mkdir(SVM_MODELS_DIR)

    # a different name could be used
    store_model_path = os.path.join(SVM_MODELS_DIR, pretrained_bin)

    with open(store_model_path, 'wb') as f:
        pickle.dump(clf, f)

In [551]:
# train other classifiers

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC

# Random forest baseline, evaluated on the test data only.
train_rf = False
if train_rf:
    print("Training Random Forest from scratch...")
    clf_rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight=class_weight)
    clf_rf.fit(X_train, y_train)
    print_statistics(val_stats=None, test_stats=compute_stats(X_test, y_test, clf_rf))

# Linear SVM baseline, evaluated on the test data only.
# NOTE(review): class_weight is fixed to 'balanced' here while the other
# classifiers follow the `class_weight` setting -- confirm this is intended.
train_svm = False
if train_svm:
    C = 1.0

    print("Training LinearSVC C={} from scratch...".format(C))
    clf_svm = LinearSVC(C=C, class_weight='balanced')
    clf_svm.fit(X_train, y_train)
    print_statistics(val_stats=None, test_stats=compute_stats(X_test, y_test, clf_svm))