In [548]:
import os
from __future__ import print_function

PROJECT_DIR = '/root/shared/Documents/final_proj'
BASE_MODELS_DIR = os.path.join(PROJECT_DIR, 'models/trained_models')
ATTR_MODELS_DIR = os.path.join(BASE_MODELS_DIR, 'attribute_models')
BASE_FEATURES_DIR = os.path.join(PROJECT_DIR, 'extracted_features')
SVM_MODELS_DIR = os.path.join(PROJECT_DIR, 'models/svm_models')
STATS_MODELS_DIR = os.path.join(SVM_MODELS_DIR, 'stats')

LAYER = 'fc7'

DOMAIN='domain'
RELATION='relation'
DATA_TYPE=DOMAIN

LAYER_DIR = LAYER + '_' + DATA_TYPE
if DATA_TYPE == DOMAIN:
    labels_path = '../../datasets/splits/relation_consistency3/domain_single_body1_{}_5.txt'
else:
    labels_path = '../../datasets/splits/annotator_consistency3/single_body1_{}_16.txt'    

end2end_model = True
if end2end_model:
    FEATURES_DIR = os.path.join(BASE_FEATURES_DIR, 'end_to_end_features', LAYER_DIR)
    stored_features_dir = os.path.join(FEATURES_DIR, 'all_splits_numpy_format')    
else:    
    FEATURES_DIR = os.path.join(BASE_FEATURES_DIR, 'attribute_features', LAYER_DIR)
    stored_features_dir = os.path.join(FEATURES_DIR, 'all_splits_numpy_format')

In [561]:
import caffe
from caffe.proto import caffe_pb2
import leveldb
import numpy as np
import scipy.io

def load_levelDB_as_array(db_path):
    db = leveldb.LevelDB(db_path)
    datum = caffe_pb2.Datum()

    items = []

    for key, value in db.RangeIter():
        datum.ParseFromString(value)
        data = caffe.io.datum_to_array(datum)
        items.append(data)

    result = np.array(items).reshape(len(items), len(items[0]))
    return result

def preprocess_attributes(levelDB_dirs=None, raw_numpy_dirs=None, matlab_dirs=None):

    splits = ['train', 'test', 'eval']
    attribute_features = {split:{} for split in splits}

    ###########################################################################
    # features in levelDB format
    if levelDB_dirs:
        for directory in levelDB_dirs:
            for split in splits:
                attribute_models = os.listdir(os.path.join(directory, split))

                for attr_name in attribute_models:
                    features = load_levelDB_as_array(os.path.join(directory, split, attr_name))
                    attribute_features[split][attr_name] = features
                    print("Convert from levelDB format {} dataset attribute {} with dim: {}".format(split, attr_name, features.shape))

    ###########################################################################
    # features in numpy format
    if raw_numpy_dirs:
        for directory in raw_numpy_dirs:
            for numpy_file in os.listdir(directory):
                filename, ext = os.path.splitext(numpy_file)
                if ext.lower() == '.npy':
                    # load numpy
                    features = np.load(os.path.join(directory, numpy_file))
                    # find split
                    for candidate_split in splits:
                        if candidate_split in filename:
                            split = candidate_split
                            break
                    else:
                        split = None
                    # the folder name is the attribute name
                    attr_name = os.path.basename(directory)

                    attribute_features[split][attr_name] = features
                    print("Load numpy format {} dataset attribute {} with dim: {}".format(split, attr_name, features.shape))
            
    ###########################################################################
    # features in matlab format
    if matlab_dirs:
        for directory in matlab_dirs:
            for matfile in os.listdir(directory):
                filename, ext = os.path.splitext(matfile)
                if ext.lower() == '.mat':
                    # load matfile (dict format)
                    matfile_dict = scipy.io.loadmat(os.path.join(directory, matfile))
                    attr_name, split = filename.rsplit('_', 1)
                    # access numpy field
                    features = matfile_dict[attr_name]
                    attribute_features[split][attr_name] = features
                    print("Convert from matlab format {} dataset attribute {} with dim: {}".format(split, attr_name, features.shape))

    ###########################################################################
    
    return attribute_features

def save_features(attribute_features, features_dir):
    for split, attributes in attribute_features.items():
        for attr_name, features in attributes.items():
            features_path = os.path.join(features_dir, '{}_{}').format(attr_name, split)
            
            # save file in compress format and float16
            # np.save(features_path, features)
            np.savez_compressed(features_path, features.astype(np.float16))
            
            print("Saved {}...".format(features_path))

def load_features(features_dir):
    attribute_features = {split:{} for split in ['train', 'test', 'eval']}

    for numpy_file in os.listdir(features_dir):        
        # Split the extension from the path and normalise it to lowercase.        
        filename, ext = os.path.splitext(numpy_file)
        ext = ext.lower()

        # path
        features_path = os.path.join(features_dir, numpy_file)
        
        if ext == '.npz':
            with np.load(features_path) as data:
                features = data['arr_0']
        elif ext == '.npy':
            features = np.load(features_path)
        else:
            continue

        attr_name, split = filename.rsplit('_', 1)
        attribute_features[split][attr_name] = features

        print("Loading {}...".format(features_path))
    
    return attribute_features

In [562]:
process_features = False

# preprocess features from original formats (leveldb, numpy, matlab)
if process_features:
    if end2end_model:
        # LEVELDB DIRS
        levelDB_dirs = [FEATURES_DIR]
        attribute_features = preprocess_attributes(levelDB_dirs)
        pass
    else:
        if DATA_TYPE == RELATION:
            # LEVELDB DIRS
            levelDB_dirs = [FEATURES_DIR]
            # MATLAB DIRS
            matlab_dirs = [os.path.join(ATTR_MODELS_DIR,'localation_scale_data(annotator_consistency3)')]        
            # NUMPY DIRS
            numpy_dirs = [os.path.join(ATTR_MODELS_DIR, 'imsitu_body_activity(annotator_consistency3)'),
                          os.path.join(ATTR_MODELS_DIR, 'body_immediacy(annotator_consistency3)')]
        else:
            # LEVELDB DIRS
            levelDB_dir = [FEATURES_DIR]
            # MATLAB DIRS
            matlab_dirs = None
            # NUMPY DIRS
            numpy_dirs = [os.path.join(ATTR_MODELS_DIR, 'imsitu_body_activity(relation_consistency3)')]            

        attribute_features = preprocess_attributes(levelDB_dirs, numpy_dirs, matlab_dirs)
    

    if not (os.path.exists(stored_features_dir) and os.path.isdir(stored_features_dir)):
        os.mkdir(stored_features_dir)
    
    # save features to disk
    save_features(attribute_features, stored_features_dir)

else:
    # load features from disk
    attribute_features = load_features(stored_features_dir)

Loading /root/shared/Documents/final_proj/extracted_features/end_to_end_features/fc7_domain/all_splits_numpy_format/domain_body_train.npy...
Loading /root/shared/Documents/final_proj/extracted_features/end_to_end_features/fc7_domain/all_splits_numpy_format/domain_body_test.npy...
Loading /root/shared/Documents/final_proj/extracted_features/end_to_end_features/fc7_domain/all_splits_numpy_format/domain_body_eval.npy...


In [569]:
# attribute selector

all_attributes = sorted(attribute_features['test'].keys())
get_attr_by_keyword = lambda list_attrs, single_attr: [attr_name for attr_name in list_attrs if single_attr in attr_name]

body_attributes = get_attr_by_keyword(all_attributes, 'body')
face_attributes = get_attr_by_keyword(all_attributes, 'face') + get_attr_by_keyword(all_attributes, 'head')
selector = {'all': all_attributes, 'body': body_attributes, 'face': face_attributes}

In [573]:
def attribute_iter(attributes):
    unique_attrs = set([attr[:-2] if attr.endswith('_1') or attr.endswith('_2') else attr for attr in attributes])

    for item in sorted(unique_attrs):
        yield item

if end2end_model:
    seq = attribute_iter(all_attributes)
else:
    seq = attribute_iter(['all', 'body', 'face'] + all_attributes)

In [574]:
query = seq.next()
query

'domain_body'

In [576]:
# select all/body/face or use single attribute (prefix is enough, e.g. body_gender)

#query = 'all'
selected_attributes = selector[query] if query in selector else get_attr_by_keyword(all_attributes, query)
selected_attributes

['domain_body']

In [577]:
# concatenate attributes

fused_features = {}
for split in ['train', 'test', 'eval']:
    selected_features = [attribute_features[split][attr_name] for attr_name in selected_attributes]
    fused_features[split] = np.concatenate(selected_features, axis=1)

labels = {}
for split in ['train', 'test', 'eval']:
    with open(labels_path.format(split)) as file_label_list:
        labels[split] = np.array([file_label.split()[1] for file_label in file_label_list], dtype=np.int)

In [578]:
# define splits

full_training_set = False
if full_training_set:
    X_train = np.concatenate([fused_features['train'], fused_features['test']])
    X_test = fused_features['eval']
    y_train = np.concatenate([labels['train'], labels['test']])
    y_test = labels['eval']
else:
    X_train = fused_features['train']
    y_train = labels['train']
    X_test = fused_features['eval']
    y_test = labels['eval']
    X_val = fused_features['test']
    y_val = labels['test']

In [579]:
from sklearn.preprocessing import StandardScaler

# normalize data ?
normalize = False
if normalize:
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test  = scaler.transform(X_test)
    
    if not full_training_set:
        X_val = scaler.transform(X_val)

In [630]:
# configurations

# max number of iterations
max_iter = 1000

# pickle dumps extension
CLF_DUMP_EXT = 'b'

# defined classifier format
FILE_FORMAT = '{prefix}_{mtype}_{dtype}_{clf}_{epochs}.{ext}'

load_model_prefix = query
model_type = 'end2end' if end2end_model else 'attr'
clf_description = 'sgd_squared_hinge'

pretrained_bin = FILE_FORMAT.format(prefix=load_model_prefix, mtype=model_type, clf=clf_description, 
                                    dtype=DATA_TYPE, epochs=max_iter, ext=CLF_DUMP_EXT)

load_model_path = os.path.join(SVM_MODELS_DIR, pretrained_bin)

In [657]:
import pickle
from sklearn.linear_model import SGDClassifier

# reuse precomputed model?
reuse_model = False

# automatically computed (don't modify)
compute_model = not reuse_model

if reuse_model:    
    if os.path.exists(load_model_path):
        print("Loading precomputed SGD (SVM loss) from disk...")
        with open(load_model_path, 'rb') as f:
            clf = pickle.load(f)
    else:
        # if model is not found, recompute from scratch
        print("File not found: {}".format(load_model_path))
        compute_model = True

if compute_model:
    clf = SGDClassifier(loss="squared_hinge", alpha=0.0001, max_iter=max_iter, n_jobs=-1, 
                        average=True, tol=1e-3, class_weight='balanced')

    print("Training SGD (SVM loss)  from scratch...")
    clf.fit(X_train, y_train)

Training SGD (SVM loss)  from scratch...


In [658]:
print(clf)

SGDClassifier(alpha=0.0001, average=True, class_weight='balanced',
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='squared_hinge', max_iter=1000,
       n_iter=None, n_jobs=-1, penalty='l2', power_t=0.5,
       random_state=None, shuffle=True, tol=0.001, verbose=0,
       warm_start=False)


In [659]:
# keep training
extend_training = False
extra_iters = 200

# for experimenting purposes, if you want to save
if extend_training:
    clf.max_iter = extra_iters
    clf.warm_start = True
    print("Extending training of SGD for {} iterations...".format(extra_iters))
    clf.fit(X_train, y_train)

In [660]:
import sklearn.metrics

y_predicted = clf.predict(X_test)
report = sklearn.metrics.classification_report(y_test, y_predicted)
acc = sklearn.metrics.accuracy_score(y_test, y_predicted)
confusion_matrix = sklearn.metrics.confusion_matrix(y_test, y_predicted)

print('Confusion matrix:')
print(confusion_matrix)
print()
print(report)
print('SGD accuracy: {:.3f}'.format(acc))

Confusion matrix:
[[ 138  172   37    4   61]
 [ 326 1237  300   37  556]
 [  38   73   92    3  124]
 [  23   47   21   35   87]
 [  96  401   72   44 1747]]

             precision    recall  f1-score   support

          0       0.22      0.33      0.27       412
          1       0.64      0.50      0.56      2456
          2       0.18      0.28      0.22       330
          3       0.28      0.16      0.21       213
          4       0.68      0.74      0.71      2360

avg / total       0.59      0.56      0.57      5771

SGD accuracy: 0.563


In [646]:
# store statistics in disk

STATS_EXT = 'txt'
save_stats = True

if save_stats:
    # create directory if it doesn't exist already
    if not (os.path.exists(STATS_MODELS_DIR) and os.path.isdir(STATS_MODELS_DIR)):
        os.mkdir(STATS_MODELS_DIR)

    stats_file = FILE_FORMAT.format(prefix=load_model_prefix, mtype=model_type, clf=clf_description, 
                                    dtype=DATA_TYPE, epochs=max_iter, ext=STATS_EXT)        

    stats_path = os.path.join(STATS_MODELS_DIR, stats_file)
    with open(stats_path, 'wt') as f:
        print("Storing statistics in {}...".format(stats_path))
        
        # save statistics
        print('Confusion matrix:', file=f) 
        print(confusion_matrix, file=f )
        print(file=f)
        print(report, file=f)
        print('SGD accuracy: {:.3f}'.format(acc), file=f)        

Storing statistics in /root/shared/Documents/final_proj/models/svm_models/stats/domain_body_end2end_domain_sgd_squared_hinge_1000.txt...


In [647]:
store_model_prefix = query
used_epochs = max_iter #  modify this parameter if extend_training was used
store_model_path = os.path.join(SVM_MODELS_DIR, clf_name_format.format(prefix=store_model_prefix, mtype=model_type,
                                                                       clf=clf_description, dtype=DATA_TYPE, epochs=used_epochs))
save_model = True
if save_model:
    if not (os.path.exists(SVM_MODELS_DIR) and os.path.isdir(SVM_MODELS_DIR)):
        os.mkdir(SVM_MODELS_DIR)
        
    with open(store_model_path, 'wb') as f:
        pickle.dump(clf, f)

In [667]:
# train other classifiers

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC

train_rf = False
if train_rf:
    clf_rf = RandomForestClassifier(n_estimators=50, n_jobs=-1, class_weight='balanced')
    print("Training Random Forest  from scratch...")
    clf_rf.fit(X_train, y_train)

    y_predicted = clf_rf.predict(X_test)

    print(sklearn.metrics.classification_report(y_test, y_predicted))
    print('RF accuracy: {:.3f}'.format(sklearn.metrics.accuracy_score(y_test, y_predicted)))

train_svm = False
if train_svm:
    C = 1.0
    clf_svm = LinearSVC(C=C, class_weight='balanced')
    
    print("Training LinearSVC C={} from scratch...".format(C))
    clf_svm.fit(X_train, y_train)

    y_predicted = clf_svm.predict(X_test)

    print(sklearn.metrics.classification_report(y_test, y_predicted))
    print('LinearSVC accuracy: {:.3f}'.format(sklearn.metrics.accuracy_score(y_test, y_predicted)))

Training LinearSVC C=1.0 from scratch...
             precision    recall  f1-score   support

          0       0.19      0.41      0.26       412
          1       0.63      0.45      0.53      2456
          2       0.15      0.33      0.20       330
          3       0.18      0.26      0.21       213
          4       0.72      0.63      0.67      2360

avg / total       0.59      0.51      0.54      5771

LinearSVC accuracy: 0.509
