In [1]:
from __future__ import division

import codecs
import ConfigParser
import json
import os
import sys
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn import linear_model
from sklearn.utils import shuffle

In [2]:
# Load line_profiler's IPython extension, which provides the %lprun magic.
%load_ext line_profiler

In [3]:
# Location of the dsg-vision config file. The DSGV_CONFIG environment variable
# overrides the built-in default so the notebook is not tied to one machine.
my_config = os.environ.get(
    'DSGV_CONFIG',
    '/Users/das/work/svn/Gits/a_Projects/Projects/Infrastructure/dsg-vision/Config/default.cfg')

config = ConfigParser.SafeConfigParser()
with codecs.open(my_config, 'r', encoding='utf-8') as f:
    config.readfp(f)

corpora_base = config.get('DEFAULT', 'corpora_base')

dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')

# The trailing '' keeps a trailing path separator on both directories, because
# later cells build file names by plain string concatenation.
preproc_path = os.path.join(dsgv_home, 'Preproc', 'PreprocOut', '')
feats_path = os.path.join(dsgv_home, 'ExtractFeats', 'ExtractOut', '')

# The first features in the image feature Xs encode the region ID
ID_FEATS = 3

In [4]:
# Split name -> list of image ids (used below to filter by 'image_id').
with open(preproc_path + 'refcoco_splits.json', 'r') as f:
    rc_splits = json.load(f)

# np.load on an .npz archive returns an NpzFile that keeps the archive open;
# using it as a context manager closes it once the array has been read.
with np.load(feats_path + 'mscoco_vgg19-fc2.npz') as feats_npz:
    X = feats_npz['arr_0']

refcoco_refdf = pd.read_json(preproc_path + 'refcoco_refdf.json.gz',
                             typ='frame', orient='split', compression='gzip')

In [5]:
def filter_X_by_filelist(X, filelist, id_col=1):
    '''Return the rows of X whose image id is contained in filelist.

    X        -- 2-D feature matrix.
    filelist -- iterable of image ids to keep.
    id_col   -- column of X that is compared against filelist (default 1).

    Returns a new 2-D array with the matching rows in their original order;
    an empty filelist yields an array with zero rows.
    '''
    X = np.asarray(X)
    # list() so that sets and other iterables are treated element-wise by isin.
    keep = np.isin(X[:, id_col], list(filelist))
    return X[keep]

def filter_refdf_by_filelist(refdf, filelist):
    '''Return the rows of refdf whose 'image_id' is contained in filelist.

    Rows keep their original order and columns; the result gets a fresh
    0..n-1 index. Repeated ids in filelist do not duplicate rows, and an
    empty filelist yields an empty frame.
    '''
    keep = refdf['image_id'].isin(list(filelist))
    return refdf[keep].reset_index(drop=True)

In [6]:
# Restrict the feature matrix and the referring-expression frame to the
# images listed in the 'train' split.
X_t = filter_X_by_filelist(X, rc_splits['train'])
refdf_train = filter_refdf_by_filelist(refcoco_refdf, rc_splits['train'])

In [7]:
def create_word2den(refdf, refcol='refexp', regcol='region_id'):
    '''Given refdf, returns dict of occurrences (id triples) of words from expressions.

    refdf  -- frame with columns 'i_corpus', 'image_id', refcol and regcol.
    refcol -- column holding the expression text; it is split on whitespace.
    regcol -- column holding the region id.

    Returns a dict mapping each word to a list of distinct
    (i_corpus, image_id, region) triples; the order within a list is
    unspecified.
    '''
    # TODO: Could take filter function that filters out some occurrences.
    #   E.g., tagger that tags whole expression & returns only the nouns.
    word2den = defaultdict(set)
    # Iterate the four needed columns in lockstep instead of building a Series
    # per row with iterrows().
    columns = zip(refdf[refcol], refdf['i_corpus'], refdf['image_id'], refdf[regcol])
    for refexp, i_corpus, image_id, region in columns:
        region_key = (i_corpus, image_id, region)
        for word in refexp.split():
            word2den[word].add(region_key)
    return {word: list(regions) for word, regions in word2den.items()}

In [8]:
%%time
# Word -> region id triples, built from the training expressions only.
word2den = create_word2den(refdf_train)

CPU times: user 21.8 s, sys: 902 ms, total: 22.7 s
Wall time: 21.9 s


In [9]:
def make_X_id_index(X, id_feats=ID_FEATS):
    '''Map each row's id tuple to that row's position in X.

    The id tuple is made of the first id_feats columns of the row, cast to
    int. If several rows share an id tuple, the last position is kept.
    '''
    id_rows = X[:, :id_feats].astype(int).tolist()
    return {tuple(id_row): position for position, id_row in enumerate(id_rows)}

In [10]:
%%time
# Id tuple -> row position lookup for the training feature matrix.
X_idx = make_X_id_index(X_t)

CPU times: user 284 ms, sys: 596 ms, total: 881 ms
Wall time: 880 ms


In [11]:
def make_mask_matrix(X, X_idx, word2den, wordlist):
    '''Build a boolean (word x row-of-X) membership matrix.

    X        -- feature matrix; only its length is used.
    X_idx    -- dict mapping id tuples to row positions in X.
    word2den -- dict mapping words to lists of id tuples.
    wordlist -- iterable of words; one matrix row per word, in this order.

    Cell [w, r] is True when row r of X is one of the regions listed for word
    w. Words missing from word2den, and id tuples missing from X_idx, simply
    contribute no True cells. Returns a bool array of shape
    (len(wordlist), len(X)).
    '''
    wordlist = list(wordlist)  # accept dict views / generators as well
    # Allocate the result directly as bool rather than stacking one
    # default-dtype (float) vector per word and converting afterwards.
    mask_matrix = np.zeros((len(wordlist), len(X)), dtype=bool)
    for row, this_word in enumerate(wordlist):
        hits = [X_idx[i] for i in word2den.get(this_word, ()) if i in X_idx]
        if hits:
            mask_matrix[row, hits] = True
    return mask_matrix

In [12]:
%%time
# One mask row per word, in the order of word2den.keys(); later cells find a
# word's row through its position among those keys.
mask_matrix = make_mask_matrix(X_t, X_idx, word2den, word2den.keys())

CPU times: user 3.43 s, sys: 6.42 s, total: 9.85 s
Wall time: 10.1 s


In [50]:
## N.B.: Replace with get_X_for_word from below! That one can be used for
##   extracting test data as well.

def make_train_for_word(X, word2den, mask_matrix, word, neg_max=20000):
    '''Assemble a shuffled binary data set for one word.

    Positive rows are the rows of X selected by the word's row of mask_matrix;
    negative rows are up to neg_max randomly chosen rows from the remainder.
    Only the columns from ID_FEATS onward are returned.

    Returns (X_out, y_out) with labels 1 for positives and 0 for negatives,
    jointly shuffled. If word is not a key of word2den, an error line is
    printed and None is returned.
    '''
    if word not in word2den:
        print("Error!! No mask available for this word! (%s)" % (word))
        return None

    # The word's mask row is found via its position among word2den's keys.
    word_row = list(word2den.keys()).index(word)
    is_positive = mask_matrix[word_row]

    positives = X[is_positive, ID_FEATS:]

    # Shuffle the indices of all non-positive rows, then keep the first neg_max.
    negative_rows = np.arange(mask_matrix.shape[1])[~is_positive]
    np.random.shuffle(negative_rows)
    negatives = X[negative_rows[:neg_max], ID_FEATS:]

    features = np.concatenate([positives, negatives], axis=0)
    labels = np.concatenate([np.ones(len(positives), dtype=int),
                             np.zeros(len(negatives), dtype=int)])
    return shuffle(features, labels)

In [14]:
#%lprun -T prof1 -f make_train_for_word X_this_w, y_this_w = make_train_for_word(X_t, word2den, mask_matrix, 'cow')

Sped up `make_train_for_word` by limiting the size of the negative set: it took about 40 secs before and takes about 3 secs now. That is still slower than I would like, but selecting a very large portion of the matrix with a boolean vector seems to be very slow. Maybe there is a cleverer way to do it?

In [15]:
%%time
# Build the data set for a single example word to time the extraction.
X_this_w, y_this_w = make_train_for_word(X_t, word2den, mask_matrix, 'cow')

CPU times: user 1.22 s, sys: 1.49 s, total: 2.71 s
Wall time: 1.79 s


In [16]:
# (number of words, number of rows in X_t)
mask_matrix.shape

(9364, 166354)

Reduce the set of words for which WAC is trained, by frequency:

In [17]:
# A word is kept only if it has strictly more than this many positive regions.
min_freq = 40

# Each mask row is True for the rows of X_t a word refers to, so the row sums
# are per-word counts of distinct regions.
counts = mask_matrix.sum(axis=1)

# The mask rows follow the key order of word2den. Materialise the keys as a
# list so they form a proper 1-D array whether keys() returns a list or a view.
wordlist = np.array(list(word2den.keys()))[counts > min_freq]

In [18]:
# NOTE(review): the training cells below construct their own
# LogisticRegression per word, so this instance is overwritten before use.
classifier = linear_model.LogisticRegression(penalty='l1', warm_start=True)

In [65]:
%%time
# Sequential baseline: fit one L1-penalised logistic regression per word and
# collect (word, fitted classifier) pairs.
wacs = []
for this_word in wordlist[:10]:
    X_this_w, y_this_w = make_train_for_word(X_t, word2den, mask_matrix, this_word)
    # Progress line: the word and the size of its data set.
    print("%s %s" % (this_word, X_this_w.shape[0]))
    classifier = linear_model.LogisticRegression(penalty='l1', warm_start=True)
    this_wac = classifier.fit(X_this_w, y_this_w)
    wacs.append((this_word, this_wac))

yellow 20896
wooden 20053
hanging 20059
second 21922
kids 20056
glass 20533
hot 20186
wine 20184
backpack 20100
silver 20123
CPU times: user 2min 11s, sys: 15.2 s, total: 2min 26s
Wall time: 2min 19s


In [19]:
def train_this_word(X, word2den, mask_matrix, this_word):
    '''Fit a word classifier for this_word and report its data set size.

    NOTE(review): the X argument is currently ignored -- the data is drawn
    from the module-level X_t. Before switching the call below to X, make sure
    every caller passes a matrix whose rows line up with mask_matrix.

    Returns (word, number of positive examples, total number of examples,
    fitted LogisticRegression).
    '''
    features, labels = make_train_for_word(X_t, word2den, mask_matrix, this_word)
    print("%s %s" % (this_word, features.shape[0]))
    model = linear_model.LogisticRegression(penalty='l1', warm_start=True)
    fitted = model.fit(features, labels)
    # Labels are 1 for positives and 0 for negatives, so their sum is the
    # number of positives.
    return (this_word, labels.sum(), len(features), fitted)

In [73]:
%%time
# Pass X_t rather than the unfiltered X: mask_matrix was built against X_t,
# so that is the matrix whose rows its columns line up with.
wacs = [train_this_word(X_t, word2den, mask_matrix, this_word)
        for this_word in wordlist[:10]]

yellow 20896
wooden 20053
hanging 20059
second 21922
kids 20056
glass 20533
hot 20186
wine 20184
backpack 20100
silver 20123
CPU times: user 2min, sys: 11.5 s, total: 2min 12s
Wall time: 2min 3s


In [74]:
%%time
# Same training run on 4 threads. Pass X_t rather than the unfiltered X:
# mask_matrix was built against X_t, so its columns line up with X_t's rows.
wacs = Parallel(n_jobs=4, require='sharedmem', prefer='threads')(
    delayed(train_this_word)(X_t, word2den, mask_matrix, this_word)
    for this_word in wordlist[:10])

second 21922
yellow 20896
hangingwooden 20053
 20059
kids 20056
glass 20533
hot 20186
wine 20184
backpack 20100
silver 20123
CPU times: user 2min 32s, sys: 25.2 s, total: 2min 57s
Wall time: 1min 38s


In [20]:
%%time
# Same training run on 2 threads. Pass X_t rather than the unfiltered X:
# mask_matrix was built against X_t, so its columns line up with X_t's rows.
wacs = Parallel(n_jobs=2, require='sharedmem', prefer='threads')(
    delayed(train_this_word)(X_t, word2den, mask_matrix, this_word)
    for this_word in wordlist[:10])

wooden 20053
yellow 20896
hanging 20059
second 21922
kids 20056
glass 20533
hot 20186
wine 20184
backpack 20100
silver 20123
CPU times: user 2min 8s, sys: 20.8 s, total: 2min 29s
Wall time: 1min 31s


In [76]:
%%time
# Same training run with a single job, as a baseline for the joblib overhead.
# Pass X_t rather than the unfiltered X: mask_matrix was built against X_t,
# so its columns line up with X_t's rows.
wacs = Parallel(n_jobs=1, require='sharedmem', prefer='threads')(
    delayed(train_this_word)(X_t, word2den, mask_matrix, this_word)
    for this_word in wordlist[:10])

yellow 20896
wooden 20053
hanging 20059
second 21922
kids 20056
glass 20533
hot 20186
wine 20184
backpack 20100
silver 20123
CPU times: user 2min 9s, sys: 11.8 s, total: 2min 21s
Wall time: 2min 11s


Distributing over two cores seems to be worth it. Diminishing returns for more cores.

Could still try to train on keras? https://gist.github.com/fchollet/b7507f373a3446097f26840330c1c378

In [21]:
# Each entry is (word, number of positives, data set size, fitted classifier).
wacs[0]

(u'yellow',
 896,
 20896,
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=True))

Remaining tasks:

* evaluation? Run models on training data (with smaller n_neg... maybe balanced? should be option in make_train... which might better be called make_word_dataset...)
* How to persist the models? Write the weight matrix and the word list to disk as numpy structures? (scikit-learn objects may not serialise very well.)

In [22]:
def get_X_for_word(X, word2den, mask_matrix, word, neg_max=20000):
    '''Assemble a binary data set for one word.

    Positive rows are the rows of X selected by the word's row of mask_matrix.
    neg_max controls the negatives:
      * 0          -- no negatives; the positives are returned unshuffled.
      * 'balanced' -- as many negatives as there are positives.
      * otherwise  -- at most neg_max randomly chosen non-positive rows.
    Only the columns from ID_FEATS onward are returned.

    Returns (X_out, y_out) with labels 1 for positives and 0 for negatives,
    jointly shuffled unless neg_max is 0. If word is not a key of word2den, an
    error line is printed and None is returned.
    '''
    if word not in word2den:
        print("Error!! No mask available for this word! (%s)" % (word))
        return None

    # The word's mask row is found via its position among word2den's keys.
    word_row = list(word2den.keys()).index(word)
    is_positive = mask_matrix[word_row]

    positives = X[is_positive, ID_FEATS:]
    positive_labels = np.ones(len(positives), dtype=int)

    if neg_max == 0:
        return positives, positive_labels

    if neg_max == 'balanced':
        neg_max = len(positive_labels)

    # Shuffle the indices of all non-positive rows, then keep the first neg_max.
    negative_rows = np.arange(mask_matrix.shape[1])[~is_positive]
    np.random.shuffle(negative_rows)
    negatives = X[negative_rows[:neg_max], ID_FEATS:]
    negative_labels = np.zeros(len(negatives), dtype=int)

    features = np.concatenate([positives, negatives], axis=0)
    labels = np.concatenate([positive_labels, negative_labels])
    return shuffle(features, labels)

In [53]:
# Score every trained classifier on a balanced sample of the training data.
# Each line shows the word, its number of training positives and the accuracy.
for this_word, npos, _, this_clsf in wacs:
    X_tst, y_tst = get_X_for_word(X_t, word2den, mask_matrix, this_word, neg_max='balanced')
    print("%s %s \t%s" % (this_word, npos, this_clsf.score(X_tst, y_tst)))

yellow 896 	0.9899553571428571
wooden 53 	1.0
hanging 59 	1.0
second 1922 	0.9419875130072841
kids 56 	1.0
glass 533 	0.9971857410881801
hot 186 	1.0
wine 184 	0.9945652173913043
backpack 100 	0.995
silver 123 	1.0


Performance on training data (!) is, unsurprisingly, pretty good...

In [57]:
# Use the concatenation of the testA and testB image lists as the test set,
# and filter the features and expressions down to it.
rc_all_test = rc_splits['testA'] + rc_splits['testB']
X_ts = filter_X_by_filelist(X, rc_all_test)
refdf_test = filter_refdf_by_filelist(refcoco_refdf, rc_all_test)

In [59]:
# Build the word -> regions dict, the id -> row lookup and the mask matrix for
# the test data, with the same steps used for the training data.
word2den_ts = create_word2den(refdf_test)
X_idx_ts = make_X_id_index(X_ts)
mask_matrix_ts = make_mask_matrix(X_ts, X_idx_ts, word2den_ts, word2den_ts.keys())

In [60]:
# Score every trained classifier on a balanced sample of the test data.
# npos is the number of *training* positives stored alongside the classifier.
for this_word, npos, _, this_clsf in wacs:
    if this_word not in word2den_ts:
        # get_X_for_word would return None for such a word and the tuple
        # unpacking below would raise; report the word and move on instead.
        print("%s %s \tn/a (word does not occur in the test expressions)" % (this_word, npos))
        continue
    X_tst, y_tst = get_X_for_word(X_ts, word2den_ts, mask_matrix_ts, this_word, neg_max='balanced')
    print("%s %s \t%s" % (this_word, npos, this_clsf.score(X_tst, y_tst)))

yellow 896 	0.6483516483516484
wooden 53 	0.6
hanging 59 	0.5
second 1922 	0.6525
kids 56 	0.5
glass 533 	0.7073170731707317
hot 186 	0.65
wine 184 	0.71875
backpack 100 	0.5833333333333334
silver 123 	0.5


It's not looking great at all on the test set... (Although this is not the evaluation that is of ultimate interest here.)