In [58]:
from __future__ import division
import codecs
import json
from collections import Counter, defaultdict
import ConfigParser
import sys
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.utils import shuffle

In [5]:
config = ConfigParser.SafeConfigParser()
my_config = '/Users/das/work/svn/Gits/a_Projects/Projects/Infrastructure/dsg-vision/Config/default.cfg'
with codecs.open(my_config, 'r', encoding='utf-8') as f:
    config.readfp(f)

corpora_base = config.get('DEFAULT', 'corpora_base')

dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')

preproc_path = dsgv_home + '/Preproc/PreprocOut/'
feats_path = dsgv_home + '/ExtractFeats/ExtractOut/'

# The first features in the image feature Xs encode the region ID
ID_FEATS = 3

In [6]:
with open(preproc_path + 'refcoco_splits.json', 'r') as f:
    rc_splits = json.load(f)
    
X = np.load(feats_path + 'mscoco_vgg19-fc2.npz')['arr_0']

refcoco_refdf = pd.read_json(preproc_path + 'refcoco_refdf.json.gz',
                         typ='frame', orient='split', compression='gzip')

In [7]:
def filter_X_by_filelist(X, filelist):
    tmp_df = pd.DataFrame(X)
    return np.array(tmp_df[tmp_df.iloc[:,1].isin(filelist)])

def filter_refdf_by_filelist(refdf, filelist):
    return pd.merge(refdf, pd.DataFrame(filelist, columns=['image_id']))

In [10]:
X_t = filter_X_by_filelist(X, rc_splits['train'])
refdf_train = filter_refdf_by_filelist(refcoco_refdf, rc_splits['train'])

In [13]:
def create_word2den(refdf, refcol='refexp', regcol='region_id'):
    '''Given refdf, returns dict of occurences (id triples) of words from expressions.'''
    word2den = defaultdict(list)
    for _, row in refdf.iterrows():
        exprlist = row[refcol].split()
        # TODO: Could take filter function that filters out some occurences.
        #   E.g., tagger that tags whole expression & returns only the nouns.
        for word in exprlist:
            word_den_list = word2den[word].append((row['i_corpus'],
                                                   row['image_id'],
                                                   row[regcol]))
    return {k: list(set(v)) for k,v in word2den.items()}

In [14]:
%%time
word2den = create_word2den(refdf_train)

CPU times: user 19.8 s, sys: 1.23 s, total: 21 s
Wall time: 20.8 s


In [15]:
def make_X_id_index(X, id_feats=ID_FEATS):
    return dict(zip([tuple(e) for e in X[:,:id_feats].astype(int).tolist()], range(len(X))))

In [16]:
%%time
X_idx = make_X_id_index(X_t)

CPU times: user 440 ms, sys: 626 ms, total: 1.07 s
Wall time: 1.07 s


In [19]:
def make_mask_matrix(X, X_idx, word2den, wordlist):
    mask_matrix = []
    for this_word in wordlist:
        this_word_vec = np.zeros(len(X))
        if this_word in word2den:
            this_word_vec[[X_idx[i] for i in word2den[this_word] if i in X_idx]] = 1
        mask_matrix.append(this_word_vec)
    mask_matrix = np.array(mask_matrix, dtype=bool)
    return mask_matrix

In [20]:
mask_matrix = make_mask_matrix(X_t, X_idx, word2den, word2den.keys())

In [59]:
def make_train_for_word(X, word2den, mask_matrix, word, neg_max=None):
    if word not in word2den:
        #raise ValueError("No mask available for this word! (%s)" % (word))
        print "Error!! No mask available for this word! (%s)" % (word)
        return None
    this_mask = mask_matrix[word2den.keys().index(word)]
    X_pos = X[this_mask, ID_FEATS:]
    y_pos = np.ones(len(X_pos))
    X_neg = X[~this_mask, ID_FEATS:]
    # TODO: if neg_max, shuffle X_neg and limit to first neg_max rows..
    y_neg = np.zeros(len(X_neg))
    X_out = np.concatenate([X_pos, X_neg], axis=0)
    y_out = np.concatenate([y_pos, y_neg])
    return shuffle(X_out, y_out)

In [60]:
X_this_w, y_this_w = make_train_for_word(X_t, word2den, mask_matrix, 'cow')

In [61]:
X_this_w.shape, y_this_w.shape

((166354, 4106), (166354,))

In [65]:
y_this_w.sum() == len(word2den['cow'])

True

In [27]:
min_freq = 40

In [23]:
counts = mask_matrix.sum(axis=1)

In [28]:
wordlist = np.array(word2den.keys())[counts > min_freq]

In [33]:
classifier = linear_model.LogisticRegression(penalty='l1', warm_start=True)

In [None]:
wacs = []
for this_word in wordlist[:10]:
    print ".",
    X_this_w, y_this_w = make_train_for_word(X_t, word2den, mask_matrix, this_word)
    this_wac = classifier.fit(X_this_w, y_this_w)
    wacs.append((word, this_wac))

.

In [None]:
1

In [None]:
1

In [1]:
1

1

Ok, so this is where I am now:

* Getting the negative examples is rather slow, because it cuts out so much of X.. Investigate whether negating mask already is slow, and if I can speed this up.
* This presumably also makes the training slow (together with the fact that the vectors are so wide now). Investigate training in keras (and on GPU)?