## Use Theanets to Implement Word/Image Vector fusion
- [source of both idea and code](https://github.com/mganjoo/zslearning)
- [a Theano-based implementation](http://nbviewer.ipython.org/github/renruoxu/data-fusion/blob/master/deprecated/mapping%20(1).ipynb)
- the model is a standard MLP with one hidden layer and a customized cost function
- the data used here: X (image vectors from DeCAF) and Y (word vectors from word2vec)

In [71]:
import theanets
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error, pairwise_distances_argmin, confusion_matrix
from sklearn.preprocessing import StandardScaler
from gensim.models import Word2Vec

In [32]:
## class names, in the label order used throughout this notebook (index 5 is "dog")
LABELS = np.array(["airplane", "automobile", "bird", "cat",
                   "deer", "dog", "frog", "horse", "ship", "truck"])
word2vec = Word2Vec.load_word2vec_format("../data/word2vec.bin", binary = True)
## one word2vec row per class, stacked in the same order as LABELS
label_vecs = np.vstack([word2vec[w] for w in LABELS])

## the store is only read here: open it read-only and always release the file
## handle, even when one of the keys is missing
store = pd.HDFStore("../data/cifa_XY.hd5", mode = "r")
try:
    ## .values replaces the deprecated .get_values() and returns the same ndarray
    X = store["X/"].values
    y = store["Y/"].values
finally:
    store.close()
## recover an integer class index per row by matching each y vector to its
## nearest label vector
labels = pairwise_distances_argmin(y, label_vecs)

In [33]:
## we train a mapping model with data excluding dogs
## and later test whether the dog images are correctly mapped to the word "dog"

## look the held-out class up by name instead of hard-coding its position in LABELS
DOG_LABEL = list(LABELS).index("dog")
dog_index = (labels == DOG_LABEL)  ## boolean mask: True for rows labelled dog
X_nodog, y_nodog = X[~dog_index], y[~dog_index]
X_dog, y_dog = X[dog_index], y[dog_index]
label_nodog, label_dog = labels[~dog_index], labels[dog_index]

In [38]:
def train_model(X, y, test_size = 10000, random_state = None):
    """Fit input/target scalers and a one-hidden-layer regressor mapping X to y.

    The data is split into training and validation parts, both X and y are
    standardized with statistics from the training part only, and a
    theanets Regressor (200 tanh hidden units) is trained with SGD while
    the per-iteration losses are printed.

    Parameters
    ----------
    X : 2-D array, one row of input features per sample
    y : 2-D array, one row of regression targets per sample
    test_size : passed to train_test_split; size of the validation split
        (default 10000, the value previously hard-coded)
    random_state : passed to train_test_split; set it for a reproducible split

    Returns
    -------
    (ss_X, ss_y, network) : the fitted StandardScaler for X, the fitted
        StandardScaler for y, and the trained theanets network. The network
        consumes scaled X and emits scaled y.
    """
    train_X, valid_X, train_y, valid_y = train_test_split(
        X, y, test_size = test_size, random_state = random_state)

    ## scalers are fit on the training split only, then applied to both splits
    ss_X = StandardScaler().fit(train_X)
    ss_y = StandardScaler().fit(train_y)
    scaled_train_X, scaled_valid_X = ss_X.transform(train_X), ss_X.transform(valid_X)
    scaled_train_y, scaled_valid_y = ss_y.transform(train_y), ss_y.transform(valid_y)

    ## size the input/output layers from the data rather than hard-coding 4096/100
    n_in, n_out = np.shape(train_X)[1], np.shape(train_y)[1]
    exp = theanets.Experiment(theanets.Regressor, layers = (n_in, (200, "tanh"), n_out))
    for train, valid in exp.itertrain(train_set = (scaled_train_X, scaled_train_y),
                                      valid_set = (scaled_valid_X, scaled_valid_y),
                                      optimize = "sgd", learning_rate = 0.005, validate_every = 5,
                                      hidden_l1 = 0.01, weight_l2 = 1e-4):
        ## a single pre-formatted string prints identically under Python 2 and 3
        print("train loss(err) %s (%g) valid loss(err) %s (%g)"
              % (train['loss'], train["err"], valid['loss'], valid['err']))
    return ss_X, ss_y, exp.network

## fit the scalers and the mapping network on every class except dog
ss_X, ss_y, model = train_model(X_nodog, y_nodog)

train loss(err) 1.67341068732 (0.592448) valid loss(err) 3.58908356034 (2.24289)
train loss(err) 1.33657578526 (0.434573) valid loss(err) 3.58908356034 (2.24289)
train loss(err) 1.18820181879 (0.393524) valid loss(err) 3.58908356034 (2.24289)
train loss(err) 1.08770575601 (0.369019) valid loss(err) 3.58908356034 (2.24289)
train loss(err) 1.01357760272 (0.351119) valid loss(err) 3.58908356034 (2.24289)
train loss(err) 0.952931391269 (0.33796) valid loss(err) 0.995105462689 (0.356418)
train loss(err) 0.901132806375 (0.327501) valid loss(err) 0.995105462689 (0.356418)
train loss(err) 0.857481872267 (0.318623) valid loss(err) 0.995105462689 (0.356418)
train loss(err) 0.821514604848 (0.311487) valid loss(err) 0.995105462689 (0.356418)
train loss(err) 0.789241926549 (0.30546) valid loss(err) 0.995105462689 (0.356418)
train loss(err) 0.760073281587 (0.300856) valid loss(err) 0.805549274206 (0.32808)
train loss(err) 0.730566200187 (0.293762) valid loss(err) 0.805549274206 (0.32808)
train loss(

In [41]:
def predict_by_model(xscaler, yscaler, model, X):
    """Return, for each row of X, the index of the nearest row of label_vecs.

    X is scaled with xscaler, pushed through model.predict, and the
    prediction is mapped back with yscaler.inverse_transform before the
    nearest label vector is looked up.
    """
    scaled_inputs = xscaler.transform(X)
    scaled_outputs = model.predict(scaled_inputs)
    ## undo the target scaling so distances are measured against the raw label vectors
    yhat = yscaler.inverse_transform(scaled_outputs)
    return pairwise_distances_argmin(yhat, label_vecs)

In [69]:
## for images of the classes seen during training (train + validation splits)
yhat_nodog = predict_by_model(ss_X, ss_y, model, X_nodog)
NODOG_LABELS = [l for l in LABELS if l != "dog"]
## predictions are drawn from all of LABELS (dog included), so pin the matrix to
## the non-dog class indices; without this a single "dog" prediction would add a
## row/column and no longer match the 9 names used for index/columns below
NODOG_INDICES = [i for i, l in enumerate(LABELS) if l != "dog"]
cm = pd.DataFrame(confusion_matrix(label_nodog, yhat_nodog, labels = NODOG_INDICES),
                  index = NODOG_LABELS, columns = NODOG_LABELS)
cm

Unnamed: 0,airplane,automobile,bird,cat,deer,frog,horse,ship,truck
airplane,5314,28,124,29,46,17,48,248,146
automobile,99,5272,12,14,6,6,16,49,526
bird,133,5,4902,409,265,156,94,26,10
cat,34,4,306,5052,174,226,146,28,30
deer,46,2,306,217,5089,148,157,24,11
frog,19,2,141,310,148,5324,34,13,9
horse,22,3,94,339,224,13,5253,22,30
ship,179,22,29,17,6,7,17,5635,88
truck,77,88,13,19,10,14,25,62,5692


In [72]:
## for unseen (dog) images - which label words are they mapped to?
yhat_dog = predict_by_model(ss_X, ss_y, model, X_dog)
dog_words = LABELS[yhat_dog]  ## predicted indices -> class names
Counter(dog_words)

Counter({'cat': 3830, 'bird': 629, 'horse': 622, 'deer': 522, 'frog': 317, 'truck': 32, 'airplane': 24, 'ship': 16, 'automobile': 8})

In [73]:
## distinct label indices predicted for the held-out dog images
np.unique(yhat_dog)

array([0, 1, 2, 3, 4, 6, 7, 8, 9])