In [7]:
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression

In [2]:
# Load the precomputed feature matrices from ./features as NumPy arrays:
# two description-embedding variants ("concat", "long") and two image-feature
# variants ("1000", "2048"), each with a train and a test file.
# NOTE(review): row i of a description array is presumably paired with row i of
# the matching image array (map20eval relies on that pairing) - confirm.
des_concat_train = np.load('./features/descriptions_train_concat_embed_vectors.npy')
des_concat_test = np.load('./features/descriptions_test_concat_embed_vectors.npy')
des_long_train = np.load('./features/descriptions_train_long_embed_vectors.npy')
des_long_test = np.load('./features/descriptions_test_long_embed_vectors.npy')
image_1000_train = np.load('./features/image_features_1000_train.npy')
image_1000_test = np.load('./features/image_features_1000_test.npy')
image_2048_train = np.load('./features/image_features_2048_train.npy')
image_2048_test = np.load('./features/image_features_2048_test.npy')

In [3]:
def get_nearest_20(vec, vec_set, k=20):
    """Return the indices of the entries of ``vec_set`` closest to ``vec``.

    Closeness is the Euclidean (L2) distance between ``vec`` and each entry
    (row) of ``vec_set``. All distances are computed in one vectorised NumPy
    call instead of one ``np.linalg.norm`` call per row.

    Parameters
    ----------
    vec : array-like
        Query vector; must broadcast against a single row of ``vec_set``.
    vec_set : array-like
        Candidate vectors, one per row (first axis).
    k : int, optional
        Maximum number of indices to return. Defaults to 20, the value that
        was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Up to ``k`` row indices into ``vec_set``, nearest first. Ties are
        broken by the lower row index.
    """
    candidates = np.asarray(vec_set)
    if len(candidates) == 0:
        # Nothing to rank; mirror the empty index array argsort would give.
        return np.array([], dtype=np.intp)
    deltas = candidates - np.asarray(vec)
    # Flatten each row so the norm is the plain L2 norm of every entry,
    # matching what a per-row np.linalg.norm call computes.
    dis = np.linalg.norm(deltas.reshape(len(deltas), -1), axis=1)
    # mergesort is a stable sort, so equal distances keep their original order
    # and repeated runs return the same ranking.
    return np.argsort(dis, kind='mergesort')[:k]

def map20score(y, pred):
    """Score how early the target ``y`` appears in the ranked array ``pred``.

    ``pred`` must support ``in`` and ``.tolist()`` (e.g. a NumPy array).
    A hit at position ``p`` (0-based) is worth ``(20 - p) / 20.0``, so the
    first position scores 1.0 and each later position 0.05 less. A target
    that is absent from ``pred`` scores 0.
    """
    if y not in pred:
        return 0
    position = pred.tolist().index(y)
    return (20 - position) / 20.0

def map20eval(pls, des_vectors, image_vectors):
    """Evaluate a fitted model by retrieving targets for each description.

    Each row of ``des_vectors`` is mapped through ``pls.predict``; the
    prediction is compared against every row of ``image_vectors`` with
    ``get_nearest_20``, and ``map20score`` rewards how early row ``i`` of
    ``image_vectors`` appears in the neighbours of prediction ``i``. Row ``i``
    of both inputs is therefore treated as a matching pair.

    Prints the number of evaluated rows and the mean score.

    Parameters
    ----------
    pls : object with a ``predict(X)`` method (e.g. a fitted PLSRegression)
    des_vectors : array-like, one query per row
    image_vectors : array-like, one retrieval candidate per row

    Returns
    -------
    list
        One score per row of ``des_vectors``.
    """
    image_pred = pls.predict(des_vectors)
    top_20 = [get_nearest_20(vec, image_vectors) for vec in image_pred]
    print(len(top_20))
    scores = [map20score(i, neighbours) for i, neighbours in enumerate(top_20)]
    # Parenthesised single-argument print: identical output under the Python 2
    # print statement and the Python 3 print function, and consistent with the
    # print(...) call above.
    print("score: %f" % np.mean(scores))
    return scores

In [4]:
# Fit a 256-component PLS regression with des_concat_train as X and
# image_1000_train as the target, using all training rows. Every other
# hyper-parameter is left at the default echoed in the cell output.
pls_concat_1000 = PLSRegression(n_components=256)
pls_concat_1000.fit(des_concat_train, image_1000_train)



PLSRegression(copy=True, max_iter=500, n_components=256, scale=True,
       tol=1e-06)

In [6]:
# Map every test description through the fitted model, then rank the test
# image features by distance to each predicted vector.
pred_concat_1000 = pls_concat_1000.predict(des_concat_test)
top_20_concat_1000 = [
    get_nearest_20(vec=vec, vec_set=image_1000_test)
    for vec in pred_concat_1000
]

In [12]:
def output_submission(top_20, output_path):
    """Write a ranking to ``output_path`` as a two-column submission CSV.

    Row ``i`` of the file holds ``"<i>.txt"`` and the entries of ``top_20[i]``
    rendered as ``"<index>.jpg"`` and joined with single spaces.

    Parameters
    ----------
    top_20 : sequence of sequences
        ``top_20[i]`` is the ranked collection of image indices for
        description ``i``.
    output_path : str
        Destination CSV path; the file is written without the DataFrame index.
    """
    # List comprehensions instead of map(): map() only returns a list on
    # Python 2, whereas these yield real lists on every Python version.
    top_20_image_IDs = [
        ' '.join(str(image_idx) + '.jpg' for image_idx in ranked)
        for ranked in top_20
    ]
    description_ID = [str(row_idx) + '.txt' for row_idx in range(len(top_20))]
    # NOTE(review): the 'Descritpion_ID' spelling is kept byte-for-byte;
    # presumably the consumer of the CSV expects exactly this header - confirm
    # before "fixing" it.
    submission_df = pd.DataFrame(
        {'Descritpion_ID': description_ID, 'Top_20_Image_IDs': top_20_image_IDs},
        # Pin the column order explicitly rather than relying on dict ordering.
        columns=['Descritpion_ID', 'Top_20_Image_IDs'])
    submission_df.to_csv(output_path, index=False)


In [13]:
# Export the ranking produced by the 256-component concat model.
output_submission(
    top_20=top_20_concat_1000,
    output_path='submissions/PLSR_256_concat_1000.csv'
)

In [14]:
# Same model as the earlier pls_concat_1000 cell but with max_iter raised from
# the default 500 to 1000. This rebinds pls_concat_1000, so the previously
# fitted model is no longer reachable under that name.
pls_concat_1000 = PLSRegression(n_components=256, max_iter=1000)
pls_concat_1000.fit(des_concat_train, image_1000_train)

PLSRegression(copy=True, max_iter=1000, n_components=256, scale=True,
       tol=1e-06)

In [15]:
# Recompute predictions and rankings with the currently bound pls_concat_1000
# (the max_iter=1000 refit when cells are run top to bottom).
pred_concat_1000 = pls_concat_1000.predict(des_concat_test)
top_20_concat_1000 = [
    get_nearest_20(vec=vec, vec_set=image_1000_test)
    for vec in pred_concat_1000
]


In [16]:
# Export the ranking from the max_iter=1000 run under its own file name.
output_submission(
    top_20=top_20_concat_1000,
    output_path='submissions/PLSR_256_concat_1000_1000iter.csv'
)

In [40]:
# Load the description features stored in the *_BOW_noun.npy files
# (train and test) from the features directory.
des_BOW_noun_train = np.load('features/descriptions_train_BOW_noun.npy')
des_BOW_noun_test = np.load('features/descriptions_test_BOW_noun.npy')

In [113]:
# Hold-out check: fit a 10-component PLS model on the first 8000 training rows
# and report score() (R^2 for scikit-learn regressors) on the remaining rows.
pls_BOW_noun_1000 = PLSRegression(n_components=10)
pls_BOW_noun_1000.fit(des_BOW_noun_train[:8000], image_1000_train[:8000])
# Parenthesised print works as both a Python 2 statement and a Python 3 call.
print(pls_BOW_noun_1000.score(des_BOW_noun_train[8000:], image_1000_train[8000:]))

0.38326131392698193


In [49]:
# Retrieval score of the hold-out model on the rows it was not fitted on.
# NOTE(review): the execution counts show the fit cell above (In [113]) was last
# run after this cell (In [49]), so the recorded score may belong to a
# different model configuration - re-run to confirm.
scores = map20eval(
    pls=pls_BOW_noun_1000,
    des_vectors=des_BOW_noun_train[8000:],
    image_vectors=image_1000_train[8000:]
)

2000
score: 0.290600


In [51]:
# Refit on all training rows with 256 components for the submission run.
# Rebinds pls_BOW_noun_1000, replacing the hold-out model.
pls_BOW_noun_1000 = PLSRegression(n_components=256)
pls_BOW_noun_1000.fit(des_BOW_noun_train, image_1000_train)

PLSRegression(copy=True, max_iter=500, n_components=256, scale=True,
       tol=1e-06)

In [52]:
# Predict image-feature vectors for the test descriptions and rank the test
# images by distance to each prediction.
pred_BOW_noun_1000 = pls_BOW_noun_1000.predict(des_BOW_noun_test)
top_20_BOW_noun_1000 = [
    get_nearest_20(vec=vec, vec_set=image_1000_test)
    for vec in pred_BOW_noun_1000
]

In [53]:
# Export the ranking produced from the BOW-noun description features.
output_submission(
    top_20=top_20_BOW_noun_1000,
    output_path='submissions/PLSR_256_BOW_noun_1000.csv'
)

In [59]:
# Load the *_BOW_noun_5452.npy variant of the description features.
# NOTE(review): this rebinds des_BOW_noun_train / des_BOW_noun_test, so every
# later cell that uses these names sees the _5452 arrays, not the files loaded
# earlier; distinct variable names would avoid the hidden state.
des_BOW_noun_train = np.load('features/descriptions_train_BOW_noun_5452.npy')
des_BOW_noun_test = np.load('features/descriptions_test_BOW_noun_5452.npy')

In [60]:
from sklearn.decomposition import PCA

In [89]:
# PCA that keeps 512 components; used to reduce the BOW-noun description features.
pca = PCA(n_components=512)

In [90]:
# Fit the projection on the training descriptions only; the test descriptions
# are transformed later with this same fitted projection.
pca.fit(des_BOW_noun_train)

PCA(copy=True, iterated_power='auto', n_components=512, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [91]:
# Project both splits with the PCA fitted on the training split.
des_BOW_noun_train_pca = pca.transform(des_BOW_noun_train)
des_BOW_noun_test_pca = pca.transform(des_BOW_noun_test)

In [92]:
# Sanity check of the reduced training matrix shape. The parenthesised print
# behaves the same as a Python 2 statement and as a Python 3 function call.
print(des_BOW_noun_train_pca.shape)

(10000, 512)


In [111]:
pls_BOW_noun_1000 = PLSRegression(n_components=20)
pls_BOW_noun_1000.fit(des_BOW_noun_train_pca[:8000], image_2048_train[:8000])
print pls_BOW_noun_1000.score(des_BOW_noun_train_pca[8000:], image_2048_train[8000:])

0.2656926053530239


In [96]:
# Final 256-component fit on all training rows: PCA-reduced description
# features as X, image_1000_train as the target. Rebinds pls_BOW_noun_1000.
pls_BOW_noun_1000 = PLSRegression(n_components=256)
pls_BOW_noun_1000.fit(des_BOW_noun_train_pca, image_1000_train)

PLSRegression(copy=True, max_iter=500, n_components=256, scale=True,
       tol=1e-06)

In [97]:
# Predict from the PCA-reduced test descriptions and rank the test images by
# distance to each prediction.
pred_BOW_noun_1000_5452 = pls_BOW_noun_1000.predict(des_BOW_noun_test_pca)
top_20_BOW_noun_1000_5452 = [
    get_nearest_20(vec=vec, vec_set=image_1000_test)
    for vec in pred_BOW_noun_1000_5452
]

In [99]:
# Export the ranking built from the PCA-reduced BOW-noun features.
# The argument was previously misspelled `top_20_BOW_noun_1000_5452452`, a name
# that is never defined and raises NameError on a fresh run.
output_submission(top_20_BOW_noun_1000_5452, 'submissions/PLSR_256_BOW_noun_1000_5452_pca_512.csv')

In [100]:
# Load the tag features stored in the tags_BOW_80_*.npy files (train and test);
# the cells below use them as the regression target / retrieval space.
tags_BOW_80_train = np.load('./features/tags_BOW_80_train.npy')
tags_BOW_80_test = np.load('./features/tags_BOW_80_test.npy')

In [106]:
# Hold-out check, des_concat_train -> tags_BOW_80_train: fit a 400-component
# PLS model on the first 8000 rows and report score() on the remaining rows.
pls_concat_tags = PLSRegression(n_components=400)
pls_concat_tags.fit(des_concat_train[:8000], tags_BOW_80_train[:8000])
# Parenthesised print works as both a Python 2 statement and a Python 3 call.
print(pls_concat_tags.score(des_concat_train[8000:], tags_BOW_80_train[8000:]))

0.4628842195026251


In [107]:
# Retrieval score of the description -> tag model on the held-out rows, using
# the held-out tag vectors as the retrieval candidates.
scores = map20eval(
    pls=pls_concat_tags,
    des_vectors=des_concat_train[8000:],
    image_vectors=tags_BOW_80_train[8000:]
)

2000
score: 0.331300


In [110]:
# Hold-out check, des_long_train -> tags_BOW_80_train: fit a 400-component PLS
# model on the first 8000 rows and report score() on the remaining rows.
pls_long_tags = PLSRegression(n_components=400)
pls_long_tags.fit(des_long_train[:8000], tags_BOW_80_train[:8000])
# Parenthesised print works as both a Python 2 statement and a Python 3 call.
print(pls_long_tags.score(des_long_train[8000:], tags_BOW_80_train[8000:]))

0.382746885744779


In [115]:
# Hold-out check, des_BOW_noun_train -> tags_BOW_80_train with 200 components:
# fit on the first 8000 rows and report score() on the remaining rows.
# NOTE(review): des_BOW_noun_train is whichever file was loaded last (the
# _5452 variant when cells run top to bottom) - confirm that is intended.
pls_BOW_noun_tags = PLSRegression(n_components=200)
pls_BOW_noun_tags.fit(des_BOW_noun_train[:8000], tags_BOW_80_train[:8000])
# Parenthesised print works as both a Python 2 statement and a Python 3 call.
print(pls_BOW_noun_tags.score(des_BOW_noun_train[8000:], tags_BOW_80_train[8000:]))

0.3678109582281902


In [117]:
# Hold-out check, PCA-reduced description features -> tags_BOW_80_train with
# 200 components: fit on the first 8000 rows, report score() on the rest.
pls_BOW_noun_pca_tags = PLSRegression(n_components=200)
pls_BOW_noun_pca_tags.fit(des_BOW_noun_train_pca[:8000], tags_BOW_80_train[:8000])
# Parenthesised print works as both a Python 2 statement and a Python 3 call.
print(pls_BOW_noun_pca_tags.score(des_BOW_noun_train_pca[8000:], tags_BOW_80_train[8000:]))

0.4992510740012771


In [118]:
# Refit the 200-component description -> tag model on all training rows for
# the submission run. Rebinds pls_BOW_noun_pca_tags, replacing the hold-out model.
pls_BOW_noun_pca_tags = PLSRegression(n_components=200)
pls_BOW_noun_pca_tags.fit(des_BOW_noun_train_pca, tags_BOW_80_train)

PLSRegression(copy=True, max_iter=500, n_components=200, scale=True,
       tol=1e-06)

In [119]:
# Predict tag vectors for the PCA-reduced test descriptions and rank the test
# tag vectors by distance to each prediction.
pred_BOW_noun_pca_tags = pls_BOW_noun_pca_tags.predict(des_BOW_noun_test_pca)
top_20_BOW_noun_pca_tags = [
    get_nearest_20(vec=vec, vec_set=tags_BOW_80_test)
    for vec in pred_BOW_noun_pca_tags
]

In [120]:
# Export the ranking obtained by retrieving in tag space.
output_submission(
    top_20=top_20_BOW_noun_pca_tags,
    output_path='submissions/PLSR_200_top_20_BOW_noun_pca_512_tags.csv'
)