In [4]:
%load_ext autoreload
%autoreload 2
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
import json
import pandas as pd
import os.path
from os.path import join, expanduser
from torch.utils.data import Dataset
from torch import nn
import random
import torch.optim
from fit_mteb_pytorch import *

# Load embeddings for corpus and queries
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)
dset = MiniMarcoDataset()
dset.query_ids = dset.query_ids[:4000]
print('data has', len(dset), 'queries and', len(dset.corpus_ids), 'documents')
query_ids_train, query_ids_test = train_test_split(
    dset.query_ids, random_state=1, test_size=0.75)

# evaluate base embeddings
embs_qa, embs_tfidf, labels = dset[query_ids_test]
mrr, top1_frac, top3_frac, mrr_sem, top1_frac_sem, top3_frac_sem = evaluate_retrieval(
    embs_tfidf, dset.embs_tfidf_corpus_df.values,
    labels=labels, corpus_ids=dset.corpus_ids)
print(f'TF-IDF Test: {mrr=:.2f}\pm{mrr_sem:0.2f}, {top1_frac=:.2f}\pm{top1_frac_sem:0.2f},  {top3_frac=:.2f}\pm{top3_frac_sem:0.2f}')
mrr, top1_frac, top3_frac, mrr_sem, top1_frac_sem, top3_frac_sem = evaluate_retrieval(
    embs_qa, dset.embs_qa_corpus_df.values,
    labels=labels, corpus_ids=dset.corpus_ids)
print(f'QA Test: {mrr=:.2f}\pm{mrr_sem:0.2f}, {top1_frac=:.2f}\pm{top1_frac_sem:0.2f},  {top3_frac=:.2f}\pm{top3_frac_sem:0.2f}')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
loading qa embeddings...
computing tf-idf embeddings...
data has 4000 queries and 5210 documents
TF-IDF Test: mrr=0.79\pm0.01, top1_frac=0.71\pm0.01,  top3_frac=0.86\pm0.01
QA Test: mrr=0.45\pm0.01, top1_frac=0.34\pm0.01,  top3_frac=0.50\pm0.01


In [2]:
class LinearMapping(nn.Module):
    def __init__(self, n_features, tfidf_size=None):
        super(LinearMapping, self).__init__()
        if tfidf_size is not None:
            n_features = n_features - tfidf_size
        self.linear = nn.Linear(n_features, n_features)
        self.linear.weight.data = torch.eye(n_features, n_features)
        self.tfidf_size = tfidf_size

    def forward(self, x):
        if self.tfidf_size is not None:
            return torch.hstack((self.linear(x[:, :-self.tfidf_size]), x[:, -self.tfidf_size:]))
        return self.linear(x)


class RescaleMapping(nn.Module):
    def __init__(self, n_features, tfidf_size=None):
        super(RescaleMapping, self).__init__()
        if tfidf_size is not None:
            n_features = n_features - tfidf_size

        # elementwise multiply input by vector instead
        self.linear = nn.Parameter(torch.ones(n_features))
        self.linear.data = 1e-8 * self.linear.data
        self.tfidf_size = tfidf_size

    def forward(self, x):
        # elementwise multiply input by vector instead
        if self.tfidf_size is not None:
            return torch.hstack((x[:, :-self.tfidf_size] * self.linear, x[:, -self.tfidf_size:]))
        return x * self.linear


# set args
device = 'cuda'
use_tfidf = True
num_dissimilar_examples = 5
# lr = 1e-1
mapping = RescaleMapping
# mapping = LinearMapping
# note, need to consider what we are initializing linear models to (sometimes close to 0 is preferred, sometimes close to 1)
# lr = 5e-3
lr = 1e-4

# get data
embs_qa, embs_tfidf, labels = dset[query_ids_train]
embs_qa_similar = np.vstack(
    [dset.embs_qa_corpus_df.loc[lab[0]].values for lab in labels])
embs_gt = dset.embs_qa_corpus_df.values
embs_qa_test, embs_tfidf_test, labels_test = dset[query_ids_test]

if use_tfidf:
    embs_tfidf_similar = np.vstack(
        [dset.embs_tfidf_corpus_df.loc[lab[0]].values for lab in labels])
    embs_tfidf_gt = dset.embs_tfidf_corpus_df.values

    # concatenate embs with tfidf
    embs_qa = np.hstack([embs_qa, embs_tfidf])
    embs_qa_similar = np.hstack([embs_qa_similar, embs_tfidf_similar])
    embs_qa_test = np.hstack([embs_qa_test, embs_tfidf_test])
    embs_gt = np.hstack([embs_gt, embs_tfidf_gt])
    tfidf_size = embs_tfidf.shape[1]
else:
    tfidf_size = None


# put all data on GPU
def _tensor(x):
    return torch.tensor(x, dtype=torch.float).to(device)


embs_qa = _tensor(embs_qa)
embs_qa_similar = _tensor(embs_qa_similar)
embs_qa_test = _tensor(embs_qa_test)
embs_gt = _tensor(embs_gt)


def get_dissimilar_examples(num_dissimilar_examples=25, use_tfidf=False):
    embs_qa_dissimilar = np.vstack([
        dset.embs_qa_corpus_df.loc[dset.get_random_neg_corpus_id(q)].values
        for _ in range(num_dissimilar_examples)
        for q in query_ids_train
    ])
    if use_tfidf:
        embs_tfidf_dissimilar = np.vstack([
            dset.embs_tfidf_corpus_df.loc[dset.get_random_neg_corpus_id(
                q)].values
            for _ in range(num_dissimilar_examples)
            for q in query_ids_train
        ])
        embs_qa_dissimilar = np.hstack(
            [embs_qa_dissimilar, embs_tfidf_dissimilar])
    return _tensor(embs_qa_dissimilar)


# set random seed
torch.manual_seed(1)
np.random.seed(1)
random.seed(1)
model = mapping(
    embs_qa.shape[1],
    tfidf_size=tfidf_size).to('cuda')
model_corpus = mapping(
    embs_qa.shape[1],
    tfidf_size=tfidf_size).to('cuda')
criterion = nn.MSELoss()
# criterion = nn.CosineEmbeddingLoss()
optimizer = torch.optim.AdamW(
    list(model.parameters()) + list(model_corpus.parameters()), lr=lr)
for epoch in range(100):
    model.eval()
    # evaluate train
    output = model(embs_qa)
    mrr, mtop1 = evaluate_retrieval(
        output.cpu().detach().numpy(),
        model_corpus(embs_gt).cpu().detach().numpy(),
        labels=labels, corpus_ids=dset.corpus_ids)
    print(f'\tQA Train: {mrr=:.3f}, {mtop1=:.3f}')

    # evaluate test
    output = model(embs_qa_test)
    mrr, mtop1 = evaluate_retrieval(
        output.cpu().detach().numpy(),
        model_corpus(embs_gt).cpu().detach().numpy(),
        labels=labels_test, corpus_ids=dset.corpus_ids)
    print(f'\tQA Test: {mrr=:.3f}, {mtop1=:.3f}')

    # sample batch of dissimilar examples
    embs_qa_dissimilar = get_dissimilar_examples(
        num_dissimilar_examples, use_tfidf)

    model.train()
    optimizer.zero_grad()
    output = model(embs_qa)
    loss = criterion(output, model_corpus(embs_qa_similar)) - 0.1 * \
        criterion(torch.vstack(
            [output] * num_dissimilar_examples), model_corpus(embs_qa_dissimilar))
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch} Loss {loss.item():0.3e}')

ValueError: too many values to unpack (expected 2)