```
Alsh data: http://manikvarma.org/downloads/XC/XMLRepository.html 

The downloaded files were renamed to train.txt and test.txt.
```

In [None]:
# Move the working directory one level up; the relative paths used by the cells
# below ('./faiss/', './python/', './data/...') are resolved against it.
# NOTE(review): not idempotent - re-running this cell moves up another level.
cd ..

# Data

In [None]:
import sys
import os

sys.path.append(os.path.abspath('./faiss/'))
sys.path.append(os.path.abspath('./python/'))

from experiments.data import get_data
from misc.utils import to_ft, load_sift

In [13]:
# Training split: the call returns four values; the two masks are kept so the
# test split below can be built with the same ones.
# NOTE(review): min_words / min_labels are presumably frequency thresholds used
# to derive the masks - confirm in experiments.data.get_data.
X, Y, words_mask, labels_mask = get_data('./data/LSHTC', 'train', min_words=3, min_labels=3)
# Presumably dumps (X, Y) in fastText's text format; this path is the `input`
# of the fastText training command defined further down.
to_ft(X, Y, './data/LSHTC-FT/train.txt')

# Test split, built with the masks obtained from the training split; only the
# first two returned values are used here.
X, Y, *_ = get_data('./data/LSHTC', 'test', words_mask=words_mask, labels_mask=labels_mask)
to_ft(X, Y, './data/LSHTC-FT/test.txt')

2017-11-28 23:15:49,962 - root - INFO - Data not found or `force` flag was passed, I'm going to prepare it and store at ./data/LSHTC/X_train.csr.npz.
2017-11-28 23:21:12,478 - root - INFO - Data not found or `force` flag was passed, I'm going to prepare it and store at ./data/LSHTC/X_test.csr.npz.


In [24]:
def make_cmd(*args, **kwargs):
    """Build the argv list for a call to the local fastText binary.

    Positional arguments come first (sub-command, file paths, ...), followed by
    every keyword argument rendered as a `-name value` pair, in the order given.

    The list is assembled token by token rather than by joining everything into
    one string and splitting it on whitespace, so an argument that contains
    spaces (for example a path) stays a single argv entry.

    Returns:
        list of str, suitable for `subprocess.call` and friends.
    """
    cmd = ['./fastText/fasttext']
    cmd.extend(str(arg) for arg in args)

    for name, value in kwargs.items():
        cmd.extend((f'-{name}', str(value)))

    return cmd

# fastText training command. Every keyword below is turned by make_cmd into a
# `-name value` pair on the command line, in the order written here.
train_cmd = make_cmd('supervised', 
                     input         = './data/LSHTC-FT/train.txt',
                     # Output prefix; fastText writes the model to `<output>.bin`.
                     output        = './data/LSHTC-FT/model.ft',
                     minCount      = 5,
                     minCountLabel = 5,
                     lr            = 0.1,
                     lrUpdateRate  = 100,
                     # 256 matches the vector dimensionality reported when the
                     # exported vectors are loaded later in the notebook.
                     dim           = 256,
                     ws            = 5,
                     epoch         = 25,
                     neg           = 25,
                     loss          = 'ns',
                     thread        = 8,
                     saveOutput    = 1)

# Export command: reads the trained model (`model.ft` + `.bin`) and the test set.
# NOTE(review): `to-fvecs` is not a stock fastText sub-command; presumably it is
# provided by the local ./fastText build. The files loaded later
# (fvecs.hid.fvecs, fvecs.wo.fvecs, fvecs.labels.txt) share the prefix given as
# the last argument here - confirm they are produced by this command.
generate_cmd = make_cmd('to-fvecs',
                        './data/LSHTC-FT/model.ft.bin',
                        './data/LSHTC-FT/test.txt',
                        './data/LSHTC-FT/fvecs')


In [19]:
import subprocess

# check_call raises CalledProcessError on a non-zero exit status, so a failed
# training run stops the cell here instead of letting the export step run
# against a missing or stale model file. On success it returns 0, like call().
subprocess.check_call(train_cmd)
subprocess.check_call(generate_cmd)

0

0

# Tests

In [1]:
import sys
import os

sys.path.append(os.path.abspath('./faiss/'))
sys.path.append(os.path.abspath('./python/'))

import faiss
import mips

import numpy as np

from experiments.data import get_data
from misc.utils import to_ft, load_sift

import uuid
import time, datetime

from contextlib import contextmanager
import json

from tinydb import TinyDB, where

import logging

# Configure the root logger at DEBUG level. basicConfig does nothing when the
# root logger already has handlers.
# NOTE(review): the timestamped format of the recorded outputs suggests a
# handler is installed elsewhere (possibly by an imported module) - confirm.
logging.basicConfig(level=logging.DEBUG)
# getLogger() without a name returns the root logger.
logger = logging.getLogger()

In [2]:
@contextmanager
def timer():
    """Context manager measuring how long its `with` body takes.

    Yields an object whose `elapsed` attribute is 0 while the body runs and is
    set to the duration in seconds once the body exits. The duration is
    recorded even when the body raises; the exception still propagates.

    Usage:
        with timer() as t:
            work()
        t.elapsed  # seconds spent in work()
    """
    class Clock:
        elapsed = 0

    clock = Clock()

    # perf_counter is monotonic and high-resolution, so the measurement is not
    # affected by system clock adjustments.
    t0 = time.perf_counter()

    try:
        yield clock
    finally:
        # Runs on normal exit and when the body raised.
        clock.elapsed = time.perf_counter() - t0


def search(idx, data, k):
    """Query `idx` for `k` results per row of `data`.

    Calls `idx.search(data, k)` and returns its two results (distances,
    indices), each reshaped to `(-1, k)` so there is one row per query.
    """
    distances, neighbours = idx.search(data, k)

    return distances.reshape(-1, k), neighbours.reshape(-1, k)


def compute_p1(G, I):
    """Precision@1: fraction of queries whose top prediction is a true label.

    Args:
        G: sequence of collections; G[i] holds the correct ids for query i.
        I: iterable with the top-1 predicted id of every query, in the same
           order as G. Each entry is converted with int() before the lookup.

    Returns:
        Number of hits divided by len(G), as a float. The result is normalised
        by len(G), so G and I are expected to have the same length. An empty G
        yields 0.0 instead of a ZeroDivisionError.
    """
    if len(G) == 0:
        return 0.

    hits = sum(int(item) in G[i] for i, item in enumerate(I))

    return hits / len(G)


def test_idx(IdxClass, params, xb, xq, G, k=100):
    """Build, time and score one index configuration.

    Instantiates `IdxClass(**params)`, times train + add on `xb`, times a
    `k`-result search over `xq`, and scores the first result of every query
    against `G` with compute_p1.

    Returns:
        The dict built by make_report on success. If any Exception is raised
        along the way, a 'FAILED: ...' line is printed and the exception
        message is returned as a string instead, so a parameter sweep keeps
        going after a failing configuration.
    """
    try:
        index = IdxClass(**params)

        with timer() as build_clock:
            index.train(xb)
            index.add(xb)

        with timer() as query_clock:
            _, neighbours = search(index, xq, k)

        # Only the best-ranked result of each query is scored.
        precision = compute_p1(G, neighbours[:, 0])

        return make_report(IdxClass, params, precision,
                           build_clock.elapsed, query_clock.elapsed)

    except Exception as err:
        message = str(err)
        print('FAILED: ' + message)
        return message


def now():
    """Return the current local time formatted as 'DD-MM-YY HH:MM:SS'."""
    current = datetime.datetime.now()
    return current.strftime("%d-%m-%y %H:%M:%S")


def make_report(IdxClass, params, p1, train_t, search_t):
    """Bundle one benchmark run into a dict.

    The dict carries a fresh random hex 'ID', the class name under 'algo', and
    the given `params`, `p1`, `train_t` and `search_t` values under keys of the
    same names.
    """
    report = dict(ID=uuid.uuid4().hex, algo=IdxClass.__name__)
    report.update(params=params, p1=p1, train_t=train_t, search_t=search_t)
    return report


def add_result(r):
    """Log one benchmark result and append it to this session's DB document.

    Args:
        r: either a report dict as built by make_report, or any other value
           (test_idx returns the error message string when a run fails), which
           is logged and stored as is.

    Relies on the notebook-level `logger`, `DB` and `ID`: the result is appended
    to the `results` list of the document whose 'ID' equals `ID`.
    """
    if isinstance(r, dict):
        # Only the fields shown in the log line are read from the report.
        params, p1, t = r['params'], r['p1'], r['search_t']
        rep = f'(params={params}, p1={p1:.2f}, t={t:.2f})'
    else:
        rep = r

    logger.info(f'Adding: {rep}')

    def result_adder(doc):
        doc['results'].append(r)

    DB.update(result_adder, where('ID') == ID)


def test(IdxClass, **params):
    """Benchmark `IdxClass` on the notebook-level xb / xq / G with k fixed at 100.

    The keyword arguments are collected into the `params` dict that test_idx
    uses to construct the index.
    """
    return test_idx(IdxClass=IdxClass, params=params, xb=xb, xq=xq, G=G, k=100)


In [3]:
# Uncomment the next line to start from an empty results file:
# !rm ./data/results/ad-hoc-db.json
# Results are kept in a TinyDB JSON file; every run of this cell starts a new
# session identified by a fresh random hex ID.
DB = TinyDB('./data/results/ad-hoc-db.json')
ID = uuid.uuid4().hex

# Session document: add_result() appends to the `results` list of the document
# whose 'ID' field equals ID.
info = dict(
    ID = ID,
    name = 'ad-hoc-results',
    date = now(),
    results = []
)

DB.insert(info)

1

In [4]:
LIMIT = 250_000
SEED  = 0  # fixed so the same query subsample is drawn on every run

# LOAD --------------

# xq holds the query (example) vectors, xb one vector per class.
xq = load_sift('./data/LSHTC-FT/fvecs.hid.fvecs', dtype=np.float32)
xb = load_sift('./data/LSHTC-FT/fvecs.wo.fvecs', dtype=np.float32)

_n, d, c = xq.shape[0], xq.shape[1], xb.shape[0]

# LIMIT --------------

# Sample query rows without replacement. The sample size is capped at _n
# because asking for more rows than exist raises a ValueError.
inds = np.random.RandomState(SEED).choice(_n, min(LIMIT, _n), replace=False)
xq   = xq[inds, :]

xq = np.copy(np.ascontiguousarray(xq), order='C')
xb = np.copy(np.ascontiguousarray(xb), order='C')

n = xq.shape[0]

# GT --------------

# One line per query: whitespace-separated integer ids of its positive classes.
G = []
with open('./data/LSHTC-FT/fvecs.labels.txt') as labels_file:
    for line in labels_file:
        G.append({int(y) for y in line.split()})
# Keep the ground truth aligned with the sampled query rows.
G = [G[idx] for idx in inds]

In [5]:
# Summary of the loaded data; the `:_` format spec prints integers with
# underscores as thousands separators.
# NOTE(review): 'dimensionsl' in the first message is a typo for 'dimensional';
# it is a runtime string, so it is left untouched here.
logger.info(f"Loaded dataset of {_n:_}, {d:_}-dimensionsl queries (examples), but limiting to {LIMIT:_} queries")
logger.info(f"The dataset contains {c:_} classes, and more than one class can be positive")

2017-11-29 03:35:45,205 - root - INFO - Loaded dataset of 576_246, 256-dimensionsl queries (examples), but limiting to 250_000 queries
2017-11-29 03:35:45,205 - root - INFO - The dataset contains 163_679 classes, and more than one class can be positive


# Proxies

In [6]:
class IVFIndex:
    """Keyword-argument proxy around a faiss IVF/Flat inner-product index.

    The index is built with faiss.index_factory from the description
    "IVF{size},Flat" using METRIC_INNER_PRODUCT, and `nprobe` is set on it.
    Any attribute not found on the proxy (train, add, search, ...) is looked up
    on the wrapped index.

    Args:
        d: vector dimensionality.
        size: number used in the "IVF{size}" part of the factory string.
        nprobe: value assigned to the index's `nprobe` attribute.
    """
    def __init__(self, d, size, nprobe):
        self.index = faiss.index_factory(d, f"IVF{size},Flat", faiss.METRIC_INNER_PRODUCT)
        self.index.nprobe = nprobe

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If `index` itself is
        # missing (instance created without __init__, e.g. by copy or pickle),
        # reading self.index below would re-enter __getattr__ endlessly.
        if name == 'index':
            raise AttributeError(name)
        return getattr(self.index, name)
    
    
class KMeansIndex:
    """Keyword-argument proxy around mips.IndexHierarchicKmeans.

    Builds a mips.MipsAugmentationShrivastava(d, m, U) and passes it, together
    with `d`, `layers` and `nprobe`, to mips.IndexHierarchicKmeans. Any
    attribute not found on the proxy (train, add, search, ...) is looked up on
    the wrapped index.

    NOTE(review): `m` and `U` are presumably the parameters of the
    Shrivastava-Li MIPS augmentation - confirm against the mips module.
    """
    def __init__(self, d, layers, nprobe, m, U):
        # The augmentation object is also stored on the proxy as `aug`.
        self.aug = mips.MipsAugmentationShrivastava(d, m, U)
        self.index = mips.IndexHierarchicKmeans(d, layers, nprobe, self.aug)

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If `index` itself is
        # missing (instance created without __init__, e.g. by copy or pickle),
        # reading self.index below would re-enter __getattr__ endlessly.
        if name == 'index':
            raise AttributeError(name)
        return getattr(self.index, name)

# Round 1

IVF

In [None]:
# Round 1, IVF: a single list count, sweeping nprobe.
size = 4096
for nprobe in (1, 16, 32, 64, 128):
    add_result(test(IVFIndex, d=d, size=size, nprobe=nprobe))

2017-11-29 03:36:18,262 - root - INFO - Adding: (params={'d': 256, 'size': 4096, 'nprobe': 1}, p1=0.20, t=2.22)
2017-11-29 03:36:41,395 - root - INFO - Adding: (params={'d': 256, 'size': 4096, 'nprobe': 16}, p1=0.26, t=8.90)
2017-11-29 03:37:14,259 - root - INFO - Adding: (params={'d': 256, 'size': 4096, 'nprobe': 32}, p1=0.26, t=17.66)
2017-11-29 03:38:04,275 - root - INFO - Adding: (params={'d': 256, 'size': 4096, 'nprobe': 64}, p1=0.26, t=35.05)
2017-11-29 03:39:23,187 - root - INFO - Adding: (params={'d': 256, 'size': 4096, 'nprobe': 128}, p1=0.26, t=63.53)


Kmeans

In [None]:
# Round 1, hierarchical k-means: two layers, sweeping nprobe.
layers = 2
for nprobe in (1, 16, 32, 64, 128):
    add_result(test(KMeansIndex, d=d, layers=layers, nprobe=nprobe, m=5, U=0.85))

2017-11-29 03:39:58,857 - root - INFO - Adding: (params={'d': 256, 'layers': 2, 'nprobe': 1, 'm': 5, 'U': 0.85}, p1=0.10, t=3.19)
2017-11-29 03:41:06,709 - root - INFO - Adding: (params={'d': 256, 'layers': 2, 'nprobe': 16, 'm': 5, 'U': 0.85}, p1=0.23, t=28.02)
2017-11-29 03:42:24,757 - root - INFO - Adding: (params={'d': 256, 'layers': 2, 'nprobe': 32, 'm': 5, 'U': 0.85}, p1=0.25, t=46.81)
2017-11-29 03:44:53,502 - root - INFO - Adding: (params={'d': 256, 'layers': 2, 'nprobe': 64, 'm': 5, 'U': 0.85}, p1=0.25, t=117.94)


# Round 2

IVF

In [None]:
# Round 2, IVF: same list count as round 1, larger nprobe values.
size = 4096
for nprobe in (256, 512):
    add_result(test(IVFIndex, d=d, size=size, nprobe=nprobe))

Kmeans

In [None]:
# Round 2, hierarchical k-means: two layers, larger nprobe values.
layers = 2
for nprobe in (256, 512):
    add_result(test(KMeansIndex, d=d, layers=layers, nprobe=nprobe, m=5, U=0.85))

In [None]:
# Round 2, hierarchical k-means: three layers, the round-1 nprobe sweep.
layers = 3
for nprobe in (1, 16, 32, 64, 128):
    add_result(test(KMeansIndex, d=d, layers=layers, nprobe=nprobe, m=5, U=0.85))

# Baseline

In [None]:
class FlatIPIndex:
    """Keyword-argument proxy around faiss.IndexFlatIP (exhaustive inner-product search).

    test() builds every index as IdxClass(**params), but the SWIG-generated
    faiss constructors accept positional arguments only, so passing
    faiss.IndexFlatIP directly fails with a TypeError and the baseline is
    recorded as an error string. This proxy takes `d` by keyword and forwards
    it positionally; all other attributes are looked up on the wrapped index.

    Reports produced through this proxy carry 'FlatIPIndex' as their 'algo'.
    """
    def __init__(self, d):
        self.index = faiss.IndexFlatIP(d)

    def __getattr__(self, name):
        # Guard against endless recursion when `index` itself is not set yet.
        if name == 'index':
            raise AttributeError(name)
        return getattr(self.index, name)


add_result(
    test(
        FlatIPIndex, d=d))