This notebook reproduces parts of our experimental setup.

### Set the root directory where data will be stored

In [None]:
# Root directory for every downloaded and generated file. The shell cells in this
# notebook reference it as $DATA_ROOT, and it is later wrapped in pathlib.Path.
DATA_ROOT = '/tmp/mips-storage'

### Download the data

You can get the SIFT dataset from [here](http://corpus-texmex.irisa.fr/). Look for `ANN_SIFT1M` under `Details and Download`. Alternatively, you can use the commands below.

In [None]:
# Fetch the SIFT archive into DATA_ROOT (-P sets wget's target directory) ...
!wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz -P $DATA_ROOT
# ... and unpack it inside DATA_ROOT as well (-C sets tar's extraction directory).
!tar -xvf $DATA_ROOT/sift.tar.gz -C $DATA_ROOT

You can get the Wiki dataset from [here](http://manikvarma.org/downloads/XC/XMLRepository.html). Look for `WikiLSHTC-325K`.

[Here](https://drive.google.com/file/d/0B3lPMIHmG6vGSHE1SWx4TVRva3c/view) is the direct Google Drive link. Please download the archive to the `${DATA_ROOT}` directory.


In [None]:
# Unpack the manually downloaded WikiLSHTC.zip into DATA_ROOT (-d sets the
# extraction directory). The archive must already be present in DATA_ROOT.
!unzip $DATA_ROOT/WikiLSHTC.zip -d $DATA_ROOT

### Rename the files

In [None]:
import os
from pathlib import Path

In [None]:
# Wrap the configured root in a Path so dataset locations can be derived from it.
ROOT = Path(DATA_ROOT)

# Per-dataset directories underneath the root.
WIKI = ROOT.joinpath('WikiLSHTC')
SIFT = ROOT.joinpath('sift')

In [None]:
# Give the WikiLSHTC train/test files the short names used by the rest of the
# notebook.
for old_name, new_name in (('wikiLSHTC_train.txt', 'train.txt'),
                           ('wikiLSHTC_test.txt', 'test.txt')):
    src = WIKI / old_name
    dst = WIKI / new_name
    # Skip a file that has already been renamed so the cell can be re-run
    # without raising FileNotFoundError. When neither file exists we still
    # call os.rename so that the missing download is reported loudly.
    if src.exists() or not dst.exists():
        os.rename(src, dst)

### Generate a ground-truth according to inner-product & rename SIFT data

(The ground truth shipped with the SIFT dataset was computed for L2 distance, so it has to be regenerated for inner product.)

In [None]:
from pymips.utils.data import generate_gt

In [None]:
# Generate the inner-product ground truth for SIFT; the SIFT directory is passed
# both as `data` and as `path`.
# NOTE(review): presumably `data` is the input location and `path` the output
# location, and `skip_tests=True` disables some internal checks — confirm against
# pymips.utils.data.generate_gt.
generate_gt(data=SIFT, path=SIFT, skip_tests=True)

In [None]:
# Rename the SIFT vector files from `sift_<part>.fvecs` to `data.<part>.fvecs`.
for part in ('base', 'learn', 'query'):
    src = SIFT / 'sift_{}.fvecs'.format(part)
    dst = SIFT / 'data.{}.fvecs'.format(part)
    # Skip a file that has already been renamed so the cell can be re-run
    # without raising FileNotFoundError. When neither file exists we still
    # call os.rename so that the missing download is reported loudly.
    if src.exists() or not dst.exists():
        os.rename(src, dst)

### Generate dataset in fasttext format

In [None]:
from pymips.utils.data import prepare_ft

In [None]:
# Convert the WikiLSHTC data to the fastText format; the WIKI directory is passed
# as both positional arguments.
# NOTE(review): presumably (input dir, output dir), with `force=True` overwriting
# existing output — confirm against pymips.utils.data.prepare_ft.
prepare_ft(WIKI, WIKI, force=True)

### Train a simple fasttext model on this data

In [None]:
from pymips.utils.data import _fasttext_cmd
import subprocess

In [None]:
%%bash

# Abort on the first failing command (or unset variable) instead of silently
# carrying on and running `make` in a half-prepared tree.
set -euo pipefail

# Paths are resolved relative to the notebook's working directory, so they must
# be captured before changing directory.
export CPATH=$(pwd)/../../
export FAISS=$(pwd)/../../faiss/libfaiss.a

cd /tmp
# Re-use an existing checkout so that re-running this cell does not fail on clone.
[ -d fastText ] || git clone https://github.com/elanmart/fastText

cd fastText
# Copy the faiss static library next to the fastText sources before building.
cp "${FAISS}" .
make -j 8

In [None]:
# Directory holding the fastText-formatted data, and the binary built in /tmp.
path = WIKI
fasttext = '/tmp/fastText/fasttext'

# Command line for supervised training on the prepared training file.
train_cmd = _fasttext_cmd(fasttext, 'supervised',
                          input=os.path.join(path, 'train.ft.txt'),
                          output=os.path.join(path, 'model.ft'),
                          minCount=3,
                          minCountLabel=3,
                          lr=0.1,
                          lrUpdateRate=100,
                          dim=256,
                          ws=5,
                          epoch=25,
                          neg=25,
                          loss='ns',
                          thread=8,
                          saveOutput=1)

# Command line for the `to-fvecs` export, given the trained model, the test file
# and the output prefix `data`.
# NOTE(review): presumably fastText writes the model as `model.ft.bin`, and this
# step produces the `data.*` files loaded further down — confirm.
generate_cmd = _fasttext_cmd(fasttext, 'to-fvecs',
                             os.path.join(path, 'model.ft.bin'),
                             os.path.join(path, 'test.ft.txt'),
                             os.path.join(path, 'data'))

# check_call raises CalledProcessError on a non-zero exit status, so a failed
# training run stops here instead of feeding a missing model to the export step.
subprocess.check_call(train_cmd)
subprocess.check_call(generate_cmd)

# Test the performance using some indexes

In [None]:
from pymips.index import FlatIndex, IVFIndex, KMeansIndex
from pymips.utils.data import load_sift, load_GT
from pymips.utils.evaluation import evalaute

In [None]:
def _load(path):
    """Load the evaluation data stored under ``path``.

    Returns the tuple ``(xb, xq, G)`` read from ``data.wo.fvecs``,
    ``data.hid.fvecs`` and ``data.labels.txt`` respectively.
    """
    base_file, query_file, gt_file = (
        os.path.join(path, name)
        for name in ('data.wo.fvecs', 'data.hid.fvecs', 'data.labels.txt')
    )
    return load_sift(base_file), load_sift(query_file), load_GT(gt_file)


xb, xq, G = _load(WIKI)

In [None]:
# Build a FlatIndex sized by the second dimension of xb, then train it on and
# fill it with the vectors in xb.
idx = FlatIndex(xb.shape[1])
idx.train(xb)
idx.add(xb)

# Time the search for the queries in xq and score the result against G.
# NOTE(review): the second argument to search is presumably the number of
# results per query — confirm against pymips.index.
%time (D, I) = idx.search(xq, 1)
evalaute(D, I, G)

In [None]:
# Build an IVFIndex sized by the second dimension of xb, then train it on and
# fill it with the vectors in xb.
# NOTE(review): 4096 and 64 are presumably the number of inverted lists and the
# number of lists probed at query time — confirm against pymips.index.IVFIndex.
idx = IVFIndex(xb.shape[1], 4096, 64)
idx.train(xb)
idx.add(xb)

# Time the search for the queries in xq and score the result against G.
%time (D, I) = idx.search(xq, 1)
evalaute(D, I, G)

In [None]:
# Build a KMeansIndex sized by the second dimension of xb, then train it on and
# fill it with the vectors in xb.
# NOTE(review): the meaning of the positional arguments (2, 64, 3, 0.85, False,
# False) is not visible here — confirm against pymips.index.KMeansIndex.
idx = KMeansIndex(xb.shape[1], 2, 64, 3, 0.85, False, False)
idx.train(xb)
idx.add(xb)

# Time the search for the queries in xq and score the result against G. Unlike
# the other indexes, search takes an extra third argument here (64).
%time (D, I) = idx.search(xq, 1, 64)
evalaute(D, I, G)