In [1]:
# setup:
# conda create -n pyserini python=3.10 -y
# conda activate pyserini
# conda install wget -y
# conda install -c conda-forge openjdk=21 maven -y
# conda install -c conda-forge lightgbm nmslib -y
# conda install -c pytorch faiss-cpu pytorch -y
# 
# pip install pyserini
# pip install ipykernel
# pip install watermark

# Source: https://github.com/castorini/pyserini/blob/master/docs/installation.md#mac

In [2]:
%load_ext watermark

In [3]:
%watermark -v -n -m -p torch,faiss,pyserini

Python implementation: CPython
Python version       : 3.10.14
IPython version      : 8.24.0

torch   : 2.2.0
faiss   : 1.8.0
pyserini: 0.35.0

Compiler    : GCC 12.3.0
OS          : Linux
Release     : 5.15.0-102-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 16
Architecture: 64bit



In [4]:
!mkdir -p prueba_pyserini_tmp/corpus
!mkdir -p prueba_pyserini_tmp/index

In [5]:
# Create a .json file with lines like:
# {"id": "...", "contents": "..."}
# with random contents

symbols = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"

import random
import json

random.seed(33)

with open("prueba_pyserini_tmp/corpus/random_corpus.jsonl", "w") as f:
    for i in range(1000):
        id = str(i)
        contents = " ".join(random.choices(symbols, k=10)) + "\n\n"
        f.write(json.dumps({"id": id, "contents": contents}) + "\n")

In [7]:
# Run once:
# !python -m pyserini.encode \
#   input   --corpus prueba_pyserini_tmp/corpus/random_corpus.jsonl \
#           --fields text \
#           --delimiter "\n\n" \
#   output  --embeddings prueba_pyserini_tmp/embeddings \
#           --to-faiss \
#   encoder --encoder intfloat/multilingual-e5-small \
#           --fields text \
#           --batch 32 \
#           --device cpu \
#           --dimension 384 --l2-norm --pooling mean --prefix "passage:"

1000it [00:00, 349846.03it/s]
100%|███████████████████████████████████████████| 32/32 [04:29<00:00,  8.43s/it]


In [8]:
from pyserini.search import FaissSearcher

searcher = FaissSearcher(
    'prueba_pyserini_tmp/embeddings',
    'intfloat/multilingual-e5-small'
)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
queries = [
    'what is a lobster roll',
    'e a pe pe',
    'what is a lobster roll',
    'e a pe pe',
]
for q in queries:
    hits = searcher.search(q, k=500)
    print(q)
    for i in range(0, 500):
        print(f'{i+1:2} {hits[i].docid:7} {hits[i].score}')

what is a lobster roll
 1 111     2.9547996520996094
 2 370     2.9228010177612305
 3 741     2.920724868774414
 4 265     2.9184603691101074
 5 503     2.914444923400879
 6 844     2.9046671390533447
 7 86      2.903823137283325
 8 691     2.9036409854888916
 9 156     2.902879476547241
10 984     2.901808977127075
11 277     2.9009313583374023
12 704     2.898404836654663
13 17      2.897709369659424
14 262     2.8973302841186523
15 162     2.896120071411133
16 5       2.8947858810424805
17 853     2.8933658599853516
18 446     2.892181158065796
19 456     2.892092704772949
20 36      2.8916773796081543
21 893     2.889481544494629
22 391     2.8894009590148926
23 732     2.887972831726074
24 442     2.8859939575195312
25 632     2.8859329223632812
26 374     2.885467529296875
27 997     2.884945869445801
28 656     2.8848772048950195
29 284     2.8848655223846436
30 26      2.882950782775879
31 0       2.8823044300079346
32 340     2.882197618484497
33 161     2.8812599182128906
34 

In [None]:
hits = searcher.search(q)
print(q)
for i in range(0, 5):
    print(f'{i+1:2} {hits[i].docid:7} {hits[i].score}')