In [1]:
from inference import SparseModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# SparseModel instance from the `inference` module; later cells call `model.encode(...)`
# and read `model.invert_vocab`.
model = SparseModel()

In [4]:
import os

# Connection settings for Qdrant. The URL and API key can be overridden through the
# environment variables QDRANT_URL / QDRANT_API_KEY so that a real key never has to be
# written into the notebook; the defaults point at a local instance with an empty key.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "")

# Name of the collection that the search cells query.
COLLECTION_NAME = "quora"

In [5]:
from qdrant_client import QdrantClient, models
import math



In [6]:
def rescore_vector(vector: dict) -> dict:
    """Replace every weight of a sparse vector with a constant ``1.0``.

    Args:
        vector: Mapping of index -> weight.

    Returns:
        A new dict with the same keys, each mapped to ``1.0``. Keys are inserted in
        order of descending input weight (equal weights keep their input order).
        The input mapping is not modified.
    """
    by_weight_desc = sorted(vector.items(), key=lambda item: item[1], reverse=True)

    # The rank within `by_weight_desc` does not influence the weight; a rank-based
    # weight such as math.log(1. / (rank + 1) + 1.) is a possible alternative that is
    # not applied here.
    return {idx: 1. for idx, _weight in by_weight_desc}


In [7]:
def conver_sparse_vector(sparse_vector: dict) -> models.SparseVector:
    """Build a ``models.SparseVector`` from an index -> value mapping.

    Args:
        sparse_vector: Mapping whose keys can be converted with ``int()`` and whose
            values are the corresponding weights.

    Returns:
        A ``models.SparseVector`` whose ``indices`` and ``values`` lists follow the
        iteration order of the mapping.
    """
    return models.SparseVector(
        indices=[int(idx) for idx in sparse_vector],
        values=list(sparse_vector.values()),
    )

In [9]:
# Create the Qdrant client for the configured URL and API key; used by the search cells below.
client = QdrantClient(QDRANT_URL, api_key=QDRANT_API_KEY)



In [21]:
# Encode the query (first item yielded by `model.encode`), set every token weight to
# 1.0 via `rescore_vector`, and convert the result to Qdrant's sparse-vector model.
query = "Should Indian Railways be privatized?"
encoded_query = next(model.encode([query]))
sparse_vector = rescore_vector(encoded_query)
sparse_vector_qdrant = conver_sparse_vector(sparse_vector)

# Search the "attention" sparse vector of the collection. Stored vectors are requested
# as well so that later cells can inspect per-token weights of the hits.
limit = 10
named_query_vector = models.NamedSparseVector(
    name="attention",
    vector=sparse_vector_qdrant,
)
result = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=named_query_vector,
    limit=limit,
    with_payload=True,
    with_vectors=True,
)

In [22]:
# Print every search hit prefixed with its position in the result list.
for idx, hit in enumerate(result):
    print(idx, hit)

0 id=406675 version=19064 score=14.264926 payload={'id': '418691', 'text': 'Why is the Indian government not privatising Indian Railways?'} vector={'attention': SparseVector(indices=[1029, 1996, 2003, 2025, 2231, 2339, 2796, 7111, 7741, 20203, 26927], values=[0.0005044635, 0.35384217, 0.4080243, 1.3043731, 2.009453, 0.83495766, 1.9742125, 4.6289215, 3.1114693, 4.0357103, 3.6255777])} shard_key=None
1 id=277024 version=12986 score=13.902832 payload={'id': '285469', 'text': 'Why is Indian government not introducing privatisation in railways?'} vector={'attention': SparseVector(indices=[1029, 1999, 2003, 2025, 2231, 2339, 2796, 7111, 10449, 20203, 26652, 26927], values=[0.0005044635, 0.5242223, 0.4080243, 1.3043731, 2.009453, 0.83495766, 1.6121174, 4.6289215, 3.4818954, 4.0357103, 2.9424112, 3.6255777])} shard_key=None
2 id=277025 version=12986 score=13.902832 payload={'id': '285470', 'text': 'Why is the Indian government not allowing privatisation in railways?'} vector={'attention': Spar

In [16]:
# Print each entry of the query vector as "<token>: <weight>"; `model.invert_vocab`
# is indexed by vocabulary index (presumably mapping it back to the token string,
# as the printed output suggests).
for (idx, value) in zip(sparse_vector_qdrant.indices, sparse_vector_qdrant.values):
    print(f"{model.invert_vocab[idx]}: {value}")

railways: 1.0
##vati: 1.0
indian: 1.0
pri: 1.0
?: 1.0
should: 1.0
be: 1.0
##zed: 1.0


In [23]:
# Inspect the hit at index 9 (the last one when the search returns `limit = 10`
# results): list every token of its stored "attention" vector with its weight and,
# for tokens that also occur in the query, the product with the query weight.
result_sparse_vector = result[9].vector['attention']

for idx, value in zip(result_sparse_vector.indices, result_sparse_vector.values):
    if idx not in sparse_vector:
        # Token is not part of the query: show the stored weight only.
        print(f"{model.invert_vocab[idx]}: {value}")
        continue
    print(f"{model.invert_vocab[idx]}: {value} x {sparse_vector[idx]} = {value * sparse_vector[idx]}")


?: 0.0005044635 x 1.0 = 0.0005044635
to: 0.5377678
it: 0.873277
when: 1.228311
what: 0.3427238
got: 2.0297878
happened: 2.1821659
employees: 2.1019368
##zed: 3.0411465 x 1.0 = 3.0411465
vs: 3.703
##vati: 3.295511 x 1.0 = 3.295511
##nl: 4.712006
pri: 2.9606016 x 1.0 = 2.9606016


In [24]:
# Same query as before, but restricted to the point whose payload field "id" equals
# "244307", so that this specific document's token weights can be inspected.
query = "Should Indian Railways be privatized?"
encoded_query = next(model.encode([query]))
sparse_vector = rescore_vector(encoded_query)
sparse_vector_qdrant = conver_sparse_vector(sparse_vector)

limit = 10
payload_id_filter = models.Filter(
    must=[
        models.FieldCondition(key="id", match=models.MatchValue(value="244307")),
    ],
)
result = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=models.NamedSparseVector(
        name="attention",
        vector=sparse_vector_qdrant,
    ),
    query_filter=payload_id_filter,
    limit=limit,
    with_payload=True,
    with_vectors=True,
)

# First hit that passed the filter (raises IndexError if nothing matched).
result_sparse_vector = result[0].vector['attention']

# List every token of the stored vector; for tokens shared with the query also show
# the product of stored weight and query weight.
for idx, value in zip(result_sparse_vector.indices, result_sparse_vector.values):
    if idx not in sparse_vector:
        print(f"{model.invert_vocab[idx]}: {value}")
        continue
    print(f"{model.invert_vocab[idx]}: {value} x {sparse_vector[idx]} = {value * sparse_vector[idx]}")


?: 0.0005044635 x 1.0 = 0.0005044635
can: 0.7317368
through: 1.8962183
indian: 1.6121174 x 1.0 = 1.6121174
develop: 2.3311443
railways: 4.6289215 x 1.0 = 4.6289215
acute: 4.6587257
privatization: 5.121674


In [40]:
import math

In [52]:
# Scratch check of log(1/k + 1) for k = 20; appears to be the rank-based weight that is
# commented out in rescore_vector (with num + 1 = 20) — TODO confirm.
math.log(1/20 + 1)

0.04879016416943205

In [112]:
# Scratch check: sorted() returns a new list and leaves `a` untouched.
a = [1,3,2]

# The sorted copy is discarded here (not assigned, and not the cell's last expression).
sorted(a, reverse=True)

# Displayed as the cell output: still in the original order.
a

[1, 3, 2]