In [9]:
from inference import SparseModel
from remap_tokens import weight_snowball_with_sparse


In [2]:
model = SparseModel()

In [11]:
QDRANT_URL='http://localhost:6333'
QDRANT_API_KEY=''
COLLECTION_NAME="fiqa"

In [12]:
from qdrant_client import QdrantClient, models
import math

In [30]:

from typing import List, Union
import mmh3
VOCAB_SIZE = 2_000_000

def token_to_idx(token: Union[str, int]) -> int:
    if isinstance(token, str):
        return mmh3.hash(token) % VOCAB_SIZE
    return token

def rescore_vector(vector: dict) -> dict:
    new_vector = {}

    sorted_vector = sorted(vector.items(), key=lambda x: x[1], reverse=True)

    for num, (token, _value) in enumerate(sorted_vector):
        idx = token_to_idx(token)
        new_vector[idx] = 1 # math.log(1./(num + 1) + 1.) # * value
    return new_vector



In [31]:
def conver_sparse_vector(sparse_vector: dict) -> models.SparseVector:
    indices = []
    values = []

    for (idx, value) in sparse_vector.items():
        indices.append(int(idx))
        values.append(value)

    return models.SparseVector(
        indices=indices,
        values=values
    )

In [32]:
client = QdrantClient(QDRANT_URL, api_key=QDRANT_API_KEY)



In [36]:
query = "Income in zero-interest environment"
# query = "Something looks off about Mitsubishi financial data"
sparse_vector = rescore_vector(next(iter(weight_snowball_with_sparse([query]))))
sparse_vector_qdrant = conver_sparse_vector(sparse_vector)

limit = 10
result = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=models.NamedSparseVector(
        name="attention",
        vector=sparse_vector_qdrant
    ),
    with_payload=True,
    with_vectors=True,
    limit=limit
)

In [37]:
sparse_vector_qdrant

SparseVector(indices=[245443, 536704, 381228, 423808], values=[1.0, 1.0, 1.0, 1.0])

In [38]:
for idx, hit in enumerate(result):
    print(idx, hit)

0 id=13339 version=626 score=2.2069366 payload={'id': '137225', 'text': "I've had zero taxable income for the past 2 years and yet the calculations say I owe the government $250 for each year for the Self Employment tax.  How can they charge a non-zero tax on my income when my taxable income is zero?  That is theft. That demands reform."} vector={'attention': SparseVector(indices=[105600, 169405, 252851, 257764, 273945, 381228, 423685, 454046, 526443, 536704, 606981, 745112, 745418, 846510, 1522071, 1534310, 1779546, 1804292, 1862685, 1875703], values=[0.80624396, 0.4640101, 0.5026966, 0.9190698, 0.24484365, 0.6620832, 0.2386803, 0.29232448, 0.2030843, 1.5448534, 0.19014518, 0.10017339, 0.1001474, 0.17525618, 0.13144773, 0.26051712, 0.29709387, 0.9886667, 0.28483647, 0.17031649])} shard_key=None
1 id=14990 version=704 score=2.0541573 payload={'id': '154373', 'text': 'The difference is the time when they are released and how much revenue they have recouped at that point.  The Shaw Cable

In [26]:
for (idx, value) in zip(sparse_vector_qdrant.indices, sparse_vector_qdrant.values):
    print(f"{model.invert_vocab[idx]}: {value}")

KeyError: 10245443

In [23]:
result_sparse_vector = result[9].vector['attention']

for (idx, value) in zip(result_sparse_vector.indices, result_sparse_vector.values):
    if idx in sparse_vector:
        print(f"{model.invert_vocab[idx]}: {value} x {sparse_vector[idx]} = {value * sparse_vector[idx]}")
    else:
        print(f"{model.invert_vocab[idx]}: {value}")


?: 0.0005044635 x 1.0 = 0.0005044635
to: 0.5377678
it: 0.873277
when: 1.228311
what: 0.3427238
got: 2.0297878
happened: 2.1821659
employees: 2.1019368
##zed: 3.0411465 x 1.0 = 3.0411465
vs: 3.703
##vati: 3.295511 x 1.0 = 3.295511
##nl: 4.712006
pri: 2.9606016 x 1.0 = 2.9606016


In [24]:
query = "Should Indian Railways be privatized?"
sparse_vector = rescore_vector(next(model.encode([query])))
sparse_vector_qdrant = conver_sparse_vector(sparse_vector)

limit = 10
result = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=models.NamedSparseVector(
        name="attention",
        vector=sparse_vector_qdrant
    ),
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="id", match=models.MatchValue(value="244307")
            ),
        ],
    ),
    with_payload=True,
    with_vectors=True,
    limit=limit
)

result_sparse_vector = result[0].vector['attention']

for (idx, value) in zip(result_sparse_vector.indices, result_sparse_vector.values):
    if idx in sparse_vector:
        print(f"{model.invert_vocab[idx]}: {value} x {sparse_vector[idx]} = {value * sparse_vector[idx]}")
    else:
        print(f"{model.invert_vocab[idx]}: {value}")


?: 0.0005044635 x 1.0 = 0.0005044635
can: 0.7317368
through: 1.8962183
indian: 1.6121174 x 1.0 = 1.6121174
develop: 2.3311443
railways: 4.6289215 x 1.0 = 4.6289215
acute: 4.6587257
privatization: 5.121674


In [40]:
import math

In [52]:
math.log(1/20 + 1)

0.04879016416943205

In [112]:
a = [1,3,2]

sorted(a, reverse=True)

a

[1, 3, 2]