In [4]:
import os
import pickle
import zlib
import time

import numpy as np
import torch
import psycopg2

from db.DocumentRepository import DocumentRepository
from ranker.Ranker import Ranker


In [None]:
os.system("""
    docker compose down;
    docker compose up -d --build db;
    sleep 15;
    """)

In [2]:
# Comment the initialization part in the Ranker __init__ out where you load idf and tf because it does not work when the tables are empty
ranker = Ranker()
documentRepository = ranker.documentRepository
tokenizer = documentRepository.tokenizer
all_docs = documentRepository.loadAllDocuments()

SC: Connected to the db. Now you can go and build the best search engine around!
Vocab size: 30522
Loading Documents first...
All documents loaded.
Now we load TF and IDF
TF and IDF loaded.
Loading trained model...
Trained model loaded.


In [5]:
# Warning: takes long to compute everything
start_time = time.time()
    
enc_texts = [doc.enc_text for doc in all_docs]
max_length = max(len(doc) for doc in enc_texts)
padded_docs = [torch.cat([doc, torch.tensor([-1] * (max_length - doc.size(0)))]) for doc in enc_texts]
tensor_docs = torch.stack(padded_docs)
tensor_docs = tensor_docs.to(torch.int)
print("Document prepared as tensor")
idf = ranker._compute_idfv2(tensor_docs)
print("IDF computed")

tf, doc_lengths, max_term = ranker._calculate_tf_and_lengthsv2(tensor_docs)
avg_doc_len = sum(doc_lengths) / len(doc_lengths)
print("TF computed")

end_time = time.time()
print("Works, Time required: {end_time - start_time} seconds")

Works, Time required: {end_time - start_time} seconds


1. Step: Save idf dictionary

In [6]:
# 1. Step: save idf
conn = documentRepository.connection
cur = documentRepository.cursor

for key, value in idf.items():
    cur.execute("INSERT INTO idfs (key, value) VALUES (%s, %s) ON CONFLICT (key) DO UPDATE SET value = %s", (key, value, value))

conn.commit()
print("IDF saved! :)")

IDF saved! :)


In [7]:
# 2. Step: load idf again
cur.execute("SELECT key, value FROM idfs")
rows = cur.fetchall()

# convert
loaded_dict = {key: value for key, value in rows}

print(loaded_dict)

{100: 2.7896793857626734, 999: 1.126886249240387, 1004: 0.21587257446707575, 1005: 1.0332336810136948, 1006: 0.022666678862438498, 1007: 0.022554853310183233, 1009: 0.38308700547869734, 1010: 0.00977761154020519, 1011: 0.0018602620673502676, 1012: 0.009336110429280977, 1013: 0.19479655439114887, 1014: 0.6808690885760847, 1015: 0.9236424959351806, 1016: 1.061272901078088, 1017: 1.0804122412887853, 1018: 1.2923467080682751, 1024: 0.025466389521484948, 1029: 0.929995231013576, 1030: 0.30967798549812514, 1037: 0.037992649340665326, 1038: 0.21397504069088424, 1039: 1.9472229158995706, 1040: 0.8591383916894233, 1041: 0.21316291287147776, 1042: 1.1326382529186698, 1045: 0.35525472192692087, 1047: 0.6235786940051312, 1049: 1.1727959505444647, 1050: 1.9831082016120174, 1052: 0.8288831285931432, 1054: 1.7953737330605917, 1055: 0.7606418542855038, 1056: 1.5192792757811986, 1061: 2.6411366737297115, 1062: 0.23449559659078026, 1064: 0.125768260223051, 1075: 0.08996752957761722, 1516: 1.015873720151

In [None]:
# documentRepository = DocumentRepository()
# conn = documentRepository.connection
# cur = documentRepository.cursor

In [8]:
# 3. Step: Save tf
print(type(tf))

n = tf.size(dim=0)
m = tf.size(dim=1)

# serialize tf tensor
tf_numpy = tf.numpy()
serialized_data = pickle.dumps(tf_numpy)
compressed_data = zlib.compress(serialized_data)

# Chunk the compressed data
chunk_size = 1024 * 1024  # 1 MB
chunks = [compressed_data[i:i + chunk_size] for i in range(0, len(compressed_data), chunk_size)]

# Insert data into the table
for chunk_id, chunk in enumerate(chunks):
    cur.execute(
        'INSERT INTO tfs (chunk_id, data) VALUES (%s, %s)',
        (chunk_id, psycopg2.Binary(chunk))
    )
conn.commit()
print("Inserted tf :)")

<class 'torch.Tensor'>
Inserted tf :)


In [9]:
cur.execute('SELECT data FROM tfs ORDER BY chunk_id')
chunks = cur.fetchall()
compressed_data = b''.join([chunk[0] for chunk in chunks])

# Decompress and deserialize the data
serialized_data = zlib.decompress(compressed_data)
numpy_array_restored = pickle.loads(serialized_data)

# Convert the NumPy array back to a PyTorch tensor
tensor_restored = torch.tensor(numpy_array_restored, dtype=torch.int32)
print(tensor_restored.shape)

torch.Size([9147, 30522])


In [10]:
# Step 5: Save tf metadata
print(type(doc_lengths), type(max_term), type(avg_doc_len))

cur.execute(
    'INSERT INTO tf_meta (doc_lengths, max_term, avg_doc_len) VALUES (%s, %s, %s)',
    (doc_lengths, max_term, avg_doc_len)
)

conn.commit()
print("Inserted tf, doc_lengths, max_term, avg_doc_len :)")

<class 'list'> <class 'int'> <class 'float'>
Inserted tf, doc_lengths, max_term, avg_doc_len :)


In [11]:
# 6: Reconstruct tf metadata

cur.execute('SELECT doc_lengths, max_term, avg_doc_len FROM tf_meta WHERE id = 1')  # Assuming you want the first row
meta_data = cur.fetchone()
doc_lengths = meta_data[0]
max_term = meta_data[1]
avg_doc_len = meta_data[2]
print(doc_lengths)
print(max_term)
print(avg_doc_len)

[Decimal('2837'), Decimal('1583'), Decimal('3001'), Decimal('3989'), Decimal('2111'), Decimal('2599'), Decimal('1332'), Decimal('2714'), Decimal('1448'), Decimal('2006'), Decimal('529'), Decimal('4864'), Decimal('1924'), Decimal('2513'), Decimal('897'), Decimal('569'), Decimal('1369'), Decimal('1868'), Decimal('977'), Decimal('2242'), Decimal('944'), Decimal('3042'), Decimal('687'), Decimal('895'), Decimal('825'), Decimal('2371'), Decimal('2067'), Decimal('2261'), Decimal('908'), Decimal('2477'), Decimal('1568'), Decimal('2500'), Decimal('3119'), Decimal('838'), Decimal('1123'), Decimal('1384'), Decimal('804'), Decimal('1690'), Decimal('732'), Decimal('1778'), Decimal('771'), Decimal('1822'), Decimal('2168'), Decimal('870'), Decimal('1238'), Decimal('537'), Decimal('876'), Decimal('886'), Decimal('711'), Decimal('1672'), Decimal('1160'), Decimal('931'), Decimal('953'), Decimal('205'), Decimal('578'), Decimal('934'), Decimal('857'), Decimal('1270'), Decimal('4094'), Decimal('1834'), Dec

Nice, we were able to compute tf and idf offline just based on our index. That is necessary to make BM25 faster.

In [12]:
documentRepository.overwrite_dump()

SC: Successfully overwritten the old dump. Now you only need to push it to the repository!
