In [1]:
from load_data_json import LoadDataset 
from preprocess import VectorSpaceModel
import numpy as np
import time
from sklearn.decomposition import TruncatedSVD

## Load the dataset
Functions to load the dataset and the given queries with associated relevant documents.

In [2]:
dataset = LoadDataset("./data/med/med.json", "./data/med/queries.json", "./data/med/qrels-treceval.txt")

In [3]:
m,n = dataset.doc_matrix.shape
dataset.doc_matrix.nnz/(m*n)

0.004691013235037071

In [4]:
epsilon = np.finfo(np.float64).eps
sqrt_epsilon = np.sqrt(epsilon)
v = np.random.rand(n)
v = v / np.linalg.norm(v)

In [5]:
def preprocess(k, A, v):
    m, n = A.shape
    q = [np.zeros(n) for i in range(k)]
    beta = 0
    alpha = 0
    q[0] = v
    memo = {}
    for i in range(k-1):
        q_hat = A.dot(q[i])
        w = A.T.dot(q_hat) - beta * q[i-1]
        alpha = w.dot(q[i])
        w = w - alpha * q[i]
        for j in range(i):
            w_dotq = w @ q[j]
            w = w - w_dotq * q[j]
        beta = np.linalg.norm(w)
        if beta == 0:
            break
        q[i+1] = w / beta
    return q

In [6]:
q = preprocess(300, dataset.doc_matrix, v)

In [7]:
def response(q, query, A):
    k = len(q)
    s_hat = A.T.dot(query)
    m, n = A.shape
    s = np.zeros(n) 
    for i in range(k-1):
        q_dot_query = q[i] @ s_hat
        s = s + q_dot_query * q[i]

    return s



In [8]:
query = dataset.query_vectors[:,0]
similaries = response(q, query, dataset.doc_matrix)

In [9]:
similaries

array([-0.00108914, -0.0059993 ,  0.00310291, ...,  0.00049616,
       -0.00689983,  0.0036008 ])

# Vector Space Model ranked retrieval

Create the `vsm()` object.

In [10]:
vsm = VectorSpaceModel(dataset.doc_matrix)
m, n = vsm.A.shape

In [11]:
vsm.preprocess(15)

In [12]:
epsilon = np.finfo(np.float64).eps
epsilon = np.sqrt(epsilon)
epsilon

1.4901161193847656e-08

In [13]:
count = 0
for i in range(5):
    x = vsm.lanczos_vectors[i] @ vsm.lanczos_vectors[i]
    print(x)

0.9999999999999998
1.0
1.0
1.0
1.0


In [14]:
svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)

In [15]:
Ak = svd.fit_transform(vsm.A)

In [16]:
svd.components_.shape

(50, 1033)

In [17]:
dataset.query_vectors[:,0].T.dot(Ak).dot(np.diag(svd.singular_values_)).dot(svd.components_)

array([[0.00660732, 0.01935342, 0.00466614, ..., 0.00988035, 0.00471508,
        0.03267004]])

In [18]:
epsilon = np.finfo(np.float64).eps
epsilon

2.220446049250313e-16

In [19]:
vsm.A.shape

(13004, 1033)

In [20]:
vsm.response(dataset.query_vectors[:,0])

In [21]:
vsm.scores
scores = vsm.scores/vsm.norms
sorted_indices = np.argsort(-np.abs(scores))
sorted_indices[:37]

array([ 334,  343,  502,  163,  168,  118,  141,  504,  211,  210,  499,
        304,  510,  505,   10,    3,  605,  184,  234,  501,   12,  752,
         71,  506,  508,  503,  179,  130,  169,  185, 1021,  699,  509,
         84,  167,  845,    8], dtype=int64)

In [22]:
vsm.scores[212]

0.01286503021381786

In [23]:
sorted(sorted_indices[:37])

[3,
 8,
 10,
 12,
 71,
 84,
 118,
 130,
 141,
 163,
 167,
 168,
 169,
 179,
 184,
 185,
 210,
 211,
 234,
 304,
 334,
 343,
 499,
 501,
 502,
 503,
 504,
 505,
 506,
 508,
 509,
 510,
 605,
 699,
 752,
 845,
 1021]

In [31]:
x, y = vsm.implicit_qr_algorithm(vsm.alpha, vsm.beta)

In [32]:
x

array([17.97156031,  8.88264294,  8.41346356,  6.61313904,  6.16746504,
        5.54324641,  4.53699172,  3.65153703,  2.79189021,  1.93114489,
        1.29299669,  0.81363271,  0.46091313,  0.24703523, -0.67245925])

In [34]:
y[:,2] @ y[:,2]

1.0

In [27]:
y[:,2:]

array([[ 4.98455010e-02, -1.59833205e-02,  1.16789068e-01,
        -6.64724100e-02,  8.52691892e-02, -7.58729899e-02,
         1.01360026e-01, -1.63093460e-01,  2.40033929e-01,
        -3.11699932e-01,  2.89565161e-01, -1.34061762e-01,
         1.03300828e-04],
       [-2.61566050e-02,  1.19376345e-02, -9.32474715e-02,
         5.83845590e-02, -8.58924275e-02,  8.50389463e-02,
        -1.24773790e-01,  2.18761548e-01, -3.41598067e-01,
         4.62740033e-01, -4.42971135e-01,  2.08760352e-01,
        -1.73034679e-04],
       [-2.32402283e-01,  6.62237031e-02, -4.63166268e-01,
         2.42260214e-01, -2.56851555e-01,  1.77526176e-01,
        -1.60447786e-01,  1.17838740e-01, -4.39736500e-03,
        -1.70745944e-01,  2.85128234e-01, -1.68648441e-01,
         2.58792538e-04],
       [-1.68845004e-01, -8.26164850e-04,  8.75358643e-02,
        -1.13039347e-01,  2.40516499e-01, -2.47792801e-01,
         3.14227850e-01, -3.64883704e-01,  2.60467799e-01,
         3.01057570e-02, -3.33371136e

In [28]:
T @ y[:,0]

NameError: name 'T' is not defined

In [None]:
u, s, v, lsi_norm = vsm.lsi_preprocess(5)

In [None]:
s*s

array([ 6.20844665,  6.67575198,  8.40962395,  8.88648343, 17.97156031])

In [None]:
u = u[:,::-1]; s = s[::-1]; v = v[::-1,:]

In [None]:
u.dot(np.diag(s)).dot(v) - vsm.A

matrix([[-1.23851086e-03, -6.77642271e-03,  1.75600675e-03, ...,
         -2.73971600e-03,  3.23002952e-04,  4.62797551e-04],
        [ 3.24747405e-04,  1.86588899e-05,  6.66474952e-04, ...,
         -1.90873216e-04,  9.47459340e-04, -3.32363139e-04],
        [ 3.85245188e-04, -2.16897563e-03, -1.74493466e-03, ...,
         -8.78831343e-04, -1.85708090e-03,  8.11170417e-04],
        ...,
        [-2.48408938e-03, -7.39168914e-04, -7.38590473e-04, ...,
         -5.38463431e-04,  8.40047347e-04, -1.30317090e-03],
        [ 3.21195300e-04, -4.56321554e-04, -5.16076834e-04, ...,
         -1.35420452e-03, -6.13147852e-05,  3.53526526e-04],
        [ 1.47942354e-03, -1.32830014e-03,  6.10283709e-03, ...,
          1.65419214e-03, -2.33065922e-03, -3.41978820e-03]])

In [None]:
vsm.A

<13004x1033 sparse matrix of type '<class 'numpy.float64'>'
	with 63015 stored elements in Compressed Sparse Column format>

In [None]:
vsm.scores

array([0.00892056, 0.0121185 , 0.00450396, ..., 0.00237615, 0.00534256,
       0.00943086])

In [None]:
res = vsm.lsi_response(u, s, v, dataset.query_vectors[:,0])
res 

array([[-0.00069021, -0.00829348,  0.00223858, ..., -0.00834512,
         0.00210829,  0.01657744]])

In [None]:
epsilon = np.finfo(np.float64).eps
epsilon

2.220446049250313e-16

In [None]:
scores = vsm.scores[::-1]


In [None]:
np.linalg.norm(scores - res)

0.7528123726726272

In [None]:
res/lsi_norm

array([[-0.00082315, -0.01119094,  0.00291302, ..., -0.01268348,
         0.00344351,  0.02385357]])

In [None]:
vsm.scores/np.sqrt(vsm.norms)

array([0.06722867, 0.07689126, 0.03923905, ..., 0.01376622, 0.07265981,
       0.05847602])

In [None]:
np.sqrt(8.30)

2.8809720581775866