In [1]:
from load_data_json import LoadDataset 
from preprocess import VectorSpaceModel
import numpy as np
import time
from sklearn.decomposition import TruncatedSVD

## Load the dataset
Functions to load the dataset and the given queries with associated relevant documents.

In [2]:
dataset = LoadDataset("./data/med/med.json", "./data/med/queries.json", "./data/med/qrels-treceval.txt")

In [3]:
m,n = dataset.doc_matrix.shape
dataset.doc_matrix.nnz/(m*n)

0.004691013235037071

In [4]:
epsilon = np.finfo(np.float64).eps
sqrt_epsilon = np.sqrt(epsilon)
v = np.random.rand(n)
v = v / np.linalg.norm(v)

In [5]:
def preprocess(k, A, v):
    m, n = A.shape
    q = [np.zeros(n) for i in range(k)]
    beta = 0
    alpha = 0
    q[0] = v
    memo = {}
    for i in range(k-1):
        q_hat = A.dot(q[i])
        w = A.T.dot(q_hat) - beta * q[i-1]
        alpha = w.dot(q[i])
        w = w - alpha * q[i]
        for j in range(i):
            w_dotq = w @ q[j]
            w = w - w_dotq * q[j]
        beta = np.linalg.norm(w)
        if beta == 0:
            break
        q[i+1] = w / beta
    return q

In [6]:
q = preprocess(300, dataset.doc_matrix, v)

In [7]:
def response(q, query, A):
    k = len(q)
    s_hat = A.T.dot(query)
    m, n = A.shape
    s = np.zeros(n) 
    for i in range(k-1):
        q_dot_query = q[i] @ s_hat
        s = s + q_dot_query * q[i]

    return s



In [8]:
query = dataset.query_vectors[:,0]
similaries = response(q, query, dataset.doc_matrix)

In [9]:
similaries

array([ 0.00177882,  0.00255529,  0.00422955, ..., -0.00545442,
       -0.00481922,  0.0079013 ])

# Vector Space Model ranked retrieval

Create the `vsm()` object.

In [10]:
vsm = VectorSpaceModel(dataset.doc_matrix)
m, n = vsm.A.shape

In [11]:
vsm.preprocess(300)

44551


In [12]:
epsilon = np.finfo(np.float64).eps
epsilon = np.sqrt(epsilon)
epsilon

1.4901161193847656e-08

In [36]:
count = 0
for i in range(300):
    x = vsm.lanczos_vectors[i] @ vsm.lanczos_vectors[i]
    print(x)

1.0
1.0
1.0
0.9999999999999999
0.9999999999999998
1.0
1.0
0.9999999999999999
1.0
0.9999999999999997
1.0
1.0000000000000002
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0000000000000002
1.0
1.0
1.0
1.0
1.0
1.0000000000000002
1.0
1.0
1.0
1.0
0.9999999999999998
1.0
1.0
0.9999999999999999
1.0
1.0
1.0
1.0
1.0000000000000002
0.9999999999999999
0.9999999999999998
0.9999999999999997
1.0
0.9999999999999999
1.0
0.9999999999999998
0.9999999999999999
1.0
0.9999999999999998
0.9999999999999999
0.9999999999999998
1.0
0.9999999999999998
1.0
1.0000000000000002
0.9999999999999999
0.9999999999999999
1.0000000000000002
1.0000000000000002
1.0
1.0
1.0
0.9999999999999998
1.0
0.9999999999999999
1.0000000000000002
1.0
1.0
1.0
0.9999999999999999
0.9999999999999999
1.0
0.9999999999999999
1.0
1.0
1.0
0.9999999999999998
1.0
1.0
1.0000000000000002
1.0
1.0
0.9999999999999999
1.0000000000000002
1.0
1.0
1.0000000000000002
0.9999999999999998
1.0
1.0
1.0000000000000002
1.0
1.0
0.9999999999999999
1.0
0.9

In [15]:
svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)

In [16]:
Ak = svd.fit_transform(vsm.A)

In [17]:
svd.components_.shape

(50, 1033)

In [18]:
dataset.query_vectors[:,0].T.dot(Ak).dot(np.diag(svd.singular_values_)).dot(svd.components_)

array([[0.00660732, 0.01935342, 0.00466614, ..., 0.00988035, 0.00471508,
        0.03267004]])

In [19]:
epsilon = np.finfo(np.float64).eps
epsilon

2.220446049250313e-16

In [20]:
vsm.A.shape

(13004, 1033)

In [21]:
vsm.response(dataset.query_vectors[:,0])

In [22]:
vsm.norms

In [23]:
u, s, v, lsi_norm = vsm.lsi_preprocess(50)

In [24]:
u

array([[-0.01029994,  0.00024417, -0.00393161, ...,  0.01098249,
        -0.0126084 ,  0.01114007],
       [-0.00104717,  0.00155091, -0.00216838, ...,  0.00034772,
        -0.00027169,  0.00053776],
       [ 0.00320095, -0.00047048,  0.00066028, ...,  0.00015159,
         0.00011783,  0.00050536],
       ...,
       [-0.00530743,  0.00359512,  0.00454236, ..., -0.00017045,
         0.00029722,  0.00067153],
       [ 0.00090966, -0.00035695, -0.00091288, ...,  0.00123334,
        -0.00037175,  0.00064278],
       [-0.00056484, -0.00134659, -0.00144216, ...,  0.00695932,
        -0.0039723 ,  0.00226147]])

In [25]:
u = u[:,::-1]; s = s[::-1]; v = v[::-1,:]

In [26]:
u.dot(np.diag(s)).dot(v) - vsm.A

matrix([[-3.87227049e-03, -9.77839057e-04, -7.85574710e-04, ...,
          1.07582933e-03,  1.15551230e-03,  2.42526126e-03],
        [-9.02233837e-04, -1.45434447e-03, -5.13185673e-04, ...,
         -2.75960154e-04,  1.69251330e-06,  3.67502786e-04],
        [-9.38994440e-05,  6.91317048e-05, -5.97949878e-04, ...,
         -1.15224932e-04,  4.47292738e-06,  1.18305983e-04],
        ...,
        [ 3.35008462e-04,  8.04557320e-04, -1.23284568e-04, ...,
         -5.98768755e-05,  1.39951350e-04,  8.92004405e-05],
        [ 1.55669149e-04, -6.79121558e-04, -4.75231695e-05, ...,
         -6.19752470e-05, -7.51170537e-05,  3.44085287e-05],
        [ 1.13989658e-03, -2.74357421e-04,  3.42133986e-04, ...,
          3.44649120e-04,  3.08890774e-04,  3.89212740e-05]])

In [27]:
vsm.A

<13004x1033 sparse matrix of type '<class 'numpy.float64'>'
	with 63015 stored elements in Compressed Sparse Column format>

In [28]:
vsm.scores

array([-0.00916986,  0.00413772,  0.00310898, ..., -0.00846745,
       -0.00077781, -0.00508246])

In [29]:
res = vsm.lsi_response(u, s, v, dataset.query_vectors[:,0])
res 

array([[ 0.00239841, -0.00077874, -0.00145914, ..., -0.00012103,
        -0.00115709,  0.00735473]])

In [30]:
epsilon = np.finfo(np.float64).eps
epsilon

2.220446049250313e-16

In [31]:
scores = vsm.scores[::-1]


In [32]:
np.linalg.norm(scores - res)

0.9637410544389273

In [33]:
res/lsi_norm

array([[ 0.00418371, -0.00199911, -0.00483557, ..., -0.00043685,
        -0.00765655,  0.0217793 ]])

In [34]:
vsm.scores/np.sqrt(vsm.norms)

TypeError: loop of ufunc does not support argument 0 of type NoneType which has no callable sqrt method

In [None]:
np.sqrt(8.30)

2.8809720581775866