In [1]:
from load_data_json import LoadDataset 
from preprocess import VectorSpaceModel
import numpy as np
import time
from sklearn.decomposition import TruncatedSVD

## Load the dataset
Functions to load the dataset and the given queries with associated relevant documents.

In [2]:
dataset = LoadDataset("./data/med/med.json", "./data/med/queries.json", "./data/med/qrels-treceval.txt")

In [3]:
m,n = dataset.doc_matrix.shape
dataset.doc_matrix.nnz/(m*n)

0.004691013235037071

In [4]:
epsilon = np.finfo(np.float64).eps
sqrt_epsilon = np.sqrt(epsilon)
v = np.random.rand(n)
v = v / np.linalg.norm(v)

In [5]:
def preprocess(k, A, v):
    m, n = A.shape
    q = [np.zeros(n) for i in range(k)]
    beta = 0
    alpha = 0
    q[0] = v
    memo = {}
    for i in range(k-1):
        q_hat = A.dot(q[i])
        w = A.T.dot(q_hat) - beta * q[i-1]
        alpha = w.dot(q[i])
        w = w - alpha * q[i]
        for j in range(i):
            w_dotq = w @ q[j]
            w = w - w_dotq * q[j]
        beta = np.linalg.norm(w)
        if beta == 0:
            break
        q[i+1] = w / beta
    return q

In [6]:
q = preprocess(300, dataset.doc_matrix, v)

In [7]:
def response(q, query, A):
    k = len(q)
    s_hat = A.T.dot(query)
    m, n = A.shape
    s = np.zeros(n) 
    for i in range(k-1):
        q_dot_query = q[i] @ s_hat
        s = s + q_dot_query * q[i]

    return s



In [8]:
query = dataset.query_vectors[:,0]
similaries = response(q, query, dataset.doc_matrix)

In [9]:
similaries

array([-0.00035625,  0.000507  ,  0.00445285, ..., -0.00360753,
       -0.00189454, -0.00765391])

# Vector Space Model ranked retrieval

Create the `vsm()` object.

In [10]:
vsm = VectorSpaceModel(dataset.doc_matrix)
m, n = vsm.A.shape

In [11]:
vsm.preprocess(150)

In [12]:
epsilon = np.finfo(np.float64).eps
epsilon = np.sqrt(epsilon)
epsilon

1.4901161193847656e-08

In [13]:
count = 0
for i in range(5):
    x = vsm.lanczos_vectors[i] @ vsm.lanczos_vectors[i]
    print(x)

0.9999999999999999
1.0
1.0
0.9999999999999997
1.0


In [14]:
svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)

In [15]:
Ak = svd.fit_transform(vsm.A)

In [16]:
svd.components_.shape

(50, 1033)

In [17]:
dataset.query_vectors[:,0].T.dot(Ak).dot(np.diag(svd.singular_values_)).dot(svd.components_)

array([[0.00660732, 0.01935342, 0.00466614, ..., 0.00988035, 0.00471508,
        0.03267004]])

In [18]:
epsilon = np.finfo(np.float64).eps
epsilon

2.220446049250313e-16

In [19]:
vsm.A.shape

(13004, 1033)

In [20]:
vsm.response(dataset.query_vectors[:,0])

In [21]:
vsm.scores
scores = vsm.scores/vsm.norms
sorted_indices = np.argsort(-np.abs(scores))
sorted_indices[:37]

array([ 71, 168, 167, 211, 163, 141, 184, 169,  14, 499, 183, 210, 512,
       137, 171, 510, 166,  78,  12, 505, 180, 511, 170, 185, 179, 165,
       181,  13, 509, 502, 500, 508, 504, 212,  73, 503, 506], dtype=int64)

In [22]:
vsm.scores[212]

0.045748745910814466

In [23]:
sorted(sorted_indices[:37])

[12,
 13,
 14,
 71,
 73,
 78,
 137,
 141,
 163,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 179,
 180,
 181,
 183,
 184,
 185,
 210,
 211,
 212,
 499,
 500,
 502,
 503,
 504,
 505,
 506,
 508,
 509,
 510,
 511,
 512]

In [24]:
x, y = vsm.implicit_qr_algorithm(vsm.alpha, vsm.beta, tolerance=1e-15)

In [29]:
x

array([ 1.79715603e+01,  8.88131985e+00,  8.41478754e+00,  6.67575157e+00,
        6.20844678e+00,  6.17561241e+00,  5.63539267e+00,  5.50441227e+00,
        5.08545570e+00,  4.79266500e+00,  4.73060196e+00,  4.61355480e+00,
        4.37681778e+00,  4.05528828e+00,  3.96642218e+00,  3.82021092e+00,
        3.76811455e+00,  3.61501403e+00,  3.49159178e+00,  3.43351273e+00,
        3.35423259e+00,  3.27788190e+00,  3.19871330e+00,  3.14935981e+00,
        3.04860642e+00,  3.03882182e+00,  2.97759312e+00,  2.96740808e+00,
        2.87244349e+00,  2.81682278e+00,  2.80353877e+00,  2.74990625e+00,
        2.73798681e+00,  2.70762448e+00,  2.69588556e+00,  2.66066459e+00,
        2.63916856e+00,  2.57422117e+00,  2.57009090e+00,  2.53682185e+00,
        2.51846777e+00,  2.48253101e+00,  2.44329348e+00,  2.43360626e+00,
        2.41519653e+00,  2.37404760e+00,  2.34696990e+00,  2.33264604e+00,
        2.31076316e+00,  2.30407268e+00,  2.29037494e+00,  2.26535010e+00,
        2.24840170e+00,  

In [26]:
y[:,2] @ y[:,2]

1.0

In [27]:
y[:,2:]

array([[ 5.07160131e-002, -3.50446204e-002,  2.58631248e-002, ...,
        -1.36097989e-002,  8.51149399e-003, -2.21763294e-047],
       [-2.66371983e-002,  2.57281248e-002, -2.05326927e-002, ...,
         2.13595625e-002, -1.35081755e-002,  3.58711573e-047],
       [-2.64995697e-001,  1.64287445e-001, -1.15341736e-001, ...,
        -2.13618331e-002,  1.50891714e-002, -4.72639916e-047],
       ...,
       [ 0.00000000e+000,  0.00000000e+000,  4.43192161e-102, ...,
        -2.72677639e-015,  6.16264558e-017,  1.53724983e-001],
       [ 0.00000000e+000,  0.00000000e+000,  4.43639933e-103, ...,
         3.81398605e-016, -9.65199842e-018, -3.84365136e-001],
       [ 0.00000000e+000,  0.00000000e+000,  3.76948453e-104, ...,
         1.58880523e-015, -5.39757408e-017,  9.07983100e-001]])

In [28]:
u, s, v, lsi_norm = vsm.lsi_preprocess(50)

AttributeError: 'VectorSpaceModel' object has no attribute 'lsi_preprocess'

In [None]:
u = u[:,::-1]; s = s[::-1]; v = v[::-1,:]

In [None]:
s*s

array([17.97156031,  8.88648343,  8.40962395,  6.67575198,  6.20844665,
        6.17561213,  5.63539271,  5.50441223,  5.0854557 ,  4.792665  ,
        4.73060196,  4.6135548 ,  4.37681778,  4.05528842,  3.96642204,
        3.82021094,  3.76811452,  3.61501403,  3.49159178,  3.43351273,
        3.35423259,  3.2778819 ,  3.1987133 ,  3.14935981,  3.04956265,
        3.03786559,  2.97760444,  2.96739676,  2.87244349,  2.81682278,
        2.80353877,  2.74990626,  2.7379868 ,  2.70762448,  2.69588556,
        2.66066459,  2.63916856,  2.57427706,  2.57003502,  2.53682185,
        2.51846777,  2.48253101,  2.4432937 ,  2.43360604,  2.41519653,
        2.3740476 ,  2.34696991,  2.33276486,  2.31077417,  2.30418256])

In [None]:
u.dot(np.diag(s)).dot(v) - vsm.A

matrix([[-1.23851086e-03, -6.77642271e-03,  1.75600675e-03, ...,
         -2.73971600e-03,  3.23002952e-04,  4.62797551e-04],
        [ 3.24747405e-04,  1.86588899e-05,  6.66474952e-04, ...,
         -1.90873216e-04,  9.47459340e-04, -3.32363139e-04],
        [ 3.85245188e-04, -2.16897563e-03, -1.74493466e-03, ...,
         -8.78831343e-04, -1.85708090e-03,  8.11170417e-04],
        ...,
        [-2.48408938e-03, -7.39168914e-04, -7.38590473e-04, ...,
         -5.38463431e-04,  8.40047347e-04, -1.30317090e-03],
        [ 3.21195300e-04, -4.56321554e-04, -5.16076834e-04, ...,
         -1.35420452e-03, -6.13147852e-05,  3.53526526e-04],
        [ 1.47942354e-03, -1.32830014e-03,  6.10283709e-03, ...,
          1.65419214e-03, -2.33065922e-03, -3.41978820e-03]])

In [None]:
vsm.A

<13004x1033 sparse matrix of type '<class 'numpy.float64'>'
	with 63015 stored elements in Compressed Sparse Column format>

In [None]:
vsm.scores

array([0.00892056, 0.0121185 , 0.00450396, ..., 0.00237615, 0.00534256,
       0.00943086])

In [None]:
res = vsm.lsi_response(u, s, v, dataset.query_vectors[:,0])
res 

array([[-0.00069021, -0.00829348,  0.00223858, ..., -0.00834512,
         0.00210829,  0.01657744]])

In [None]:
epsilon = np.finfo(np.float64).eps
epsilon

2.220446049250313e-16

In [None]:
scores = vsm.scores[::-1]


In [None]:
np.linalg.norm(scores - res)

0.7528123726726272

In [None]:
res/lsi_norm

array([[-0.00082315, -0.01119094,  0.00291302, ..., -0.01268348,
         0.00344351,  0.02385357]])

In [None]:
vsm.scores/np.sqrt(vsm.norms)

array([0.06722867, 0.07689126, 0.03923905, ..., 0.01376622, 0.07265981,
       0.05847602])

In [None]:
np.sqrt(8.30)

2.8809720581775866