In [1]:
from load_data_json import LoadDataset 
from preprocess import VectorSpaceModel
import numpy as np
import time
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds, eigsh

## Load the dataset
Load the document collection, the given queries, and the relevance judgments that associate each query with its relevant documents.

In [2]:
# Build the dataset object from three files under ./data/med: the collection
# (med.json), the queries (queries.json) and the qrels file
# (qrels-treceval.txt). The role of each file is inferred from its name --
# confirm against LoadDataset.
dataset = LoadDataset("./data/med/med.json", "./data/med/queries.json", "./data/med/qrels-treceval.txt")

# Vector Space Model ranked retrieval

Create the `VectorSpaceModel` object (`vsm`) from the dataset's document matrix.

In [3]:
# Wrap the dataset's document matrix in a VectorSpaceModel.
vsm = VectorSpaceModel(dataset.doc_matrix)
# Row and column counts of the matrix held by the model (vsm.A).
# Presumably rows are terms and columns are documents -- TODO confirm.
m, n = vsm.A.shape

In [4]:
# Run the model's preprocessing step and keep its four return values.
# NOTE(review): the meaning of the literal 60 is not visible in this notebook
# (it looks like a step/dimension count) -- confirm against
# VectorSpaceModel.preprocess.
alpha, beta, q, a = vsm.preprocess(60)

In [5]:
# Evaluate the model's response for the first column of the query-vector
# matrix and display only the shape of the result.
vsm.response(alpha, beta, q, dataset.query_vectors[:,0]).shape

(1033,)

In [6]:
# Run the model's implicit QR routine on alpha and beta with a tolerance of
# 1e-10 and keep both return values.
# NOTE(review): x and y are rebound by a later bisec_PDDP cell, so cells that
# read x depend on execution order.
x, y = vsm.implicit_qr_algorithm(alpha, beta, tolerance=1e-10)

In [7]:
# Display x, the first of the two values returned by implicit_qr_algorithm.
x

array([ 1.79715603e+01,  8.87475193e+00,  8.42135545e+00,  6.67575043e+00,
        6.20844720e+00,  6.17561312e+00,  5.63539265e+00,  5.50441229e+00,
        5.08545570e+00,  4.79266500e+00,  4.73060196e+00,  4.61355480e+00,
        4.37681778e+00,  4.05515753e+00,  3.96655294e+00,  3.82021092e+00,
        3.76811454e+00,  3.61501328e+00,  3.49159083e+00,  3.43351020e+00,
        3.35420769e+00,  3.19873044e+00,  3.11235775e+00,  3.03885358e+00,
        2.94954452e+00,  2.85844165e+00,  2.73938073e+00,  2.65032894e+00,
        2.53326716e+00,  2.42617101e+00,  2.30692138e+00,  2.18349031e+00,
        2.06550498e+00,  1.94692166e+00,  1.83314524e+00,  1.74148955e+00,
        1.59077839e+00,  1.50230738e+00,  1.39031875e+00,  1.28038405e+00,
        1.18870768e+00,  1.08110876e+00,  9.92536263e-01,  9.03751306e-01,
        8.16198941e-01,  7.35558599e-01,  6.58237528e-01,  5.86569530e-01,
        5.15722993e-01,  4.56689402e-01,  4.04112972e-01,  3.50722328e-01,
        3.09991560e-01,  

In [8]:
# Compute 50 singular triplets of vsm.A with SciPy's svds (its default
# settings select the largest singular values). Presumably this serves as a
# reference for the values in x -- TODO confirm.
u, s, vt = svds(vsm.A, k=50)

In [9]:
# Elementwise square of the singular values returned by svds.
np.square(s)

array([ 2.30418256,  2.31077417,  2.33276486,  2.34696991,  2.3740476 ,
        2.41519653,  2.43360604,  2.4432937 ,  2.48253101,  2.51846777,
        2.53682185,  2.57003502,  2.57427706,  2.63916856,  2.66066459,
        2.69588556,  2.70762448,  2.7379868 ,  2.74990626,  2.80353877,
        2.81682278,  2.87244349,  2.96739676,  2.97760444,  3.03786559,
        3.04956265,  3.14935981,  3.1987133 ,  3.2778819 ,  3.35423259,
        3.43351273,  3.49159178,  3.61501403,  3.76811452,  3.82021094,
        3.96642204,  4.05528842,  4.37681778,  4.6135548 ,  4.73060196,
        4.792665  ,  5.0854557 ,  5.50441223,  5.63539271,  6.17561213,
        6.20844665,  6.67575198,  8.40962395,  8.88648343, 17.97156031])

In [10]:
# Call the model's bisec_PDDP with no arguments and keep both return values.
# NOTE(review): this rebinds x and y, which earlier held the results of
# implicit_qr_algorithm; distinct names would avoid order-dependent state.
x, y = vsm.bisec_PDDP()

In [17]:
# Call bisec_PDDP again, this time passing x, and keep both return values.
# NOTE(review): the execution counts around this cell are out of sequence
# (17 and 22 appear before 12), so the value of x used here depends on the
# order the cells were run in -- re-check with Restart & Run All.
x1, x2 = vsm.bisec_PDDP(x)

In [22]:
# Display x2, the second value returned by bisec_PDDP(x).
x2

array([ 12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  26,  30,
        31,  33,  34,  35,  37,  38,  39,  40,  42,  43,  45,  46,  49,
        51,  55,  58,  59,  60,  64,  65,  66,  67,  68,  69,  70,  71,
        75,  84,  85,  91,  92,  93,  94,  95,  96,  97, 119, 121, 126,
       129, 130, 131, 132, 133, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 150, 154, 161, 171, 177, 178, 179, 184,
       192, 194, 195, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210,
       212, 214, 215, 216, 218, 219, 220, 221, 222, 223, 224, 225, 229,
       240, 242, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
       255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267,
       268, 269, 270, 271, 272, 273, 274, 275, 307, 309, 310, 311, 312,
       314, 316, 317, 320, 321, 322, 323, 324, 325, 327, 331, 332, 334,
       335, 336, 338, 340, 341, 342, 344, 345, 347, 351, 352, 354, 357,
       358, 359, 362, 364, 365, 367, 368, 371, 377, 378, 379, 38

In [12]:
# 0.55-quantile of the values in x.
np.quantile(x, q=0.55)

522.85

In [13]:
# Largest value in x.
np.max(x)

1030

In [14]:
# Median of the values in x; used as the split threshold in the next cell.
median = np.median(x)

In [15]:
# Positions within x whose value is at or above the median.
# NOTE(review): `left` holds positions in x, not the values of x -- confirm
# that these positions are the intended row indices of vsm.A.
left = np.nonzero(x >= median)[0]
# Select those rows of the model matrix; the sliced matrix is the cell output.
vsm.A[left, :]

<284x1033 sparse matrix of type '<class 'numpy.float64'>'
	with 984 stored elements in Compressed Sparse Column format>