In [1]:
from load_data_json import LoadDataset 
from preprocess import VectorSpaceModel
import numpy as np

## Load the dataset
Load the document collection, the given queries, and the relevant documents associated with each query.

In [2]:
# Build the dataset from the three MED files: the document collection,
# the queries, and the relevance judgements (qrels) for those queries.
dataset = LoadDataset("./data/med/med.json", "./data/med/queries.json", "./data/med/qrels-treceval.txt")

# Both matrices share the same first dimension (13004 in the output below);
# presumably rows are vocabulary terms and columns are documents / queries
# -- TODO confirm in the `load_data_json` module.
print(dataset.doc_matrix.shape)
print(dataset.query_vectors.shape)

(13004, 1033)
(13004, 30)


In [3]:
print(dataset.query_vectors[:,0])

  (6870, 0)	0.3861302751188736
  (6094, 0)	0.4301832750283395
  (5743, 0)	0.6021247465649711
  (3118, 0)	0.5507191208743205


# Vector Space Model ranked retrieval

Create the `VectorSpaceModel` object, `vsm`.

In [3]:
# Wrap the document matrix in the vector space model. The model exposes it as
# `vsm.A`; the printed shape matches the input matrix (see output below).
vsm = VectorSpaceModel(dataset.doc_matrix)
print(vsm.A.shape)

(13004, 1033)


In [4]:
# Sanity check: multiply the transposed matrix by a random unit vector.
A_T = vsm.A.transpose()
# Take the vector length from the matrix itself instead of hard-coding 13004,
# so the cell keeps working if the first dimension of `vsm.A` changes.
n_rows = vsm.A.shape[0]
# Seeded generator so that Restart & Run All reproduces the same vector.
rng = np.random.default_rng(0)
v = rng.random(n_rows)
# Normalise to unit Euclidean length.
q = v / np.linalg.norm(v)
A_T.dot(q)

array([0.03112345, 0.05077757, 0.04616636, ..., 0.05873582, 0.06943247,
       0.07040629])

In [3]:
# Machine epsilon of the default float type (value shown in the output below).
print(np.finfo(float).eps)

2.220446049250313e-16


In [4]:
# Run the model's preprocessing step; it populates `vsm.lanczos_vectors`, inspected below.
# NOTE(review): presumably the argument is the number of Lanczos
# iterations / vectors to build -- TODO confirm in the `preprocess` module.
vsm.preprocess(100)

0


In [5]:
# NOTE(review): this displays the whole list of Lanczos vectors; the saved
# output is long and truncated. Consider showing only a count or a small slice.
vsm.lanczos_vectors

[array([0.02953393, 0.03404758, 0.05116358, ..., 0.01105339, 0.05166173,
        0.0283999 ]),
 array([-0.0018994 ,  0.00748001, -0.02552774, ...,  0.05438032,
        -0.04274423,  0.02499027]),
 array([ 0.0606613 ,  0.02597268, -0.004243  , ..., -0.00137907,
        -0.01001159, -0.03827206]),
 array([-0.0344269 , -0.01808052,  0.01040766, ..., -0.02244237,
         0.02913408,  0.01320696]),
 array([ 0.02394765,  0.0607454 , -0.00082289, ...,  0.01165395,
        -0.04496608, -0.01157485]),
 array([ 0.03746176, -0.01018725,  0.03254818, ..., -0.04931002,
         0.03189908, -0.01313341]),
 array([ 0.03933549,  0.01100386, -0.03457655, ..., -0.00013585,
        -0.02651992,  0.01153597]),
 array([-0.09202104,  0.02493435,  0.02107183, ..., -0.00289715,
         0.00408988, -0.03256058]),
 array([ 0.01281553, -0.04469506, -0.00342523, ..., -0.0226512 ,
         0.02002602, -0.00592423]),
 array([-0.02854304, -0.00014451,  0.03363937, ...,  0.01156946,
        -0.05210681,  0.0023486 

In [5]:
# Orthogonality check: count (and print) every pair of distinct Lanczos
# vectors whose dot product exceeds sqrt(machine epsilon) in magnitude.
vectors = vsm.lanczos_vectors
# The tolerance is loop-invariant, so compute it once.
tol = np.sqrt(np.finfo(float).eps)

count = 0
# Iterate over the vectors that actually exist rather than a hard-coded 100,
# so the check cannot raise IndexError if fewer vectors were produced.
for i in range(len(vectors)):
    for j in range(i):
        x = np.dot(vectors[i], vectors[j])
        if abs(x) > tol:
            count += 1
            print(x)
print(count)

2.906485776534018e-08
1.9933821326229006e-08
2.3536050141118037e-07
1.614194912180622e-07
2.0694240373403394e-08
1.9638909101154373e-06
1.3469136509345822e-06
1.726765564494248e-07
3.2807830140091365e-08
1.9294795452270887e-05
1.3233129855716105e-05
1.6965092361339792e-06
3.22329869242681e-07
6.509470768835401e-08
0.00020695875526839397
0.00014194045704237976
1.8197002869802664e-05
3.457357201948688e-06
6.982157440202363e-07
9.199375822655648e-08
0.0022882418587168683
0.001569366296680308
0.00020119537331728457
3.822631208444567e-05
7.719832515979547e-06
1.0171312946841843e-06
1.5615712928296333e-07
3.267880785591537e-08
0.027268118119878955
0.018701548261517813
0.002397569645682728
0.0004555286032283841
9.19943518062176e-05
1.2120776910000755e-05
1.8608720056146472e-06
3.894266262533669e-07
6.017024023821649e-08
0.25969495897010453
0.1781090204780098
0.022833873199661688
0.004338344233687102
0.0008761319820724464
0.00011543536738910579
1.7722518097521012e-05
3.708826030693335e-06
5.73

In [6]:
# NOTE(review): as saved, this call fails with "ValueError: dimension mismatch"
# (see the output below). The argument is the first column of the query
# matrix; the shape expected by `response` needs to be confirmed in the
# `preprocess` module before this cell can run cleanly.
vsm.response(dataset.query_vectors[:,0])

ValueError: dimension mismatch

In [None]:
# Show a sample of the inverted index: the first 10 terms and, for each term,
# the first 10 document ids from its entry.
print("The inverted index of the collection: first 10 terms and first 10 documents per term")
# Descriptive loop names; the previous `v` silently overwrote the random
# vector `v` defined in an earlier cell.
for term, postings in list(vsm.index.items())[:10]:
    print("Term: ", term, "\tDocs: ", list(postings)[:10], "\n")

The inverted index of the collection: first 10 terms and first 10 documents per term
Term:  correl 	Docs:  [0, 385, 904, 905, 394, 911, 147, 148, 277, 153] 

Term:  matern 	Docs:  [0, 4, 5, 11, 303, 706, 324, 326, 328, 331] 

Term:  fetal 	Docs:  [0, 1, 2, 3, 4, 5, 904, 11, 277, 936] 

Term:  plasma 	Docs:  [0, 1024, 4, 5, 516, 1031, 397, 273, 147, 149] 

Term:  level 	Docs:  [0, 1, 1024, 3, 516, 517, 9, 523, 524, 525] 

Term:  glucos 	Docs:  [0, 640, 4, 518, 9, 146, 413, 297, 564, 181] 

Term:  free 	Docs:  [0, 385, 3, 4, 393, 11, 652, 275, 659, 281] 

Term:  fatti 	Docs:  [0, 386, 4, 5, 6, 7, 9, 137, 157, 158] 

Term:  acid 	Docs:  [0, 1, 4, 5, 6, 7, 520, 9, 1030, 526] 

Term:  coeffici 	Docs:  [0, 40, 305, 401, 532, 501, 502, 628, 507] 



Represent the documents as vectors.

In [None]:
docs_vector = vsm.docs_as_vectors()

In [None]:
docs_vector.shape 

(1032, 9685)

Represent the queries as vectors.

In [None]:
# NOTE(review): `queries` is not defined in any cell visible above this one --
# confirm it is loaded before this cell runs on a fresh kernel.
example_query = queries[1]
print("Example query: ", example_query)

Example query:   the crystalline lens in vertebrates including humans


In [None]:
query_vector = vsm.query_as_vector(example_query)

In [None]:
query_vector.shape

(9685,)

Compute the relevance scores as the cosine similarity.

In [None]:
scores = vsm.relevance_scores(example_query)

In [None]:
# Show the score of the first 10 entries of `scores` only.
print("The relevance scores for the given query, for each document:")
first_ten = list(scores.items())[:10]
for doc_id, score in first_ten:
    print("DocID: ", doc_id, "\tScore: ", score, "\n")

The relevance scores for the given query, for each document:
DocID:  0 	Score:  0 

DocID:  1 	Score:  0 

DocID:  2 	Score:  0 

DocID:  3 	Score:  0.021291796307282396 

DocID:  4 	Score:  0 

DocID:  5 	Score:  0 

DocID:  6 	Score:  0 

DocID:  7 	Score:  0 

DocID:  8 	Score:  0.056132485052834566 

DocID:  9 	Score:  0 



Return the top 10 documents given the query.

In [None]:
example_query

' the crystalline lens in vertebrates including humans'

In [None]:
k = 10
top10 = vsm.vector_space_model(example_query, k)

In [None]:
# List every (document id, score) pair returned for the example query.
print(f"The relevance scores for the top {k} documents:")
for doc_id, score in top10.items():
    print("DocID: ", doc_id, "\tScore: ", score, "\n")

The relevance scores for the top 10 documents:
DocID:  71 	Score:  0.32326427430871757 

DocID:  499 	Score:  0.2662094450787526 

DocID:  180 	Score:  0.2179920012480751 

DocID:  170 	Score:  0.19139097770770058 

DocID:  14 	Score:  0.19027283535863754 

DocID:  964 	Score:  0.18674186882310087 

DocID:  165 	Score:  0.18475982417434741 

DocID:  512 	Score:  0.17601476449236989 

DocID:  137 	Score:  0.16698722946249522 

DocID:  359 	Score:  0.1503014631326245 



### Relevance Feedback

In [None]:
# NOTE(review): `relevance` is not defined in any cell visible above this one --
# confirm it is loaded before this cell runs on a fresh kernel.
example_relevant = relevance[1]
print("Relevant documents for the example query: ", example_relevant)

Relevant documents for the example query:  [12, 13, 14, 71, 78, 137, 141, 163, 164, 165, 166, 167, 168, 169, 170, 171, 179, 180, 181, 182, 183, 184, 185, 210, 211, 498, 499, 500, 501, 502, 503, 505, 506, 507, 509, 510, 512]


In [None]:
# Every document id that is not in the relevant list counts as non-relevant.
# Membership is tested against a set: looking each id up in the list would
# rescan the whole list for every document.
relevant_ids = set(example_relevant)
example_non_relevant = [i for i in range(vsm.n_docs) if i not in relevant_ids]
print("20 of the non relevant documents for the example query: ", example_non_relevant[:20])

20 of the non relevant documents for the example query:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 19, 20, 21, 22]


In [None]:
opt_query = vsm.relevance_feedback_rocchio(example_query, example_relevant, example_non_relevant, alpha=1, beta=.75, gamma=.15)

In [None]:
top10_rel = vsm.vector_space_model(opt_query, k)

In [None]:
# List every (document id, score) pair retrieved with the feedback-adjusted query.
print(f"The relevance scores for the top {k} documents retrieved using relevance feedback:")
for doc_id, score in top10_rel.items():
    print("DocID: ", doc_id, "\tScore: ", score, "\n")

The relevance scores for the top 10 documents retrieved using relevance feedback:
DocID:  498 	Score:  0.4975825553600744 

DocID:  180 	Score:  0.48248989126391234 

DocID:  510 	Score:  0.472073021470223 

DocID:  12 	Score:  0.4560654940412232 

DocID:  179 	Score:  0.45186607965425535 

DocID:  508 	Score:  0.44791318658586426 

DocID:  164 	Score:  0.42581831309501617 

DocID:  165 	Score:  0.42358178294680243 

DocID:  499 	Score:  0.4230008437357246 

DocID:  503 	Score:  0.4203885699026642 



### Pseudo-Relevance Feedback

In [None]:
opt_query_pseudo = vsm.pseudo_relevance_feedback(example_query, k=10)

In [None]:
top10_pseudo = vsm.vector_space_model(opt_query_pseudo, k)

In [None]:
# List every (document id, score) pair retrieved with the pseudo-feedback query.
print(f"The relevance scores for the top {k} documents retrieved using pseudo relevance feedback:")
for doc_id, score in top10_pseudo.items():
    print("DocID: ", doc_id, "\tScore: ", score, "\n")

The relevance scores for the top 10 documents retrieved using pseudo relevance feedback:
DocID:  14 	Score:  0.5571136148407742 

DocID:  165 	Score:  0.5392188837587554 

DocID:  512 	Score:  0.5216182443784564 

DocID:  499 	Score:  0.48581512098078977 

DocID:  180 	Score:  0.46169998787542205 

DocID:  170 	Score:  0.4382538945134972 

DocID:  71 	Score:  0.4340505106725038 

DocID:  964 	Score:  0.3458494017629007 

DocID:  137 	Score:  0.3081767502378574 

DocID:  12 	Score:  0.30238538545091015 



### Performance evaluation

In [None]:
import numpy as np

Computing precision and recall for the given set of queries and relevant documents.

In [None]:
# Cut-offs range from 1 up to the size of the largest relevant set.
# (The former `map`/`prec`/`rec` placeholder dicts were dropped: `prec` and
# `rec` are assigned right here, and `map` only shadowed the builtin.)
min_k = 1
max_k = max(len(docs) for docs in relevance.values())

# The results are indexed by cut-off k in the cells below (e.g. `prec[k]`).
prec, rec = vsm.precision_recall(queries, relevance, min_k, max_k)

In [None]:
prec_rel, rec_rel = vsm.precision_recall(queries, relevance, min_k, max_k, rel_feedback=True)

In [None]:
prec_pseudo, rec_pseudo = vsm.precision_recall(queries, relevance, min_k, max_k, pseudo_feedback=True)

### Average precision and recall and Mean Average Precision (MAP)

In [None]:
def max_avg_prec(precision, queries, relevance):
    """Compute the mean, over queries, of the per-query averaged precision.

    For each query id ``qid`` in ``queries`` the values
    ``precision[k][qid - 1]`` are averaged over the cut-offs
    ``k = 1 .. len(relevance[qid])``; the function returns the mean of those
    per-query averages.

    Args:
        precision: mapping from cut-off ``k`` to a sequence of precision
            values, indexed by ``qid - 1``.
        queries: mapping whose keys are the query ids to evaluate.
        relevance: mapping from query id to the collection of documents
            relevant to that query.

    Returns:
        The mean averaged precision as a float. Queries without any relevant
        document are left out of the mean (their average is undefined); if no
        query can be evaluated, including when ``queries`` is empty, ``0.0``
        is returned instead of raising ``ZeroDivisionError``.
    """
    avg_precisions = []
    for qid in queries.keys():
        n_relevant = len(relevance[qid])
        if n_relevant == 0:
            # No cut-offs to average over: skip rather than divide by zero.
            continue
        total = sum(precision[k][qid - 1] for k in range(1, n_relevant + 1))
        avg_precisions.append(total / n_relevant)

    if not avg_precisions:
        return 0.0
    return sum(avg_precisions) / len(avg_precisions)

1. For the "standard" case

In [None]:
# Report the mean precision and recall over all queries for cut-offs 5..14.
for k in range(5, 15):
    mean_precision = np.mean(prec[k])
    mean_recall = np.mean(rec[k])
    print(f"K={k}\tAverage Precision: {mean_precision}\t Average Recall: {mean_recall}")

K=5	Average Precision: 0.6827586206896552	 Average Recall: 0.16281932949952388
K=6	Average Precision: 0.6551724137931034	 Average Recall: 0.19058144148202566
K=7	Average Precision: 0.6403940886699508	 Average Recall: 0.21561975105582612
K=8	Average Precision: 0.6422413793103449	 Average Recall: 0.2472395322148463
K=9	Average Precision: 0.6513409961685824	 Average Recall: 0.2792880888851007
K=10	Average Precision: 0.6413793103448275	 Average Recall: 0.3074866616191644
K=11	Average Precision: 0.6175548589341692	 Average Recall: 0.32182840087469666
K=12	Average Precision: 0.6091954022988506	 Average Recall: 0.3435653605129643
K=13	Average Precision: 0.5941644562334217	 Average Recall: 0.361982461771145
K=14	Average Precision: 0.5837438423645319	 Average Recall: 0.3843476271535518


In [None]:
map = max_avg_prec(prec, queries, relevance)

In [None]:
print(f"MAP: {map}")

MAP: 0.6296379359580575


2. With relevance feedback, using the known set of relevant documents as the feedback set

In [None]:
# Report the mean precision and recall over all queries for cut-offs 5..14
# (relevance-feedback run).
for k in range(5, 15):
    mean_precision = np.mean(prec_rel[k])
    mean_recall = np.mean(rec_rel[k])
    print(f"K={k}\tAverage Precision: {mean_precision}\t Average Recall: {mean_recall}")

K=5	Average Precision: 0.9586206896551723	 Average Recall: 0.23514480661536444
K=6	Average Precision: 0.9540229885057472	 Average Recall: 0.28206434548418724
K=7	Average Precision: 0.9458128078817735	 Average Recall: 0.32513901843917864
K=8	Average Precision: 0.9482758620689655	 Average Recall: 0.37123503786379247
K=9	Average Precision: 0.9425287356321839	 Average Recall: 0.41206285805468973
K=10	Average Precision: 0.9344827586206896	 Average Recall: 0.45115531341052195
K=11	Average Precision: 0.9247648902821317	 Average Recall: 0.49087010231597417
K=12	Average Precision: 0.9137931034482759	 Average Recall: 0.5269035789194244
K=13	Average Precision: 0.8992042440318303	 Average Recall: 0.5588493169523775
K=14	Average Precision: 0.8866995073891624	 Average Recall: 0.5895345976903904


In [None]:
map_rel = max_avg_prec(prec_rel, queries, relevance)

In [None]:
print(f"MAP allowing relevance feedback: {map_rel}")

MAP allowing relevance feedback: 0.9321421215043031


3. With pseudo-relevance feedback, treating the top 10 retrieved documents as relevant

In [None]:
# Report the mean precision and recall over all queries for cut-offs 5..14
# (pseudo-relevance-feedback run).
for k in range(5, 15):
    mean_precision = np.mean(prec_pseudo[k])
    mean_recall = np.mean(rec_pseudo[k])
    print(f"K={k}\tAverage Precision: {mean_precision}\t Average Recall: {mean_recall}")

K=5	Average Precision: 0.7793103448275862	 Average Recall: 0.19113517390258328
K=6	Average Precision: 0.7586206896551723	 Average Recall: 0.22187928788908468
K=7	Average Precision: 0.7389162561576356	 Average Recall: 0.24887198088278584
K=8	Average Precision: 0.7112068965517241	 Average Recall: 0.27617769102957546
K=9	Average Precision: 0.7011494252873562	 Average Recall: 0.30235656890286466
K=10	Average Precision: 0.6896551724137929	 Average Recall: 0.32787548666316174
K=11	Average Precision: 0.677115987460815	 Average Recall: 0.35559589840557104
K=12	Average Precision: 0.6781609195402301	 Average Recall: 0.3871030980689513
K=13	Average Precision: 0.6631299734748012	 Average Recall: 0.4105198274904371
K=14	Average Precision: 0.6576354679802955	 Average Recall: 0.4341823353436082


In [None]:
map_pseudo = max_avg_prec(prec_pseudo, queries, relevance)

In [None]:
print(f"MAP allowing pseudo-relevance feedback: {map_pseudo}")

MAP allowing pseudo-relevance feedback: 0.7072055147428038
