In [1]:
from load_data import LoadDataset 
from vsm import VectorSpaceModel

## Load the dataset
Load the document collection, the queries, and the relevant documents associated with each query.

In [2]:
# Build the loader from the three MED files (MED.ALL, MED.QRY, MED.REL).
# Presumably these are, in order, the documents, the queries and the relevance
# judgments, matching the three load_* calls below -- confirm against LoadDataset.
dataset = LoadDataset("./med_data/MED.ALL", "./med_data/MED.QRY", "./med_data/MED.REL")

# `queries` is indexed by query id and yields the query text; `relevance` is
# indexed by the same ids and yields the list of relevant document ids
# (both are used that way further down in this notebook).
docs = dataset.load_docs()
queries = dataset.load_queries()
relevance = dataset.load_relevance()

# Vector Space Model ranked retrieval

Create the `VectorSpaceModel` object from the loaded documents.

In [3]:
vsm = VectorSpaceModel(docs)

In [5]:
print("Number of documents in the collection: ", vsm.n_docs)
print("Number of terms in the collection: ", vsm.n_terms)

Number of documents in the collection:  1032
Number of terms in the collection:  9685


In [6]:
# Peek at the inverted index: the first 10 terms and, for each of them,
# the first 10 document ids of its posting collection.
print("The inverted index of the collection: first 10 terms and first 10 documents per term")
first_entries = list(vsm.index.items())[:10]
for term, postings in first_entries:
    sample_docs = list(postings)[:10]
    print("Term: ", term, "\tDocs: ", sample_docs, "\n")

The inverted index of the collection: first 10 terms and first 10 documents per term
Term:  correl 	Docs:  [0, 385, 904, 905, 394, 911, 147, 148, 277, 153] 

Term:  matern 	Docs:  [0, 4, 5, 11, 303, 706, 324, 326, 328, 331] 

Term:  fetal 	Docs:  [0, 1, 2, 3, 4, 5, 904, 11, 277, 936] 

Term:  plasma 	Docs:  [0, 1024, 4, 5, 516, 1031, 397, 273, 147, 149] 

Term:  level 	Docs:  [0, 1, 1024, 3, 516, 517, 9, 523, 524, 525] 

Term:  glucos 	Docs:  [0, 640, 4, 518, 9, 146, 413, 297, 564, 181] 

Term:  free 	Docs:  [0, 385, 3, 4, 393, 11, 652, 275, 659, 281] 

Term:  fatti 	Docs:  [0, 386, 4, 5, 6, 7, 9, 137, 157, 158] 

Term:  acid 	Docs:  [0, 1, 4, 5, 6, 7, 520, 9, 1030, 526] 

Term:  coeffici 	Docs:  [0, 40, 305, 401, 532, 501, 502, 628, 507] 



Represent the documents as vectors (one row per document, one column per term).

In [7]:
docs_vector = vsm.docs_as_vectors()

In [8]:
docs_vector.shape

(1032, 9685)

Represent a query as a vector in the same term space.

In [9]:
example_query = queries[1]
print("Example query: ", example_query)

Example query:   the crystalline lens in vertebrates including humans


In [10]:
query_vector = vsm.query_as_vector(example_query)

In [11]:
query_vector.shape

(9685,)

Compute the relevance scores as the cosine similarity.

In [12]:
scores = vsm.relevance_scores(example_query)

In [13]:
# Show only the first ten (document id, score) pairs of the score mapping.
print("The relevance scores for the given query, for each document:")
preview = list(scores.items())[:10]
for doc_id, score in preview:
    print("DocID: ", doc_id, "\tScore: ", score, "\n")

The relevance scores for the given query, for each document:
DocID:  0 	Score:  0 

DocID:  1 	Score:  0 

DocID:  2 	Score:  0 

DocID:  3 	Score:  0.021291796307282396 

DocID:  4 	Score:  0 

DocID:  5 	Score:  0 

DocID:  6 	Score:  0 

DocID:  7 	Score:  0 

DocID:  8 	Score:  0.056132485052834566 

DocID:  9 	Score:  0 



Return the top 10 documents given the query.

In [14]:
k = 10
top10 = vsm.vector_space_model(example_query, k)

In [15]:
# List every (document id, score) pair returned for the top-k retrieval.
print(f"The relevance scores for the top {k} documents:")
for doc_id, score in top10.items():
    print("DocID: ", doc_id, "\tScore: ", score, "\n")

The relevance scores for the top 10 documents:
DocID:  71 	Score:  0.32326427430871757 

DocID:  499 	Score:  0.2662094450787526 

DocID:  180 	Score:  0.2179920012480751 

DocID:  170 	Score:  0.19139097770770058 

DocID:  14 	Score:  0.19027283535863754 

DocID:  964 	Score:  0.18674186882310087 

DocID:  165 	Score:  0.18475982417434741 

DocID:  512 	Score:  0.17601476449236989 

DocID:  137 	Score:  0.16698722946249522 

DocID:  359 	Score:  0.1503014631326245 



### Relevance Feedback

In [16]:
example_relevant = relevance[1]
print("Relevant documents for the example query: ", example_relevant)

Relevant documents for the example query:  [13, 14, 15, 72, 79, 138, 142, 164, 165, 166, 167, 168, 169, 170, 171, 172, 180, 181, 182, 183, 184, 185, 186, 211, 212, 499, 500, 501, 502, 503, 504, 506, 507, 508, 510, 511, 513]


In [17]:
# Every document id that is not judged relevant is treated as non-relevant.
# A set gives O(1) membership tests; scanning the list for each of the
# n_docs candidates would be quadratic.
# NOTE(review): range(vsm.n_docs) enumerates ids 0..n_docs-1 -- confirm that the
# ids in `example_relevant` use the same (0-based) numbering.
relevant_ids = set(example_relevant)
example_non_relevant = [i for i in range(vsm.n_docs) if i not in relevant_ids]
print("20 of the non relevant documents for the example query: ", example_non_relevant[:20])

20 of the non relevant documents for the example query:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 21, 22]


In [18]:
example_query

' the crystalline lens in vertebrates including humans'

In [19]:
opt_query = vsm.relevance_feedback_rocchio(example_query, example_relevant, example_non_relevant, alpha=1, beta=.75, gamma=.15)

In [20]:
opt_query

array([0.08659579, 0.        , 0.05318372, ..., 0.        , 0.        ,
       0.        ])

In [21]:
top10_rel = vsm.vector_space_model(opt_query, k)

In [22]:
# List every (document id, score) pair retrieved with the Rocchio-optimised query.
print(f"The relevance scores for the top {k} documents retrieved using relevance feedback:")
for doc_id, score in top10_rel.items():
    print("DocID: ", doc_id, "\tScore: ", score, "\n")

The relevance scores for the top 10 documents retrieved using relevance feedback:
DocID:  164 	Score:  0.48137913344181765 

DocID:  182 	Score:  0.46978869223488456 

DocID:  510 	Score:  0.45063041448047125 

DocID:  508 	Score:  0.4363811684084592 

DocID:  165 	Score:  0.4323930904084311 

DocID:  499 	Score:  0.4113570032705428 

DocID:  180 	Score:  0.40039072181203417 

DocID:  503 	Score:  0.39874443901547724 

DocID:  14 	Score:  0.3824796119419261 

DocID:  12 	Score:  0.3790946134701084 



### Pseudo-Relevance Feedback

In [23]:
opt_query_pseudo = vsm.pseudo_relevance_feedback(example_query, k)

In [24]:
opt_query_pseudo

array([0., 0., 0., ..., 0., 0., 0.])

In [25]:
top10_pseudo = vsm.vector_space_model(opt_query_pseudo, k)

In [26]:
# These results come from the pseudo-relevance-feedback query, so the heading
# must say so (the previous text was copied from the relevance-feedback cell).
print(f"The relevance scores for the top {k} documents retrieved using pseudo-relevance feedback:")
for doc_id, score in top10_pseudo.items():
    print("DocID: ", doc_id, "\tScore: ", score, "\n")

The relevance scores for the top 10 documents retrieved using relevance feedback:
DocID:  14 	Score:  0.5571136148407742 

DocID:  165 	Score:  0.5392188837587554 

DocID:  512 	Score:  0.5216182443784564 

DocID:  499 	Score:  0.48581512098078977 

DocID:  180 	Score:  0.46169998787542205 

DocID:  170 	Score:  0.4382538945134972 

DocID:  71 	Score:  0.4340505106725038 

DocID:  964 	Score:  0.3458494017629007 

DocID:  137 	Score:  0.3081767502378574 

DocID:  12 	Score:  0.30238538545091015 



### Performance evaluation

In [27]:
import numpy as np

Computing precision and recall for the given set of queries and relevant documents.

In [28]:
# Evaluate the plain vector-space ranking at every cutoff from min_k to max_k.
# (No placeholder dicts: `prec`/`rec` are assigned directly below, and a global
# named `map` would only shadow the builtin.)
min_k = 1
# Size of the largest relevant set over all queries. max_avg_prec reads
# precision[k] for k up to len(relevance[qid]), so this is the highest cutoff needed.
max_k = max(len(rel_docs) for rel_docs in relevance.values())
prec, rec = vsm.precision_recall(queries, relevance, min_k, max_k)

In [29]:
prec_rel, rec_rel = vsm.precision_recall(queries, relevance, min_k, max_k, rel_feedback=True)

In [30]:
prec_pseudo, rec_pseudo = vsm.precision_recall(queries, relevance, min_k, max_k, pseudo_feedback=True)

### Average precision and recall and Mean Average Precision (MAP)

In [51]:
def max_avg_prec(precision, queries, relevance):
    """Return the mean over all queries of a per-query average precision.

    For each query id ``qid`` the per-query value is the mean of
    ``precision[k][qid - 1]`` for ``k = 1 .. len(relevance[qid])``.

    NOTE(review): this averages precision@k over *all* cutoffs up to the number
    of relevant documents; the textbook average precision only averages
    precision at the ranks where a relevant document is retrieved. Kept as is
    because the ranked lists are not available here.

    Parameters
    ----------
    precision : mapping
        ``precision[k]`` is a sequence of precision@k values indexed by
        ``qid - 1`` (assumes query ids are 1-based and contiguous -- verify
        against the caller).
    queries : mapping
        Keyed by query id; only the keys are used.
    relevance : mapping
        ``relevance[qid]`` is the collection of relevant documents of ``qid``.

    Returns
    -------
    float
        The mean of the per-query values. Queries without relevant documents
        contribute 0, and an empty ``queries`` yields 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    if not queries:
        return 0.0

    total = 0.0
    for qid in queries:
        n_relevant = len(relevance[qid])
        if n_relevant == 0:
            # Nothing to average for this query; it counts as 0 in the mean.
            continue
        avg_prec = sum(precision[k][qid - 1] for k in range(1, n_relevant + 1))
        total += avg_prec / n_relevant
    return total / len(queries)

1. For the "standard" case

In [35]:
# Mean precision and recall over all queries for cutoffs 5..14.
# A dedicated loop variable is used on purpose: looping over `k` would overwrite
# the global top-k cutoff `k` that the retrieval cells above rely on.
for cutoff in range(5, 15):
    print(f"K={cutoff}\tAverage Precision: {np.mean(prec[cutoff])}\t Average Recall: {np.mean(rec[cutoff])}")

K=5	Average Precision: 0.503448275862069	 Average Recall: 0.11395175215691679
K=6	Average Precision: 0.4655172413793103	 Average Recall: 0.12483155309319811
K=7	Average Precision: 0.46305418719211816	 Average Recall: 0.14337418082648914
K=8	Average Precision: 0.46120689655172414	 Average Recall: 0.1659212856714537
K=9	Average Precision: 0.46360153256704983	 Average Recall: 0.18679401309215715
K=10	Average Precision: 0.4517241379310344	 Average Recall: 0.20097312896568442
K=11	Average Precision: 0.43573667711598746	 Average Recall: 0.21166300603487181
K=12	Average Precision: 0.43103448275862066	 Average Recall: 0.22714717062724096
K=13	Average Precision: 0.42175066312997345	 Average Recall: 0.2384472828718809
K=14	Average Precision: 0.4137931034482758	 Average Recall: 0.25252164901521257


In [52]:
map = max_avg_prec(prec, queries, relevance)

In [53]:
print(f"MAP: {map}")

MAP: 0.4228927833802282


2. Allowing relevance feedback - considering as relevant the known set of relevant documents

In [38]:
# Mean precision and recall over all queries (relevance feedback) for cutoffs 5..14.
# A dedicated loop variable is used on purpose: looping over `k` would overwrite
# the global top-k cutoff `k` that the retrieval cells above rely on.
for cutoff in range(5, 15):
    print(f"K={cutoff}\tAverage Precision: {np.mean(prec_rel[cutoff])}\t Average Recall: {np.mean(rec_rel[cutoff])}")

K=5	Average Precision: 0.9172413793103448	 Average Recall: 0.2259688792497819
K=6	Average Precision: 0.8908045977011494	 Average Recall: 0.26360931783474817
K=7	Average Precision: 0.8817733990147785	 Average Recall: 0.30121053704042383
K=8	Average Precision: 0.8663793103448276	 Average Recall: 0.3363296597984857
K=9	Average Precision: 0.8467432950191571	 Average Recall: 0.368287866895632
K=10	Average Precision: 0.8275862068965518	 Average Recall: 0.39942616343523646
K=11	Average Precision: 0.8181818181818181	 Average Recall: 0.431523824895975
K=12	Average Precision: 0.8103448275862069	 Average Recall: 0.4632116637954667
K=13	Average Precision: 0.8010610079575597	 Average Recall: 0.49229007439212685
K=14	Average Precision: 0.7807881773399015	 Average Recall: 0.5135820192750348


In [54]:
map_rel = max_avg_prec(prec_rel, queries, relevance)

In [55]:
print(f"MAP allowing relevance feedback: {map_rel}")

MAP allowing relevance feedback: 0.848416517934991


3. Allowing pseudo-relevance feedback - considering as relevant the top 10 documents retrieved

In [41]:
# Mean precision and recall over all queries (pseudo-relevance feedback) for cutoffs 5..14.
# A dedicated loop variable is used on purpose: looping over `k` would overwrite
# the global top-k cutoff `k` that the retrieval cells above rely on.
for cutoff in range(5, 15):
    print(f"K={cutoff}\tAverage Precision: {np.mean(prec_pseudo[cutoff])}\t Average Recall: {np.mean(rec_pseudo[cutoff])}")

K=5	Average Precision: 0.5310344827586208	 Average Recall: 0.12755112785498454
K=6	Average Precision: 0.5344827586206896	 Average Recall: 0.148454437690337
K=7	Average Precision: 0.5172413793103449	 Average Recall: 0.16565389279110007
K=8	Average Precision: 0.5129310344827587	 Average Recall: 0.18757374105678404
K=9	Average Precision: 0.5019157088122606	 Average Recall: 0.20422723440468876
K=10	Average Precision: 0.47931034482758617	 Average Recall: 0.21431363584067403
K=11	Average Precision: 0.47335423197492155	 Average Recall: 0.23456118450677207
K=12	Average Precision: 0.471264367816092	 Average Recall: 0.2541660393702476
K=13	Average Precision: 0.46949602122015915	 Average Recall: 0.2736465964035369
K=14	Average Precision: 0.4679802955665024	 Average Recall: 0.29401879622502075


In [56]:
map_pseudo = max_avg_prec(prec_pseudo, queries, relevance)

In [57]:
print(f"MAP allowing pseudo-relevance feedback: {map_pseudo}")

MAP allowing pseudo-relevance feedback: 0.4928729975303682
