In [1]:
from load_data import LoadDataset 
from vsm import VectorSpaceModel


## Load the dataset
Load the document collection, the queries, and the relevance judgments (the documents marked as relevant for each query).

In [2]:
# Paths to the three MED files; going by their names these hold the collection,
# the queries and the relevance data.
dataset = LoadDataset(
    "./med_data/MED.ALL",
    "./med_data/MED.QRY",
    "./med_data/MED.REL",
)

# The loaders run in the same order as before: documents, queries, relevance.
docs, queries, relevance = (
    dataset.load_docs(),
    dataset.load_queries(),
    dataset.load_relevance(),
)

# Vector Space Model ranked retrieval

Create the `VectorSpaceModel` object (`vsm`) from the loaded documents.

In [3]:
# Build the model from the loaded documents. Later cells read vsm.n_docs,
# vsm.n_terms and vsm.index from this object.
vsm = VectorSpaceModel(docs)

In [4]:
# Report the size of the collection: document count and vocabulary size.
n_docs, n_terms = vsm.n_docs, vsm.n_terms
print("Number of documents in the collection: ", n_docs)
print("Number of terms in the collection: ", n_terms)

Number of documents in the collection:  1032
Number of terms in the collection:  13391


In [5]:
print("The inverted index of the collection: first 10 terms and first 10 documents per term")
# Preview only: ten terms, and at most ten document ids for each of them.
index_preview = list(vsm.index.items())[:10]
for term, postings in index_preview:
    doc_sample = list(postings)[:10]
    print("Term: ", term, "\tDocs: ", doc_sample, "\n")

The inverted index of the collection: first 10 terms and first 10 documents per term
Term:  correlation 	Docs:  [0, 385, 904, 394, 911, 147, 148, 153, 28, 291] 

Term:  between 	Docs:  [0, 1, 521, 527, 528, 535, 28, 31, 34, 43] 

Term:  maternal 	Docs:  [0, 4, 5, 11, 303, 706, 324, 326, 328, 331] 

Term:  and 	Docs:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 

Term:  fetal 	Docs:  [0, 1, 2, 3, 4, 5, 904, 11, 277, 936] 

Term:  plasma 	Docs:  [0, 1024, 4, 5, 516, 1031, 397, 273, 147, 149] 

Term:  levels 	Docs:  [0, 1, 1024, 3, 516, 9, 523, 525, 546, 37] 

Term:  of 	Docs:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 

Term:  glucose 	Docs:  [0, 640, 4, 518, 9, 146, 413, 297, 564, 181] 

Term:  free 	Docs:  [0, 385, 3, 4, 393, 11, 652, 275, 659, 281] 



Represent the documents as vectors (one row per document, one column per term).

In [6]:
# Vector representation of the whole collection; its shape is inspected in the next cell.
docs_vector = vsm.docs_as_vectors()

In [7]:
# Sanity check: the recorded shape (1032, 13391) matches the document and term counts printed earlier.
docs_vector.shape

(1032, 13391)

Represent a query as a vector in the same term space as the documents.

In [8]:
# Use the query stored under key 1 as the running example for the rest of the notebook.
example_query = queries[1]
print("Example query: ", example_query)

Example query:   the crystalline lens in vertebrates including humans



In [9]:
# Turn the raw query string into a vector; its shape is inspected in the next cell.
query_vector = vsm.query_as_vector(example_query)

In [10]:
# Sanity check: the recorded shape (13391,) equals the number of terms, i.e. one entry per term.
query_vector.shape

(13391,)

Compute the relevance score of each document as the cosine similarity between the query vector and the document vector.

In [11]:
# Score the documents against the example query; the result is iterated with .items() below.
# NOTE(review): the recorded output shows the query text echoed, so this call appears to print it — confirm in vsm.py.
scores = vsm.relevance_scores(example_query)

 the crystalline lens in vertebrates including humans



In [12]:
print("The relevance scores for the given query, for each document:")
# Preview the first ten (doc id, score) pairs in the mapping's own order;
# these are not the ten best-scoring documents.
score_preview = list(scores.items())[:10]
for doc_id, score in score_preview:
    print("DocID: ", doc_id, "\tScore: ", score, "\n")

The relevance scores for the given query, for each document:
DocID:  0 	Score:  0.0014032417966615535 

DocID:  1 	Score:  0.00219209320430894 

DocID:  2 	Score:  0.0028177182779833415 

DocID:  3 	Score:  0.002592207814362385 

DocID:  4 	Score:  0.0023028674625397306 

DocID:  5 	Score:  0.004649004635595275 

DocID:  6 	Score:  0.003942636439281871 

DocID:  7 	Score:  0.0032190028663789693 

DocID:  8 	Score:  0.00013795678697045409 

DocID:  9 	Score:  0.002920021604681169 



Return the top 10 documents given the query.

In [13]:
# Number of documents to retrieve; reused by the feedback cells further down.
k = 10
# Ranked retrieval for the raw query string; the result is iterated with .items() in the next cell.
# NOTE(review): the recorded output shows the query text echoed during this call as well.
top10 = vsm.vector_space_model(example_query, k)

 the crystalline lens in vertebrates including humans



In [14]:
print(f"The relevance scores for the top {k} documents:")
# Walk the retrieved documents in the order the model returned them.
for doc_id, score in top10.items():
    print("DocID: ", doc_id, "\tScore: ", score, "\n")

The relevance scores for the top 10 documents:
DocID:  71 	Score:  0.3180694357629488 

DocID:  499 	Score:  0.22594730142540573 

DocID:  170 	Score:  0.1842523224171873 

DocID:  14 	Score:  0.1822016883555637 

DocID:  165 	Score:  0.1778324974124219 

DocID:  512 	Score:  0.17184035889935304 

DocID:  180 	Score:  0.15550121589189234 

DocID:  510 	Score:  0.14467894034360163 

DocID:  166 	Score:  0.14275038678813573 

DocID:  211 	Score:  0.14083698844959833 



### Relevance Feedback

In [15]:
# Relevance judgments for the example query, looked up with the same key (1) used for queries[1].
# The recorded output shows a list of integer document ids.
example_relevant = relevance[1]
print("Relevant documents for the example query: ", example_relevant)

Relevant documents for the example query:  [13, 14, 15, 72, 79, 138, 142, 164, 165, 166, 167, 168, 169, 170, 171, 172, 180, 181, 182, 183, 184, 185, 186, 211, 212, 499, 500, 501, 502, 503, 504, 506, 507, 508, 510, 511, 513]


In [16]:
# Every document index that is not listed as relevant is treated as non-relevant.
# Membership is tested against a set so each lookup is O(1) instead of a scan
# of the relevant-id list for every document in the collection.
# NOTE(review): possible off-by-one. range() yields 0-based indices, while the ids in
# `example_relevant` may be 1-based: every top-10 id from the plain retrieval above, plus one,
# appears in the relevant list, but 71 and 512 themselves do not. Confirm how
# load_relevance() and relevance_feedback_rocchio() number documents.
relevant_ids = set(example_relevant)
example_non_relevant = [i for i in range(vsm.n_docs) if i not in relevant_ids]
print("20 of the non relevant documents for the example query: ", example_non_relevant[:20])

20 of the non relevant documents for the example query:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 21, 22]


In [17]:
# Show the raw query string; the repr in the recorded output reveals a leading space and a trailing newline.
example_query

' the crystalline lens in vertebrates including humans\n'

In [18]:
# Rocchio relevance feedback using the judged relevant / non-relevant documents.
# NOTE(review): alpha, beta and gamma are presumably the usual Rocchio weights for the
# original query, the relevant documents and the non-relevant documents — confirm in vsm.py.
opt_query = vsm.relevance_feedback_rocchio(
    example_query,
    example_relevant,
    example_non_relevant,
    alpha=1,
    beta=0.75,
    gamma=0.15,
)

In [19]:
# Inspect the feedback-adjusted query; the recorded output is a numeric array, not a text query.
opt_query

array([0.11987246, 0.22220317, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [20]:
# Re-run retrieval with the Rocchio-adjusted query. Here vector_space_model receives the
# query array rather than the raw string passed in the earlier call; k comes from the cell that set k = 10.
top10_rel = vsm.vector_space_model(opt_query, k)

In [21]:
print(f"The relevance scores for the top {k} documents retrieved using relevance feedback:")
# Walk the retrieved documents in the order the model returned them.
for doc_id, score in top10_rel.items():
    print("DocID: ", doc_id, "\tScore: ", score, "\n")

The relevance scores for the top 10 documents retrieved using relevance feedback:
DocID:  164 	Score:  0.4539546587351573 

DocID:  182 	Score:  0.4465463177324617 

DocID:  510 	Score:  0.4216986241309285 

DocID:  165 	Score:  0.4118433502552803 

DocID:  508 	Score:  0.3808872079532394 

DocID:  14 	Score:  0.3806552548864776 

DocID:  211 	Score:  0.36540995700060924 

DocID:  499 	Score:  0.35611341099109584 

DocID:  180 	Score:  0.3426052596002855 

DocID:  170 	Score:  0.34015780315783806 



### Pseudo-Relevance Feedback

In [22]:
# Build a feedback-adjusted query without any supplied relevance judgments: only the query and k are passed.
# NOTE(review): k presumably sets how many top-ranked documents are assumed relevant — confirm in vsm.py.
# The recorded output shows the query text echoed during this call.
opt_query_pseudo = vsm.pseudo_relevance_feedback(example_query, k)

 the crystalline lens in vertebrates including humans



In [23]:
# Inspect the pseudo-feedback query; the recorded output is a numeric array.
opt_query_pseudo

array([0.        , 0.26362056, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [24]:
# Re-run retrieval with the pseudo-feedback query array; k comes from the cell that set k = 10.
top10_pseudo = vsm.vector_space_model(opt_query_pseudo, k)

In [25]:
# These results come from pseudo-relevance feedback, so the header says so; the previous
# text was copied from the relevance-feedback cell and mislabelled the output.
print(f"The relevance scores for the top {k} documents retrieved using pseudo-relevance feedback:")
# Iterate the mapping directly; wrapping .items() in list() only made a needless copy.
for doc_id, score in top10_pseudo.items():
    print("DocID: ", doc_id, "\tScore: ", score, "\n")

The relevance scores for the top 10 documents retrieved using relevance feedback:
DocID:  14 	Score:  0.5502137218779055 

DocID:  211 	Score:  0.534970201061323 

DocID:  165 	Score:  0.5273389363410856 

DocID:  512 	Score:  0.5021942950561238 

DocID:  510 	Score:  0.4617498452246454 

DocID:  170 	Score:  0.4358152703121938 

DocID:  499 	Score:  0.433676570996206 

DocID:  180 	Score:  0.41642546220907606 

DocID:  71 	Score:  0.402139798750616 

DocID:  166 	Score:  0.38839310134669797 



### Performance evaluation

In [None]:
from evaluation import precision, recall