-
Notifications
You must be signed in to change notification settings - Fork 86
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
added documentation #21
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
#!/usr/bin/env python | ||
|
||
from typing import List, Dict | ||
import math | ||
import numpy as np | ||
from multiprocessing import Pool, cpu_count | ||
|
@@ -14,45 +14,67 @@ | |
|
||
class BM25: | ||
def __init__(self, corpus, tokenizer=None): | ||
self.corpus_size = 0 | ||
self.avgdl = 0 | ||
self.doc_freqs = [] | ||
self.idf = {} | ||
self.doc_len = [] | ||
self.tokenizer = tokenizer | ||
self.corpus_size = 0 # total number of document in corpus | ||
self.avgdl = 0 # average length of a document in corpus | ||
self.doc_freqs = [] # list of dictionaries of term_frequency of each document | ||
self.idf = {} # idf score of each word in whole corpus | ||
self.doc_len = [] # list of length of each document in corpus | ||
self.tokenizer = tokenizer # user input tokenizer, defaults to none | ||
|
||
if tokenizer: | ||
corpus = self._tokenize_corpus(corpus) | ||
|
||
nd = self._initialize(corpus) | ||
self._calc_idf(nd) | ||
|
||
def _initialize(self, corpus): | ||
def _initialize(self, corpus) -> Dict: | ||
""" | ||
Args: | ||
corpus (List): list of the documents | ||
|
||
Returns: | ||
dict: number of document that contains the word | ||
|
||
Example: | ||
corpus = [['ram', 'is', 'a', 'good', 'boy'], ['ram', 'does', 'cycling', 'and', 'racing'], ['ram', 'is', 'healthy'], ['rita', 'likes', 'shyam'], ['good', 'luck']] | ||
nd = {'ram': 3, 'is': 2, 'a': 1, 'good': 2, 'boy': 1, 'does': 1, 'cycling': 1, 'and': 1, 'racing': 1, 'healthy': 1, 'rita': 1, 'likes': 1, 'shyam': 1, 'luck': 1} | ||
Comment on lines
+38
to
+40
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shorten the examples so that I don't need to scroll. The functionality can also be explained only using 2 items in the list. |
||
|
||
""" | ||
nd = {} # word -> number of documents with word | ||
num_doc = 0 | ||
num_words = 0 # total number of words in whole corpus........... changed num_doc to num_words | ||
for document in corpus: | ||
self.doc_len.append(len(document)) | ||
num_doc += len(document) | ||
num_words += len(document) # total number of words in whole corpus | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The function of variable |
||
|
||
frequencies = {} | ||
term_frequencies = ( | ||
{} | ||
) # term frequency of each word in a document........ changed frequencies to term_frequencies | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You don't need to comment that you changed the name of variable. git keeps track of it. |
||
for word in document: | ||
if word not in frequencies: | ||
frequencies[word] = 0 | ||
frequencies[word] += 1 | ||
self.doc_freqs.append(frequencies) | ||
|
||
for word, freq in frequencies.items(): | ||
if word not in term_frequencies: | ||
term_frequencies[word] = 0 | ||
Comment on lines
+53
to
+54
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This block of code can be removed by using |
||
term_frequencies[word] += 1 | ||
self.doc_freqs.append(term_frequencies) | ||
|
||
for word, freq in term_frequencies.items(): | ||
""" | ||
increase the size of 'nd' for word present in document of corpus by 1 otherwise set it to 1. | ||
""" | ||
try: | ||
nd[word]+=1 | ||
nd[word] += 1 | ||
except KeyError: | ||
nd[word] = 1 | ||
|
||
self.corpus_size += 1 | ||
self.corpus_size += 1 # increases the size of corpus after each loop until the end of document in corpus | ||
|
||
self.avgdl = num_doc / self.corpus_size | ||
self.avgdl = num_words / self.corpus_size | ||
return nd | ||
|
||
def _tokenize_corpus(self, corpus): | ||
"""tokenizes the corpus according to user input tokenizer | ||
|
||
Args: | ||
corpus (List): list of documents | ||
""" | ||
pool = Pool(cpu_count()) | ||
tokenized_corpus = pool.map(self.tokenizer, corpus) | ||
return tokenized_corpus | ||
|
@@ -67,8 +89,19 @@ def get_batch_scores(self, query, doc_ids): | |
raise NotImplementedError() | ||
|
||
def get_top_n(self, query, documents, n=5): | ||
"""sorts the score of documents in descending order to find the most similar document in corpus | ||
|
||
Args: | ||
query (List): list of token | ||
documents (List): list of documents | ||
n (int, optional): Top n documents. Defaults to 5. | ||
|
||
assert self.corpus_size == len(documents), "The documents given don't match the index corpus!" | ||
Returns: | ||
List: list of most similar document in descending order | ||
""" | ||
assert self.corpus_size == len( | ||
documents | ||
), "The documents given don't match the index corpus!" | ||
|
||
scores = self.get_scores(query) | ||
top_n = np.argsort(scores)[::-1][:n] | ||
|
@@ -77,15 +110,25 @@ def get_top_n(self, query, documents, n=5): | |
|
||
class BM25Okapi(BM25): | ||
def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, epsilon=0.25): | ||
""" | ||
Args: | ||
corpus (List): list of documents | ||
tokenizer (optional): user defined tokenizer. Defaults to None. | ||
k1 (float, optional): free parameter as k1 ∈ [ 1.2 , 2.0 ]. Defaults to 1.5. | ||
b (float, optional): free parameter. Defaults to 0.75. | ||
epsilon (float, optional): constant used for negative idf of document in corpus. Defaults to 0.25. | ||
""" | ||
self.k1 = k1 | ||
self.b = b | ||
self.epsilon = epsilon | ||
super().__init__(corpus, tokenizer) | ||
|
||
def _calc_idf(self, nd): | ||
""" | ||
Calculates frequencies of terms in documents and in corpus. | ||
"""Calculates frequencies of terms in documents and in corpus. | ||
This algorithm sets a floor on the idf values to eps * average_idf | ||
|
||
Args: | ||
nd (Dict): word -> number of documents with word | ||
""" | ||
# collect idf sum to calculate an average idf for epsilon value | ||
idf_sum = 0 | ||
|
@@ -105,19 +148,29 @@ def _calc_idf(self, nd): | |
self.idf[word] = eps | ||
|
||
def get_scores(self, query): | ||
""" | ||
The ATIRE BM25 variant uses an idf function which uses a log(idf) score. To prevent negative idf scores, | ||
"""The ATIRE BM25 variant uses an idf function which uses a log(idf) score. To prevent negative idf scores, | ||
this algorithm also adds a floor to the idf value of epsilon. | ||
See [Trotman, A., X. Jia, M. Crane, Towards an Efficient and Effective Search Engine] for more info | ||
:param query: | ||
:return: | ||
|
||
Args: | ||
query (list): tokenized document | ||
|
||
Returns: | ||
Array: score of each token in a corpus | ||
""" | ||
score = np.zeros(self.corpus_size) | ||
score = np.zeros( | ||
self.corpus_size | ||
) # array of score of each token in a document in corpus | ||
doc_len = np.array(self.doc_len) | ||
for q in query: | ||
q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs]) | ||
score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) / | ||
(q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl))) | ||
q_freq = np.array( | ||
[(doc.get(q) or 0) for doc in self.doc_freqs] | ||
) # term freq of a term 'q' in a document | ||
score += (self.idf.get(q) or 0) * ( | ||
q_freq | ||
* (self.k1 + 1) | ||
/ (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)) | ||
) | ||
return score | ||
|
||
def get_batch_scores(self, query, doc_ids): | ||
|
@@ -129,8 +182,11 @@ def get_batch_scores(self, query, doc_ids): | |
doc_len = np.array(self.doc_len)[doc_ids] | ||
for q in query: | ||
q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids]) | ||
score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) / | ||
(q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl))) | ||
score += (self.idf.get(q) or 0) * ( | ||
q_freq | ||
* (self.k1 + 1) | ||
/ (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)) | ||
) | ||
return score.tolist() | ||
|
||
|
||
|
@@ -153,8 +209,13 @@ def get_scores(self, query): | |
for q in query: | ||
q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs]) | ||
ctd = q_freq / (1 - self.b + self.b * doc_len / self.avgdl) | ||
score += (self.idf.get(q) or 0) * q_freq * (self.k1 + 1) * (ctd + self.delta) / \ | ||
(self.k1 + ctd + self.delta) | ||
score += ( | ||
(self.idf.get(q) or 0) | ||
* q_freq | ||
* (self.k1 + 1) | ||
* (ctd + self.delta) | ||
/ (self.k1 + ctd + self.delta) | ||
) | ||
return score | ||
|
||
def get_batch_scores(self, query, doc_ids): | ||
|
@@ -167,8 +228,13 @@ def get_batch_scores(self, query, doc_ids): | |
for q in query: | ||
q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids]) | ||
ctd = q_freq / (1 - self.b + self.b * doc_len / self.avgdl) | ||
score += (self.idf.get(q) or 0) * q_freq * (self.k1 + 1) * (ctd + self.delta) / \ | ||
(self.k1 + ctd + self.delta) | ||
score += ( | ||
(self.idf.get(q) or 0) | ||
* q_freq | ||
* (self.k1 + 1) | ||
* (ctd + self.delta) | ||
/ (self.k1 + ctd + self.delta) | ||
) | ||
return score.tolist() | ||
|
||
|
||
|
@@ -190,8 +256,11 @@ def get_scores(self, query): | |
doc_len = np.array(self.doc_len) | ||
for q in query: | ||
q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs]) | ||
score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) / | ||
(self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq)) | ||
score += (self.idf.get(q) or 0) * ( | ||
self.delta | ||
+ (q_freq * (self.k1 + 1)) | ||
/ (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq) | ||
) | ||
return score | ||
|
||
def get_batch_scores(self, query, doc_ids): | ||
|
@@ -203,8 +272,11 @@ def get_batch_scores(self, query, doc_ids): | |
doc_len = np.array(self.doc_len)[doc_ids] | ||
for q in query: | ||
q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids]) | ||
score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) / | ||
(self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq)) | ||
score += (self.idf.get(q) or 0) * ( | ||
self.delta | ||
+ (q_freq * (self.k1 + 1)) | ||
/ (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq) | ||
) | ||
return score.tolist() | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is
nd
here? You should explain it