Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added documentation #21

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 113 additions & 41 deletions rank_bm25.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python

from typing import List, Dict
import math
import numpy as np
from multiprocessing import Pool, cpu_count
Expand All @@ -14,45 +14,67 @@

class BM25:
def __init__(self, corpus, tokenizer=None):
self.corpus_size = 0
self.avgdl = 0
self.doc_freqs = []
self.idf = {}
self.doc_len = []
self.tokenizer = tokenizer
self.corpus_size = 0 # total number of document in corpus
self.avgdl = 0 # average length of a document in corpus
self.doc_freqs = [] # list of dictionaries of term_frequency of each document
self.idf = {} # idf score of each word in whole corpus
self.doc_len = [] # list of length of each document in corpus
self.tokenizer = tokenizer # user input tokenizer, defaults to none

if tokenizer:
corpus = self._tokenize_corpus(corpus)

nd = self._initialize(corpus)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is `nd` here? You should explain it.

self._calc_idf(nd)

def _initialize(self, corpus):
def _initialize(self, corpus) -> Dict:
"""
Args:
corpus (List): list of the documents

Returns:
dict: number of document that contains the word

Example:
corpus = [['ram', 'is', 'a', 'good', 'boy'], ['ram', 'does', 'cycling', 'and', 'racing'], ['ram', 'is', 'healthy'], ['rita', 'likes', 'shyam'], ['good', 'luck']]
nd = {'ram': 3, 'is': 2, 'a': 1, 'good': 2, 'boy': 1, 'does': 1, 'cycling': 1, 'and': 1, 'racing': 1, 'healthy': 1, 'rita': 1, 'likes': 1, 'shyam': 1, 'luck': 1}
Comment on lines +38 to +40

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shorten the examples so that I don't need to scroll. The functionality can be explained just as well using only two items in the list.


"""
nd = {} # word -> number of documents with word
num_doc = 0
num_words = 0 # total number of words in whole corpus........... changed num_doc to num_words
for document in corpus:
self.doc_len.append(len(document))
num_doc += len(document)
num_words += len(document) # total number of words in whole corpus

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The purpose of the variable `num_words` has already been explained.


frequencies = {}
term_frequencies = (
{}
) # term frequency of each word in a document........ changed frequencies to term_frequencies

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need to comment that you changed the name of a variable; Git keeps track of it.

for word in document:
if word not in frequencies:
frequencies[word] = 0
frequencies[word] += 1
self.doc_freqs.append(frequencies)

for word, freq in frequencies.items():
if word not in term_frequencies:
term_frequencies[word] = 0
Comment on lines +53 to +54

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This block of code can be removed by using defaultdict instead of the normal dictionary.

term_frequencies[word] += 1
self.doc_freqs.append(term_frequencies)

for word, freq in term_frequencies.items():
"""
increase the size of 'nd' for word present in document of corpus by 1 otherwise set it to 1.
"""
try:
nd[word]+=1
nd[word] += 1
except KeyError:
nd[word] = 1

self.corpus_size += 1
self.corpus_size += 1 # increases the size of corpus after each loop until the end of document in corpus

self.avgdl = num_doc / self.corpus_size
self.avgdl = num_words / self.corpus_size
return nd

def _tokenize_corpus(self, corpus):
    """Tokenize every document in the corpus with the user-supplied tokenizer.

    The documents are mapped over a pool of ``cpu_count()`` worker
    processes, so ``self.tokenizer`` has to be picklable.

    Args:
        corpus (List): list of documents

    Returns:
        List: ``self.tokenizer(document)`` for each document, in corpus order
    """
    # map() blocks until every result is available; leaving the ``with``
    # block then shuts the workers down instead of leaking them per call.
    with Pool(cpu_count()) as pool:
        return pool.map(self.tokenizer, corpus)
Expand All @@ -67,8 +89,19 @@ def get_batch_scores(self, query, doc_ids):
raise NotImplementedError()

def get_top_n(self, query, documents, n=5):
"""sorts the score of documents in descending order to find the most similar document in corpus

Args:
query (List): list of token
documents (List): list of documents
n (int, optional): Top n documents. Defaults to 5.

assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"
Returns:
List: list of most similar document in descending order
"""
assert self.corpus_size == len(
documents
), "The documents given don't match the index corpus!"

scores = self.get_scores(query)
top_n = np.argsort(scores)[::-1][:n]
Expand All @@ -77,15 +110,25 @@ def get_top_n(self, query, documents, n=5):

class BM25Okapi(BM25):
def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, epsilon=0.25):
    """Store the Okapi parameters and hand the corpus to the base class.

    Args:
        corpus (List): list of documents
        tokenizer (optional): user defined tokenizer. Defaults to None.
        k1 (float, optional): free parameter as k1 ∈ [ 1.2 , 2.0 ]. Defaults to 1.5.
        b (float, optional): free parameter. Defaults to 0.75.
        epsilon (float, optional): constant used for negative idf of document in corpus. Defaults to 0.25.
    """
    # The base-class constructor calls self._calc_idf, so the parameters
    # are assigned before delegating to it.
    self.k1, self.b, self.epsilon = k1, b, epsilon
    super().__init__(corpus, tokenizer)

def _calc_idf(self, nd):
"""
Calculates frequencies of terms in documents and in corpus.
"""Calculates frequencies of terms in documents and in corpus.
This algorithm sets a floor on the idf values to eps * average_idf

Args:
nd (Dict): word -> number of documents with word
"""
# collect idf sum to calculate an average idf for epsilon value
idf_sum = 0
Expand All @@ -105,19 +148,29 @@ def _calc_idf(self, nd):
self.idf[word] = eps

def get_scores(self, query):
    """Score every document in the corpus against a tokenized query.

    The ATIRE BM25 variant uses an idf function which uses a log(idf) score. To prevent negative idf scores,
    this algorithm also adds a floor to the idf value of epsilon.
    See [Trotman, A., X. Jia, M. Crane, Towards an Efficient and Effective Search Engine] for more info

    Args:
        query (list): tokens of the query

    Returns:
        numpy.ndarray: one accumulated score per document, in corpus order
    """
    scores = np.zeros(self.corpus_size)
    doc_len = np.array(self.doc_len)
    # The length normalisation depends only on the documents, not on the
    # query term, so it is computed once instead of once per term.
    length_norm = self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
    for q in query:
        # frequency of term q in each document (0 where the term is absent)
        q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
        # terms without an idf entry contribute nothing to the score
        scores += (self.idf.get(q) or 0) * (
            q_freq * (self.k1 + 1) / (q_freq + length_norm)
        )
    return scores

def get_batch_scores(self, query, doc_ids):
Expand All @@ -129,8 +182,11 @@ def get_batch_scores(self, query, doc_ids):
doc_len = np.array(self.doc_len)[doc_ids]
for q in query:
q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids])
score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) /
(q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)))
score += (self.idf.get(q) or 0) * (
q_freq
* (self.k1 + 1)
/ (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl))
)
return score.tolist()


Expand All @@ -153,8 +209,13 @@ def get_scores(self, query):
for q in query:
q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
ctd = q_freq / (1 - self.b + self.b * doc_len / self.avgdl)
score += (self.idf.get(q) or 0) * q_freq * (self.k1 + 1) * (ctd + self.delta) / \
(self.k1 + ctd + self.delta)
score += (
(self.idf.get(q) or 0)
* q_freq
* (self.k1 + 1)
* (ctd + self.delta)
/ (self.k1 + ctd + self.delta)
)
return score

def get_batch_scores(self, query, doc_ids):
Expand All @@ -167,8 +228,13 @@ def get_batch_scores(self, query, doc_ids):
for q in query:
q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids])
ctd = q_freq / (1 - self.b + self.b * doc_len / self.avgdl)
score += (self.idf.get(q) or 0) * q_freq * (self.k1 + 1) * (ctd + self.delta) / \
(self.k1 + ctd + self.delta)
score += (
(self.idf.get(q) or 0)
* q_freq
* (self.k1 + 1)
* (ctd + self.delta)
/ (self.k1 + ctd + self.delta)
)
return score.tolist()


Expand All @@ -190,8 +256,11 @@ def get_scores(self, query):
doc_len = np.array(self.doc_len)
for q in query:
q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
(self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
score += (self.idf.get(q) or 0) * (
self.delta
+ (q_freq * (self.k1 + 1))
/ (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq)
)
return score

def get_batch_scores(self, query, doc_ids):
Expand All @@ -203,8 +272,11 @@ def get_batch_scores(self, query, doc_ids):
doc_len = np.array(self.doc_len)[doc_ids]
for q in query:
q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids])
score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
(self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
score += (self.idf.get(q) or 0) * (
self.delta
+ (q_freq * (self.k1 + 1))
/ (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq)
)
return score.tolist()


Expand Down