In [1]:
## Importing necessary modules
# import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')
import string
import pickle
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords


## Function to preprocess sentence
def preprocess_sentence(sentence):
    lemmatizer = WordNetLemmatizer()
    porter = PorterStemmer()
    
    sentence = sentence.split(' ')
    
    for i, word in enumerate(sentence):
        word = word.lower()
        word = "".join([char for char in word if char not in string.punctuation])
        word = lemmatizer.lemmatize(word)
        word = porter.stem(word)
        sentence[i] = word
    
    sentence = [word for word in sentence if word not in stopwords.words('english')]
    sentence = " ".join(sentence)
    
    return sentence

In [2]:
## Importing book information (Title, ISBN, Description, etc.) extracted from Google BooksAPI
with open('book_list.pkl', 'rb') as input_file:
    book_list = pickle.load(input_file)
    
## Preprocessing description
desc_isbn = {}

for k, v in book_list.items():
    desc = preprocess_sentence(v['description'])
    desc_isbn[desc] = k

In [3]:
## Initializing a query
query_string = '''the 27th book in the jack reacher series. reacher goes after a killer but is unaware of the bigger implications.NEW YORK TIMES BESTSELLER • The gripping new Jack Reacher thriller from the #1 New York Times bestselling authors Lee Child and Andrew Child “No Plan B is not to be missed'''

In [4]:
# https://pypi.org/project/rank-bm25/

corpus = list(desc_isbn.keys())

tokenized_corpus = [doc.split(" ") for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)

query = preprocess_sentence(query_string)
tokenized_query = query.split(" ")

doc_scores = bm25.get_scores(tokenized_query)
max_index = np.argmax(doc_scores)
top_n = bm25.get_top_n(tokenized_query, corpus, n=5)

for i, idx in enumerate(top_n):
    print('Rank', i + 1, ":", book_list[desc_isbn[idx]].get('title'), '-', book_list[desc_isbn[idx]].get('author'), '-', desc_isbn[idx])

Rank 1 : NO PLAN B - Lee Child and Andrew Child - 9781984818577
Rank 2 : THE SECRET - Lee Child and Andrew Child - 9781984818584
Rank 3 : THE SECRET - Lee Child and Andrew Child - 9780593452806
Rank 4 : FIVE SURVIVE - Holly Jackson - 9780593374160
Rank 5 : CONSTRUCTION SITE ON CHRISTMAS NIGHT - Sherri Duskey Rinker. - 9781452139111
