### BM25 document indexing

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas.core.common import flatten
import numpy as np
import json
from tqdm import tqdm
import pickle
import os

### NLTK Preprocessing
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

"""
-----------------------------------------------------------------------------------------
Document indexes
-----------------------------------------------------------------------------------------
"""
def create_doc_index(corpus):
    """Map string IDs of the form ``"doc<position>"`` to the documents.

    Positions are the zero-based enumeration order of ``corpus``.
    """
    return {"doc" + str(position): document
            for position, document in enumerate(corpus)}

def create_doc_map(corpus):
    """Return a dict from zero-based position to the document at that position."""
    return dict(enumerate(corpus))

"""
Reverse dict lookup (find the key that maps to a given value)
"""
def get_key(dict, val):
    """Return the first key of ``dict`` whose value equals ``val``.

    Keys are tried in the mapping's iteration order. When no value matches,
    the string ``"key doesn't exist"`` is returned instead of raising.

    NOTE: the parameter name ``dict`` shadows the builtin; it is left as is
    because callers may pass it by keyword.
    """
    matching_keys = (key for key, value in dict.items() if val == value)
    return next(matching_keys, "key doesn't exist")

"""
-----------------------------------------------------------------------------------------
Term indexes
-----------------------------------------------------------------------------------------
"""
def create_vocab(corpus):
    """Return the unique words of ``corpus`` in order of first appearance.

    ``corpus`` is iterated as documents -> sentences -> words.

    De-duplication goes through a dict (insertion-ordered) instead of testing
    ``w not in vocab`` against a growing list, so the cost is linear in the
    number of tokens rather than tokens x vocabulary size.

    Raises:
        TypeError: if a word is not hashable.
    """
    all_words = (w for doc in corpus for sent in doc for w in sent)
    return list(dict.fromkeys(all_words))

def create_term_index(vocab):
    """Return a dict from each term to its position in ``vocab``.

    If a term occurs more than once, its last position is the one kept.
    """
    return {term: position for position, term in enumerate(vocab)}


"""
-----------------------------------------------------------------------------------------
Building TD matrix
-----------------------------------------------------------------------------------------
"""
def fast_td(corpus, vocab):
    """Build a term-document count matrix.

    Args:
        corpus: sequence of documents, each iterated as sentences -> words.
        vocab: sequence of terms; row ``i`` of the result counts ``vocab[i]``.

    Returns:
        A float ``numpy`` array of shape ``(len(vocab), len(corpus))`` whose
        entry ``[i, j]`` is the number of times term ``i`` occurs in
        document ``j``. Words that are not in ``vocab`` are ignored.
    """
    # Dict lookup replaces the per-token linear scan of the vocab list.
    # A term repeated in vocab resolves to its last position.
    term_map = {word: i for i, word in enumerate(vocab)}
    td_matrix = np.zeros((len(vocab), len(corpus)))

    # The column index comes straight from enumerate. Looking the document up
    # by value (as before) sent every duplicate document to the column of its
    # first occurrence and left the other columns at zero.
    for j, doc in enumerate(corpus):
        for sent in doc:
            for word in sent:
                i = term_map.get(word)
                if i is not None:
                    td_matrix[i, j] += 1

    return td_matrix

"""
-----------------------------------------------------------------------------------------
Building BM25 matrix
-----------------------------------------------------------------------------------------
"""
def create_doc_lengths(corpus):
    """Return one length per document, in corpus order.

    A document's length is the number of items left after flattening its
    nested structure with ``flatten``.
    """
    return [len(list(flatten(document))) for document in corpus]

def cal_L_avg(doc_lengths):
    """Return the arithmetic mean of ``doc_lengths``.

    An empty ``doc_lengths`` raises ``ZeroDivisionError``.
    """
    total_length = sum(doc_lengths)
    n_docs = len(doc_lengths)
    return total_length / n_docs


def bm25_matrix(corpus, k1, b):
    """Build a BM25-weighted term-document matrix for ``corpus``.

    Rows are terms (in ``create_vocab`` order) and columns are documents.
    Each entry is::

        tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
            * log10((N - n_t + 0.5) / (n_t + 0.5) + 1)

    where ``tf`` is the raw count of the term in the document, ``N`` the
    number of documents and ``n_t`` the number of documents containing the
    term.

    Args:
        corpus: sequence of documents (sentences -> words).
        k1: term-frequency saturation parameter.
        b: document-length normalisation parameter.

    Returns:
        A float array of shape ``(n_terms, n_docs)``.
    """
    vocab = create_vocab(corpus)
    doc_lengths = create_doc_lengths(corpus)
    total_docs = len(doc_lengths)
    L_avg = cal_L_avg(doc_lengths)
    td_array = fast_td(corpus, vocab)

    # Nothing to weight (no terms or no documents). Returning here also
    # avoids dividing by an average length of zero when every doc is empty.
    if td_array.size == 0:
        return np.zeros(td_array.shape)

    # One length-normalisation value per document, broadcast across all term
    # rows; this replaces the element-by-element Python double loop.
    length_norm = k1 * (1 - b + b * (np.asarray(doc_lengths) / L_avg))
    tf_weight = td_array * (k1 + 1) / (td_array + length_norm)

    # Document frequency: in how many documents each term occurs at least once.
    doc_freq = np.sum(td_array > 0, axis=1)
    term_idf = np.log10((total_docs - doc_freq + 0.5) / (doc_freq + 0.5) + 1)

    # Scale each term row by that term's IDF.
    return tf_weight * term_idf.reshape(-1, 1)

In [None]:
from native_preprocessor import native_prep
# Rebind the imported name to the object returned by calling it; from here on
# `native_prep` refers to that object, not to what was imported.
native_prep = native_prep()

def load_json_files():
    """Load ``cran_docs.json`` and preprocess the document bodies.

    The file is read from ``main_path + "cran_docs.json"``.
    NOTE(review): ``main_path`` is not defined in the visible part of this
    file; it must exist as a global before this function is called.

    Returns:
        A tuple ``(processedDocs, doc_ids)``: the output of
        ``native_prep.preprocessDocs`` on the ``"body"`` fields, and the
        ``"id"`` fields each reduced by one.
    """
    # The context manager closes the file handle, which was previously leaked.
    with open(main_path + "cran_docs.json", 'r') as docs_file:
        docs_json = json.load(docs_file)

    doc_ids = [item["id"] for item in docs_json]
    docs = [item["body"] for item in docs_json]

    # Process documents
    processedDocs = native_prep.preprocessDocs(docs)

    # Shift every id down by one (original note: "to resolve num1
    # discrepancy"); handleCustomQuery adds one back when printing.
    doc_ids = list(np.array(doc_ids) - 1)
    return processedDocs, doc_ids

# Load and preprocess the documents once at module level.
proc_docs, doc_ids = load_json_files()

def buildIndex(processedDocs, doc_ids):
    """Build the BM25 matrix and the vocabulary for the preprocessed documents.

    Args:
        processedDocs: preprocessed corpus (documents -> sentences -> words).
        doc_ids: not used; kept so existing callers keep working.

    Returns:
        A tuple ``(doc_bm25, doc_vocab)``: the BM25 matrix built with
        ``k1=1.5`` and ``b=0.75``, and the vocabulary list whose order matches
        the matrix rows.
    """
    # The previous version also built a doc index here that was never used.
    doc_vocab = create_vocab(processedDocs)
    doc_bm25 = bm25_matrix(processedDocs, 1.5, 0.75)
    return doc_bm25, doc_vocab

# Build the index once; `doc_bm` is read as a global by handleCustomQuery.
doc_bm, doc_vocab = buildIndex(proc_docs, doc_ids)

"""
-----------------------------------------------------------------------------------------
Ranking
-----------------------------------------------------------------------------------------
"""
def ranking(query_tf, bm25_matrix, k):
    """Return the indices of the ``k`` best-scoring documents for each query.

    Scores are the plain dot product ``query_tf.T @ bm25_matrix`` (no
    normalisation is applied), so ``query_tf`` is expected as
    ``(n_terms, n_queries)`` and ``bm25_matrix`` as ``(n_terms, n_docs)``.

    Args:
        query_tf: term-by-query matrix.
        bm25_matrix: term-by-document matrix.
        k: number of documents to return per query. If it exceeds the number
            of documents, all documents are returned.

    Returns:
        A list with one entry per query; each entry is a list of document
        column indices ordered from highest to lowest score.

    Raises:
        ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    score_matrix = query_tf.T @ bm25_matrix
    ascending = np.argsort(score_matrix, axis=1)

    # Reverse first, then take the leading k columns. Slicing with `-k:`
    # returns every column when k == 0, which gave all documents instead of
    # none.
    best_match = np.fliplr(ascending)[:, :k]
    return best_match.tolist()



### Query input

In [3]:
def query_proc(query_corp, doc_vocab):
    """Turn preprocessed queries into a term-by-query count matrix.

    The queries are counted against ``doc_vocab`` with ``fast_td``; the raw
    counts are returned as they are, with no IDF weighting applied.
    """
    return fast_td(query_corp, doc_vocab)

In [4]:
def handleCustomQuery(doc_vocab, k=4):
    """Read one query from standard input and print its best-matching documents.

    The query is preprocessed with ``native_prep.preprocessQueries``, counted
    against ``doc_vocab`` and ranked against the module-level ``doc_bm``
    matrix. The printed values are the ranked document column indices plus
    one, best match first.

    Args:
        doc_vocab: vocabulary list whose order matches the rows of ``doc_bm``.
        k: how many documents to print. Defaults to 4, the value that was
            previously hard-coded (the old docstring said "five").
    """
    # Get query
    print("Enter query below")
    query = input()

    # Process the query the same way the documents were processed.
    processedQuery = native_prep.preprocessQueries([query])
    proc_query = query_proc(processedQuery, doc_vocab)
    doc_IDs_ordered = ranking(proc_query, doc_bm, k)

    # Only one query was submitted, so only the first result row is used.
    print([doc + 1 for doc in doc_IDs_ordered[0]])
    

In [5]:
# Prompt for one query and print the ranked document indices (each plus one).
handleCustomQuery(doc_vocab)

Enter query below
[254, 125, 140, 120]
