In [1]:
from tqdm.auto import tqdm
from glob import glob
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [3]:
ELASTIC_URL = 'http://localhost:9200' # 'http://localhost:9200'

In [4]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch(ELASTIC_URL)

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "author": {"type": "text"},
            "title": {"type": "text"},
            "text": {"type": "text"},
            "category": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    }
}

index_name = "book-reviews"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'book-reviews'})

In [5]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 158/158 [00:04<00:00, 34.13it/s]


In [6]:
documents[0]

{'id': 'B000FCJZ3G000',
 'author': 'T. Harv Eker',
 'title': 'Secrets of the Millionaire Mind: Mastering the Inner Game of Wealth',
 'category': 'bm',
 'text': 'This book is average at best.  A big commercial for his seminars, which are mentioned at least 100 times.  You never learn how to diagnose your own "money blueprint" because of course you have to pay to go to a seminar for that.  This was a big let down. Also there were no chapters so without natural breaks it was really hard to read.  There are a few decent lessons and highlights to take away, but much less useful than most other highly rated money books.'}

In [7]:
def elastic_search(query, category):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["text^3", "title^2", "author^2"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "category": category
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [8]:
elastic_search(
    query = "What books were written by Benjamin Hardy?",
    category = 'bm' # 'Business & Money'
)

[{'id': 'B09BTV53VD131',
  'author': 'Benjamin P. Hardy',
  'title': 'Be Your Future Self Now: The Science of Intentional Transformation',
  'category': 'bm',
  'text': "[[VIDEOID:b0af83c60a89ba48220269c6223326c2]] Have read every one of Dr. Hardy's books, this one ranks at the very top.  Have read over 1000 books in the productivity / self dev / business entrepreneur world, and this is probably in my top 5.  We all want to get to our goals and outcomes faster.  Most of us including me fall short.  If you follow what Dr. Hardy says in this gem, you will get to your goals and outcomes with a whole lot more certainty, faster than you ever thought was possible.  I read this five weeks ago, did everything Dr. Hardy told me to do in the book including the last step in the conclusion, and have never come close to having more gigantic opportunities come into my life because I was acting like my future self (5 years from now - even more energetic, happy, fulfilled, successful, fit) which allow

In [10]:
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [11]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [12]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_search(query=q['question'], category=q['category'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 160/160 [00:00<00:00, 189.24it/s]


In [13]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [14]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

- hit-rate (recall)
- Mean Reciprocal Rank (mrr)

In [15]:
hit_rate(relevance_total), mrr(relevance_total)

(0.79375, 0.6626041666666665)

In [16]:


DATA_PATH = '../data/'

def fetch_documents(data_path=DATA_PATH):
    print("Fetching documents...")
    df = pd.DataFrame()
    data_path = data_path.rstrip('/') # prevent //
    qna_files = sorted(glob(data_path+'/book-reviews-*.csv'))
    for file_name in qna_files:
        df_ = pd.read_csv(file_name) #, index_col=False)
        print(f' adding {file_name}: {df_.shape[0]} record(s)')
        df = pd.concat([df, df_]) #,ignore_index=True)

    documents = df.to_dict(orient="records")
    print(f" Fetched {len(documents)} document(s)")
    return documents

documents = fetch_documents('../data')

Fetching documents...
 adding ../data/book-reviews-bm.csv: 158 record(s)
 adding ../data/book-reviews-hfd.csv: 35 record(s)
 adding ../data/book-reviews-sh.csv: 17 record(s)
 adding ../data/book-reviews-sm.csv: 51 record(s)
 Fetched 261 document(s)


In [17]:
import minsearch

index = minsearch.Index(
    text_fields=[
            "author",
            "title",
            "text",
            "category",
        ],
    keyword_fields=["id"]
)

index.fit(documents)

<minsearch.Index at 0x7dd44c25d3a0>

In [18]:
def minsearch_search(query, category, boost=None):
    # boost = {'text': 3.0, 'title': 2.0, 'author': 2.0}
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={'category': category},
        boost_dict=boost,
        num_results=5
    )

    return results

In [19]:
minsearch_search(
    query = "What books were written by Benjamin Hardy?",
    category = 'Business & Money'
)[0:2]

minsearch_search(
    query = "What books were written by Steve Brusatte?",
    category = 'Science & Math'
)[0:2]

minsearch_search(
    query = "What books were written by Michael Matthews?",
    category = 'Health, Fitness & Dieting'
)[0:2]

minsearch_search(
    query = "What books were written by Steven Pressfield?",
    category = 'Self-Help'
)[0:2]

[{'id': 'B007A4SDCG001',
  'parent_asin': 'B007A4SDCG',
  'author': 'Steven Pressfield',
  'title': 'The War of Art',
  'category': 'sh',
  'publication_year': 2011,
  'rating': 5.0,
  'helpful_vote': 44,
  'text': 'Published in 2002, The War of Art is about breaking through blocks that hold you back and disciplining yourself to do important work and pursue your calling. This book is aimed at creative artists, especially writers. But it contains lessons for anyone pursuing growth in any area who feels blocked by self-sabotaging behavior or failure to act.  The premise is that there are hidden forces working against us, leading to inaction, distraction, procrastination, complacency, fear and self-destructive behavior. The author calls these forces "resistance." They exist within us as negative tendencies or programming in our subconscious. The resistance may also be outside us, as in associations with the wrong people, or addiction to distractions (which today includes social media).  R

In [20]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], category=q['category'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 160/160 [00:00<00:00, 290.75it/s]


In [21]:
hit_rate(relevance_total), mrr(relevance_total)

(0.71875, 0.5032291666666667)

Compare with ES results:
```
(0.79375, 0.6626041666666665)
```

In [23]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [24]:
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['category']))

100%|██████████| 160/160 [00:00<00:00, 241.12it/s]


{'hit_rate': 0.79375, 'mrr': 0.6626041666666665}

In [25]:
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['category']))

100%|██████████| 160/160 [00:00<00:00, 293.63it/s]


{'hit_rate': 0.71875, 'mrr': 0.5032291666666667}

## Finding the best parameters

In [27]:
n_gt = df_ground_truth.shape[0]
df_validation = df_ground_truth[:n_gt//2]
df_test = df_ground_truth[n_gt//2:]

In [28]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [29]:
gt_val = df_validation.to_dict(orient='records')
gt_test = df_test.to_dict(orient='records')

In [30]:
param_ranges = {
    'author': (0.0, 3.0),
    'title': (0.0, 3.0),
    'text': (0.0, 3.0),
}

CATEGORY = 'bm' # 'Business & Money'

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], CATEGORY, boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [31]:
simple_optimize(param_ranges, objective, n_iterations=20)

100%|██████████| 80/80 [00:00<00:00, 296.66it/s]
100%|██████████| 80/80 [00:00<00:00, 285.31it/s]
100%|██████████| 80/80 [00:00<00:00, 301.05it/s]
100%|██████████| 80/80 [00:00<00:00, 291.66it/s]
100%|██████████| 80/80 [00:00<00:00, 293.39it/s]
100%|██████████| 80/80 [00:00<00:00, 263.08it/s]
100%|██████████| 80/80 [00:00<00:00, 302.10it/s]
100%|██████████| 80/80 [00:00<00:00, 298.72it/s]
100%|██████████| 80/80 [00:00<00:00, 275.19it/s]
100%|██████████| 80/80 [00:00<00:00, 299.74it/s]
100%|██████████| 80/80 [00:00<00:00, 299.75it/s]
100%|██████████| 80/80 [00:00<00:00, 301.00it/s]
100%|██████████| 80/80 [00:00<00:00, 300.41it/s]
100%|██████████| 80/80 [00:00<00:00, 302.60it/s]
100%|██████████| 80/80 [00:00<00:00, 301.02it/s]
100%|██████████| 80/80 [00:00<00:00, 295.86it/s]
100%|██████████| 80/80 [00:00<00:00, 290.79it/s]
100%|██████████| 80/80 [00:00<00:00, 295.45it/s]
100%|██████████| 80/80 [00:00<00:00, 300.86it/s]
100%|██████████| 80/80 [00:00<00:00, 286.98it/s]


({'author': 1.2235875785555366,
  'title': 0.16262322287733988,
  'text': 2.841469158263197},
 0.6364583333333335)

In [32]:
def minsearch_improved(query, category=CATEGORY):
    boost = {
        'author': 1.22,
        'title': 0.16,
        'text': 2.84,
    }

    results = index.search(
        query=query,
        filter_dict={'category': category},
        boost_dict=boost,
        num_results=5
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

100%|██████████| 160/160 [00:00<00:00, 291.47it/s]


{'hit_rate': 0.85625, 'mrr': 0.6561458333333332}

In [33]:
def minsearch_improved(query, category=CATEGORY):
    boost = {
        'author': 1.87,
        'title': 0.26,
        'text': 1.65,
    }

    results = index.search(
        query=query,
        filter_dict={'category': category},
        boost_dict=boost,
        num_results=5
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

100%|██████████| 160/160 [00:00<00:00, 295.49it/s]


{'hit_rate': 0.8625, 'mrr': 0.6561458333333332}

before optimization: (0.8, 0.5859374999999999)

Compare with ES results:
```
(0.79375, 0.6643749999999999)
```

In [34]:
evaluate(gt_test, lambda q: minsearch_improved(q['question']))

100%|██████████| 80/80 [00:00<00:00, 269.53it/s]


{'hit_rate': 0.8625, 'mrr': 0.6814583333333335}

In [35]:
def elastic_search(query, category, boost=None):
    # boost = {'text': 3, 'title': 2, 'author': 2}
    if boost is None:
        boost_te = ''
        boost_au = ''
        boost_ti = ''
    else:
        boost_te = f"^{boost['text']}" if boost['text'] else ''
        boost_au = f"^{boost['author']}" if boost['author'] else ''
        boost_ti = f"^{boost['title']}" if boost['title'] else ''
    
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": [f"text{boost_te}", f"title{boost_ti}", f"author{boost_au}"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "category": category
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [36]:
elastic_search(
    query = "What books were written by Benjamin Hardy?",
    category = 'bm' # 'Business & Money'
)

[{'id': 'B09BTV53VD131',
  'author': 'Benjamin P. Hardy',
  'title': 'Be Your Future Self Now: The Science of Intentional Transformation',
  'category': 'bm',
  'text': "[[VIDEOID:b0af83c60a89ba48220269c6223326c2]] Have read every one of Dr. Hardy's books, this one ranks at the very top.  Have read over 1000 books in the productivity / self dev / business entrepreneur world, and this is probably in my top 5.  We all want to get to our goals and outcomes faster.  Most of us including me fall short.  If you follow what Dr. Hardy says in this gem, you will get to your goals and outcomes with a whole lot more certainty, faster than you ever thought was possible.  I read this five weeks ago, did everything Dr. Hardy told me to do in the book including the last step in the conclusion, and have never come close to having more gigantic opportunities come into my life because I was acting like my future self (5 years from now - even more energetic, happy, fulfilled, successful, fit) which allow

In [44]:
param_ranges = {
    'author': (0, 10),
    'title': (0, 10),
    'text': (0, 10),
}

CATEGORY = 'bm' # 'Business & Money'

def objective(boost_params):
    def search_function(q):
        return elastic_search(q['question'], CATEGORY, boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [38]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [48]:
simple_optimize(param_ranges, objective, n_iterations=20)

100%|██████████| 80/80 [00:00<00:00, 505.82it/s]
100%|██████████| 80/80 [00:00<00:00, 460.71it/s]
100%|██████████| 80/80 [00:00<00:00, 459.02it/s]
100%|██████████| 80/80 [00:00<00:00, 524.86it/s]
100%|██████████| 80/80 [00:00<00:00, 526.60it/s]
100%|██████████| 80/80 [00:00<00:00, 534.82it/s]
100%|██████████| 80/80 [00:00<00:00, 482.30it/s]
100%|██████████| 80/80 [00:00<00:00, 479.88it/s]
100%|██████████| 80/80 [00:00<00:00, 505.12it/s]
100%|██████████| 80/80 [00:00<00:00, 477.06it/s]
100%|██████████| 80/80 [00:00<00:00, 515.30it/s]
100%|██████████| 80/80 [00:00<00:00, 507.54it/s]
100%|██████████| 80/80 [00:00<00:00, 495.84it/s]
100%|██████████| 80/80 [00:00<00:00, 487.67it/s]
100%|██████████| 80/80 [00:00<00:00, 464.95it/s]
100%|██████████| 80/80 [00:00<00:00, 512.92it/s]
100%|██████████| 80/80 [00:00<00:00, 540.90it/s]
100%|██████████| 80/80 [00:00<00:00, 541.06it/s]
100%|██████████| 80/80 [00:00<00:00, 537.49it/s]
100%|██████████| 80/80 [00:00<00:00, 530.80it/s]


({'author': 4, 'title': 6, 'text': 6}, 0.6212500000000001)

In [50]:
def elastic_search_improved(query, category): #, boost=None):
    # boost = {'text': 3, 'title': 2, 'author': 2}
    # if boost is None:
    #     boost_te = ''
    #     boost_au = ''
    #     boost_ti = ''
    # else:
    #     boost_te = f"^{boost['text']}" if boost['text'] else ''
    #     boost_au = f"^{boost['author']}" if boost['author'] else ''
    #     boost_ti = f"^{boost['title']}" if boost['title'] else ''
    
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "fields": [f"text{boost_te}", f"title{boost_ti}", f"author{boost_au}"],
                        "fields": [f"text^6", f"title^6", f"author^4"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "category": category
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [51]:
evaluate(gt_test, lambda q: elastic_search_improved(q['question'], CATEGORY))

100%|██████████| 80/80 [00:00<00:00, 530.17it/s]


{'hit_rate': 0.825, 'mrr': 0.72}

without boost

(0.79375, 0.6626041666666665)