In [1]:
import pandas as pd

## Ingestion

In [2]:
# Read every column as a string, then rename the `Unnamed: 0` column to `id`
# and replace the hyphens in the two category column names with underscores.
df = (
    pd.read_csv("../data/incidents_train.csv", index_col=False, dtype=str)
    .rename(
        columns={
            "Unnamed: 0": "id",
            "hazard-category": "hazard_category",
            "product-category": "product_category",
        }
    )
)

In [3]:
# Turn the frame into a list of per-row dicts, the form handed to index.fit().
documents = df.to_dict(orient='records')

In [4]:
import minsearch

In [5]:
# Declare which record fields the index treats as text fields and which as
# keyword fields; `id` is the only keyword field.
index = minsearch.Index(
    text_fields=['title', 'hazard_category', 'product_category', 'hazard', 'product'],
    keyword_fields=['id']
)

In [6]:
# Fit the index on the record dicts built from the training CSV.
index.fit(documents)

<minsearch.Index at 0x7341580669f0>

## Retrieval evaluation

In [7]:
# Load the tab-separated ground-truth file; each row pairs a document id with
# a question.
df_question = pd.read_csv('../data/ground-truth-retrieval.tsv', delimiter="\t")

In [8]:
# Preview the first rows of the question table.
df_question.head()

Unnamed: 0,id,question
0,1013,What specific batches of vacuum-packed Organic...
1,1013,Are there any reported cases of botulism assoc...
2,1013,How did the company's poor or insufficient con...
3,1013,Has the relevant regulatory agency investigate...
4,1013,What steps does The Engine Shed plan to take t...


In [9]:
# One dict per question row, with keys 'id' and 'question'.
ground_truth = df_question.to_dict(orient='records')

In [10]:
# Inspect a single ground-truth record.
ground_truth[0]

{'id': 1013,
 'question': 'What specific batches of vacuum-packed Organic Tofu were recalled by The Engine Shed?'}

In [11]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: one list per query, holding a boolean per returned
            result (in rank order) that is True where the result is the
            expected document.

    Returns:
        The share of lists containing at least one True, as a float.
        Returns 0.0 when `relevance_total` is empty.
    """
    if not relevance_total:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: one list per query, holding a boolean per returned
            result (in rank order) that is True where the result is the
            expected document.

    Returns:
        The average over all queries of 1 / (1-based rank of the first True),
        counting 0 for queries with no True. Returns 0.0 when
        `relevance_total` is empty.
    """
    if not relevance_total:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for position, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / position
                # Only the first relevant hit contributes; without this a
                # line with several hits could score above 1.
                break

    return total_score / len(relevance_total)

In [12]:
def minsearch_search(query, boost=None):
    """Return the top 10 results from the module-level `index` for `query`.

    Args:
        query: the text passed to `index.search`.
        boost: optional dict passed through as `boost_dict`. When omitted an
            empty dict is used, which is what this function always passed
            before the parameter existed.

    Returns:
        Whatever `index.search` returns for up to 10 results.
    """
    results = index.search(
        query=query,
        filter_dict={},
        boost_dict={} if boost is None else boost,
        num_results=10
    )

    return results

In [13]:
from tqdm.auto import tqdm

In [14]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth questions.

    Args:
        ground_truth: iterable of records, each with an 'id' key holding the
            expected document id.
        search_function: callable that takes one ground-truth record and
            returns result documents, each with an 'id' convertible by int().

    Returns:
        Dict with 'hit_rate' and 'mrr' computed over all records.
    """
    def _relevance_flags(record):
        # One boolean per returned result: does it match the expected id?
        expected_id = record['id']
        return [int(hit['id']) == expected_id for hit in search_function(record)]

    relevance_total = [_relevance_flags(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [15]:
# Check column dtypes and non-null counts of the question table.
df_question.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2335 non-null   int64 
 1   question  2335 non-null   object
dtypes: int64(1), object(1)
memory usage: 36.6+ KB


In [16]:
# Baseline: score the search with no boosts over every ground-truth question.
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/2335 [00:00<?, ?it/s]

{'hit_rate': 0.4055674518201285, 'mrr': 0.2652170218551375}

## Finding the best parameters

In [17]:
# Positional split: the first 100 question rows are used for tuning and the
# remaining rows form the other split.
df_validation = df_question.iloc[:100]
df_test = df_question.iloc[100:]

In [18]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximizes `objective_function`.

    Args:
        param_ranges: dict mapping a parameter name to a `(min_val, max_val)`
            pair. When both bounds are ints the value is drawn with
            `randint` (both ends inclusive); otherwise with `uniform`.
        objective_function: callable that takes the sampled parameter dict
            and returns a score; higher scores are better.
        n_iterations: number of random parameter sets to try.
        seed: optional seed. When given, sampling uses a private
            `random.Random(seed)` so the search is repeatable. When None the
            module-level `random` functions are used, as before.

    Returns:
        `(best_params, best_score)`. If no iteration runs, this is
        `(None, float('-inf'))`.
    """
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximizing: any real score beats this

    for _ in range(n_iterations):
        # Draw one random value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Keep the highest-scoring candidate seen so far.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [19]:
# Validation questions as record dicts, the form evaluate() iterates over.
gt_val = df_validation.to_dict(orient='records')

In [20]:
def minsearch_search(query, boost=None):
    """Query the module-level `index` and return up to 10 results.

    `boost` is forwarded as `boost_dict`; an empty dict is substituted when
    it is None. No filters are applied.
    """
    field_weights = {} if boost is None else boost
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=field_weights,
        num_results=10,
    )

In [21]:
# Search space for the boost of each text field. The bounds are floats, so
# simple_optimize draws each value with random.uniform between 0.0 and 3.0.
param_ranges = {
    'title': (0.0, 3.0),
    'hazard_category': (0.0, 3.0),
    'product_category': (0.0, 3.0),
    'hazard': (0.0, 3.0),
    'product': (0.0, 3.0)
}

def objective(boost_params):
    """Return the MRR on the validation records for one set of boosts.

    `boost_params` is passed to `minsearch_search` as the boost dict for
    every validation question.
    """
    scores = evaluate(
        gt_val,
        lambda q: minsearch_search(q['question'], boost_params),
    )
    return scores['mrr']

In [22]:
# Try 20 random boost combinations and show the best one with its MRR.
# NOTE(review): no random seed is set in the visible cells, so a re-run will
# likely report different parameters — confirm whether that is acceptable.
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'title': 2.3537326911368632,
  'hazard_category': 1.1245522318536554,
  'product_category': 0.08895652299467893,
  'hazard': 0.20249449449528223,
  'product': 1.6219511686909902},
 0.44101190476190477)

In [23]:
def minsearch_improved(query):
    """Search the index using the tuned boost weights.

    Args:
        query: the text to search for.

    Returns:
        The result of `minsearch_search(query, boost)`: up to 10 results with
        no filters applied.
    """
    # Best parameters reported by the random search above, truncated to two
    # decimals.
    boost = {
        'title': 2.35,
        'hazard_category': 1.12,
        'product_category': 0.08,
        'hazard': 0.20,
        'product': 1.62
    }

    # Reuse the shared helper rather than repeating the index.search call.
    return minsearch_search(query, boost)

# Score the tuned search over every ground-truth question.
# NOTE(review): `ground_truth` includes the 100 validation questions the
# boosts were tuned on, and `df_test` is never used in the visible cells —
# consider scoring on the held-out split instead.
evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/2335 [00:00<?, ?it/s]

{'hit_rate': 0.5087794432548179, 'mrr': 0.3745523605587848}