In [42]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-08-30 16:09:32--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py.1’


2024-08-30 16:09:32 (44.5 MB/s) - ‘minsearch.py.1’ saved [3832/3832]



In [43]:
import pandas as pd

## Ingestion

In [44]:
# Load the incident reports; dtype=str reads every column as a string.
df = pd.read_csv("../data/incidents_train.csv", index_col=False, dtype=str)
# Name the unnamed first column "id" and replace the hyphens in the two category
# column names with underscores, matching the field names used by the index
# and the prompt template further down.
df = df.rename(columns={"Unnamed: 0": "id", "hazard-category": "hazard_category", "product-category": "product_category"})

In [45]:
documents = df.to_dict(orient='records')

In [64]:
# Keep only the documents for which ground truth has been created.
# NOTE(review): this is a positional slice, so it presumably relies on list
# position matching the document id (1013..1478) - confirm against the data.
# Re-running this cell slices the already-sliced list again, so rebuild
# `documents` from `df` before executing it a second time.
documents = documents[1013:1479]

In [46]:
import minsearch

In [66]:
# Build the search index over the short descriptive fields only; the two
# commented-out lines are wider field sets that were tried earlier.
# `id` is registered as a keyword field (presumably for exact-match filtering
# in minsearch - confirm against the library).
index = minsearch.Index(
    #text_fields=['id', 'year', 'month', 'day', 'country', 'title', 'text', 'hazard_category', 'product_category', 'hazard', 'product'],
    #text_fields=['country', 'title', 'text', 'hazard_category', 'product_category', 'hazard', 'product'],
    text_fields=['title', 'hazard_category', 'product_category', 'hazard', 'product'],
    keyword_fields=['id']
)

In [67]:
index.fit(documents)

<minsearch.Index at 0x7f30dc046e30>

## RAG flow

In [68]:
from openai import OpenAI

# Point the OpenAI client at an OpenAI-compatible endpoint on localhost:11434.
# NOTE(review): presumably a local Ollama server, with 'ollama' as a placeholder
# API key rather than a real secret - confirm.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [69]:
def search(query):
    """Return the top 10 index matches for ``query`` with no filters or boosts."""
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [70]:
# Outer prompt: instructions, the user's question, then the retrieved context.
prompt_template = """
You're a food hazard detection assistant. Answer the QUESTION based on the CONTEXT from the food-incident reports.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()


# One context entry per retrieved document, filled from the document's fields.
entry_template = """
 'title': {title}
 'hazard_category': {hazard_category}
 'product_category': {product_category}
 'hazard': {hazard}
 'product': {product}
""".strip()


def build_prompt(query, search_results):
    """Render the final LLM prompt for ``query``.

    Each document in ``search_results`` is formatted with ``entry_template``
    and followed by a blank line; the concatenation becomes the CONTEXT
    section of ``prompt_template``. Surrounding whitespace is stripped from
    the finished prompt.
    """
    rendered_entries = [
        entry_template.format(**doc) + "\n\n"
        for doc in search_results
    ]
    filled = prompt_template.format(question=query, context="".join(rendered_entries))
    return filled.strip()

In [110]:
def llm(prompt, model='llama3.1', temperature=0.0):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send to the chat model.
        model: Name of the model served by the configured client.
        temperature: Sampling temperature. The OpenAI chat-completions API
            documents a valid range of 0 to 2, so the default is 0.0 rather
            than a negative value.

    Returns:
        The content string of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )

    return response.choices[0].message.content

In [54]:
def rag(query, model='llama3.1'):
    """Answer ``query``: retrieve documents, build the prompt, ask the LLM."""
    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs), model=model)

In [72]:
question = 'What are the products that contain listeria monocytogenes?'
answer = rag(question)
print(answer)

Based on the provided context, there are no specific reports of listeria monocytogenes being detected in "Double Cream". However, there is a mention of a recall for other dairy products.

There are several mentions of listeria monocytogenes with different product categories:

* Chicken based products (4 separate recalls)
* Precooked beef meat products
* Pig meat - pork
* Raw goat milk cheese

Unfortunately, none of the provided reports include specific information about products that contain listeria monocytogenes.


## Retrieval evaluation

In [73]:
df_question = pd.read_csv('../data/ground-truth-retrieval.tsv', delimiter="\t")

In [74]:
df_question.head()

Unnamed: 0,id,question
0,1013,What specific batches of vacuum-packed Organic...
1,1013,Are there any reported cases of botulism assoc...
2,1013,How did the company's poor or insufficient con...
3,1013,Has the relevant regulatory agency investigate...
4,1013,What steps does The Engine Shed plan to take t...


In [75]:
ground_truth = df_question.to_dict(orient='records')

In [76]:
ground_truth[0]

{'id': 1013,
 'question': 'What specific batches of vacuum-packed Organic Tofu were recalled by The Engine Shed?'}

In [77]:
def hit_rate(relevance_total):
    """Fraction of queries whose relevance list contains at least one True.

    ``relevance_total`` holds one list of relevance flags per query.
    """
    queries_with_hit = sum(1 for flags in relevance_total if True in flags)
    return queries_with_hit / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank over all queries.

    ``relevance_total`` holds one list of relevance flags per query, ordered
    by rank (best first). Each query contributes 1 / rank of its FIRST
    relevant result, or 0 when it has none.

    Returns:
        The average contribution across queries, always within [0, 1].
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this a query
                # with several matches would add more than one reciprocal rank.
                break

    return total_score / len(relevance_total)

In [78]:
def minsearch_search(query):
    """Baseline retrieval: top 10 index matches for ``query`` with no boosts."""
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [79]:
from tqdm.auto import tqdm

In [80]:
def evaluate(ground_truth, search_function):
    """Score ``search_function`` against ``ground_truth``.

    For every ground-truth record the search function is called with the
    whole record, and each returned document is flagged as relevant when its
    ``id`` (converted to int) equals the record's ``id``.

    Returns:
        A dict with the ``hit_rate`` and ``mrr`` of the collected flags.
    """
    per_query_flags = []

    for record in tqdm(ground_truth):
        expected_id = record['id']
        retrieved = search_function(record)
        per_query_flags.append([int(doc['id']) == expected_id for doc in retrieved])

    metrics = {}
    metrics['hit_rate'] = hit_rate(per_query_flags)
    metrics['mrr'] = mrr(per_query_flags)
    return metrics

In [81]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/2335 [00:00<?, ?it/s]

{'hit_rate': 0.6124197002141327, 'mrr': 0.41243992386390693}

## Finding the best parameters

In [82]:
df_validation = df_question[:100]
df_test = df_question[100:]

In [83]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that MAXIMISES ``objective_function``.

    Args:
        param_ranges: Mapping of parameter name to a ``(min, max)`` pair.
            When both bounds are ints the parameter is drawn with ``randint``
            (inclusive); otherwise it is drawn with ``uniform``.
        objective_function: Callable taking the sampled parameter dict and
            returning a score; higher is better.
        n_iterations: Number of random parameter sets to try.
        seed: Optional seed. When given, sampling uses a private
            ``random.Random(seed)`` so the search is reproducible; when
            ``None`` the module-level ``random`` generator is used.

    Returns:
        ``(best_params, best_score)``; ``(None, -inf)`` if no iteration ran.
    """
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # start below any real score: we are maximising

    for _ in range(n_iterations):
        # Draw one random value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Keep the highest-scoring parameter set seen so far.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [84]:
gt_val = df_validation.to_dict(orient='records')

In [85]:
def minsearch_search(query, boost=None):
    """Top 10 index matches for ``query``, optionally with per-field boosts.

    ``boost`` maps field names to weights; ``None`` means no boosting.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={} if boost is None else boost,
        num_results=10,
    )

In [86]:
# Every text field gets the same boost search range of 0.0 to 3.0.
param_ranges = {
    field: (0.0, 3.0)
    for field in ('title', 'hazard_category', 'product_category', 'hazard', 'product')
}


def objective(boost_params):
    """Return the validation-set MRR obtained with ``boost_params`` as boosts."""
    metrics = evaluate(
        gt_val,
        lambda q: minsearch_search(q['question'], boost_params),
    )
    return metrics['mrr']

In [87]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'title': 2.9956641322868047,
  'hazard_category': 0.41403862613871456,
  'product_category': 1.9716156627484709,
  'hazard': 0.9218557425650392,
  'product': 0.6469289358889216},
 0.582)

In [189]:
def minsearch_improved(query):
    """Top 10 index matches for ``query`` using the tuned per-field boosts."""
    tuned_boost = {
        'title': 2.99,
        'hazard_category': 0.41,
        'product_category': 1.97,
        'hazard': 0.92,
        'product': 0.64,
    }
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=tuned_boost,
        num_results=10,
    )

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/2335 [00:00<?, ?it/s]

{'hit_rate': 0.6672376873661671, 'mrr': 0.47780649875938963}

## RAG evaluation

In [89]:
# LLM-as-judge prompt. It is filled with str.format(question=..., answer_llm=...);
# the doubled braces ({{ and }}) come out as literal braces around the JSON
# skeleton the evaluator is asked to return.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [90]:
len(ground_truth)

2335

In [95]:
ground_truth[0]

{'id': 1013,
 'question': 'What specific batches of vacuum-packed Organic Tofu were recalled by The Engine Shed?'}

In [93]:
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)

In [94]:
print(answer_llm)

None of the recalled products are vacuum-packed Organic Tofu. However, there is a mention of "The Engine Shed recalls various batches of their vacuum-packed Organic Tofu due to a potential risk of botulism" in the context provided.

Based on this information, I'm unable to provide specific batch numbers for these recalled Organic Tofus as that particular product's recall is only mentioned briefly and not detailed further.


In [96]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What specific batches of vacuum-packed Organic Tofu were recalled by The Engine Shed?
Generated Answer: None of the recalled products are vacuum-packed Organic Tofu. However, there is a mention of "The Engine Shed recalls various batches of their vacuum-packed Organic Tofu due to a potential risk of botulism" in the context provided.

Based on this information, I'm unable to provide specific batch numbers for these recalled Organic Tofus as that particular product's recall is only mentioned briefly and not detailed further.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Re

In [97]:
import json

In [98]:
df_sample = df_question.sample(n=200, random_state=1)

In [99]:
sample = df_sample.to_dict(orient='records')

In [108]:
sample[0]

{'id': 1319,
 'question': 'How will consumers be notified about this recall, and what steps should they take to return the affected products?'}

In [115]:
evaluations = {}

In [None]:
def _extract_json_object(text):
    """Parse the JSON object spanning the first '{' to the last '}' in ``text``.

    This copes with replies that wrap the JSON in a code fence or surround it
    with prose, wherever the object sits in the reply.

    Raises:
        ValueError: if no brace-delimited span exists or it is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No JSON object found in evaluator output: {text!r}")
    return json.loads(text[start:end + 1])


for record in tqdm(sample):
    doc_id = record['id']  # not named `id`, which would shadow the builtin

    # Skip ids that already have an entry so an interrupted run can resume.
    # NOTE(review): `id` is the document id and the ground truth holds several
    # questions per id, so later questions for the same document are skipped
    # too - confirm whether one evaluation per document is intended.
    if doc_id in evaluations:
        continue

    question = record['question']
    answer_llm = rag(question)

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    raw_evaluation = llm(prompt)
    print(raw_evaluation)
    evaluation = _extract_json_object(raw_evaluation)

    evaluations[doc_id] = {
        'id': doc_id,
        'question': question,
        'answer_llm': answer_llm,
        'evaluation': evaluation
    }

In [146]:
evaluations[1319]

{'id': 1319,
 'question': 'How will consumers be notified about this recall, and what steps should they take to return the affected products?',
 'answer_llm': "Based on the context provided, it appears that there are multiple recalls of beef products due to misbranding and undeclared allergens. However, I will focus on the specific recall mentioned in the context:\n\n**California Firm Recalls Beef Products Due to Misbranding and Undeclared Allergen**\n\n**Hazard:** Milk and products thereof\n\n**Product:** Beef products\n\nTo answer your question:\n\nConsumers will be notified about this recall through a public notice, likely posted on the website of the relevant regulatory agency (such as the FDA or USDA). The notice may also be disseminated through other channels, such as social media, news outlets, and point-of-sale notifications.\n\nAs for what steps consumers should take to return the affected products:\n\nConsumers who have purchased the recalled beef products should check their 

In [195]:
df_eval = pd.DataFrame(evaluations, columns=['id', 'question', 'relevance', 'answer', 'explanation'])

In [196]:
# Flatten every evaluation into one row and build the frame in a single call.
# Constructing it once (instead of concatenating one-row frames in a loop)
# avoids quadratic copying and makes the cell safe to re-run: df_eval is
# rebuilt from `evaluations` rather than appended to.
eval_columns = ['id', 'question', 'relevance', 'answer', 'explanation']
eval_rows = [
    {
        'id': entry['id'],
        'question': entry['question'],
        'relevance': entry['evaluation']['Relevance'],
        'answer': entry['answer_llm'],
        'explanation': entry['evaluation']['Explanation'],
    }
    for entry in evaluations.values()
]
df_eval = pd.DataFrame(eval_rows, columns=eval_columns)

In [132]:
# df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

# df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
# df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

# df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
# df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

# del df_eval['record']
# del df_eval['evaluation']

In [198]:
df_eval.reset_index(drop=True, inplace=True)

In [199]:
df_eval.to_csv('../data/rag-evaluationa-llama3.1.tsv', sep="\t", index=False)