In [1]:
import pandas as pd

In [2]:
# Load the training incidents. dtype=str keeps every column as text, and
# index_col=False stops pandas from using the first (unnamed) CSV column as
# the index, so it is kept as a regular "Unnamed: 0" column.
df = pd.read_csv("../data/incidents_train.csv", index_col=False, dtype=str)

In [3]:
# Sanity-check the load: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5082 entries, 0 to 5081
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        5082 non-null   object
 1   year              5082 non-null   object
 2   month             5082 non-null   object
 3   day               5082 non-null   object
 4   country           5082 non-null   object
 5   title             5082 non-null   object
 6   text              5082 non-null   object
 7   hazard-category   5082 non-null   object
 8   product-category  5082 non-null   object
 9   hazard            5082 non-null   object
 10  product           5082 non-null   object
dtypes: object(11)
memory usage: 436.9+ KB


In [4]:
# Name the unnamed first column "id" and switch the hyphenated column names to
# snake_case so they match the placeholders used in the prompt entry template.
df = df.rename(columns={"Unnamed: 0": "id", "hazard-category": "hazard_category", "product-category": "product_category"})

In [5]:
import minsearch

In [6]:
# One dict per row (column name -> string value); this list is what gets
# passed to the search index.
documents = df.to_dict(orient="records")

In [7]:
# Per the minsearch docs, text_fields are matched against free-text queries
# and keyword_fields are indexed for exact-match filtering. year/month/day/
# country are in neither list, so they are carried along in the records but
# are not searchable.
index = minsearch.Index(
    text_fields=['title', 'text', 'hazard_category', 'product_category', 'hazard', 'product'],
    keyword_fields=['id']
)

In [8]:
# Build the index over all incident records.
index.fit(documents)

<minsearch.Index at 0x746537b2d070>

In [9]:
def search(query, num_results=5):
    """Return the best-matching incident records for a free-text query.

    Args:
        query: Free-text question or keywords, forwarded to ``index.search``.
        num_results: Maximum number of records to request from the index.
            Defaults to 5, the value that used to be hard-coded here.

    Returns:
        The result of ``index.search`` for the query (a list of record
        dicts, as shown by the sample call in this notebook).
    """
    # `index` is the module-level minsearch index built in an earlier cell.
    return index.search(query=query, num_results=num_results)

In [10]:
# Smoke-test retrieval with a sample question; the cell displays the matches.
search("main allergens in biscuits")

[{'id': '412',
  'year': '2009',
  'month': '5',
  'day': '12',
  'country': 'au',
  'title': "Haigh's Chocolate and Almond Cookies",
  'text': "PRA No. 2009/10796 Date published 12 May 2009 Product description Best before 4 August 2009. Plastic over-wrap. Package size 120g. The product has been on the marketplace for 2-3 weeks. Identifying features n/a What are the defects? Labelling—undeclared allegen What are the hazards? Allergic reaction What should consumers do? Return to any Haigh's store for a full refund. Call 1800 819 757 for further information. Supplier Haigh's Manufacturing Pty Ltd Traders who sold this product Haigh's Chocolate Stores Where the product was sold New South Wales South Australia Victoria Recall advertisements and supporting documentation No 20 - Attach B - 12 05 09.doc (38 KB) Coordinating agency Food Standards Australia New Zealand is the coordinating agency for this recall. Product category Confectionery × Close",
  'hazard_category': 'allergens',
  'produ

In [11]:
from openai import OpenAI

# Point the OpenAI client at a local Ollama server, which exposes an
# OpenAI-compatible API under /v1. The client insists on an api_key value;
# per Ollama's OpenAI-compatibility docs it is not checked, so 'ollama' is a
# placeholder rather than a real secret.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [12]:
# Peek at one record to see which fields are available for the prompt.
documents[0]

{'id': '0',
 'year': '1994',
 'month': '1',
 'day': '7',
 'country': 'us',
 'title': 'Recall Notification: FSIS-024-94',
 'text': "Case Number: 024-94   \n            Date Opened: 07/01/1994   \n            Date Closed: 09/22/1994 \n    \n            Recall Class:  1   \n            Press Release (Y/N):  Y  \n    \n            Domestic Est. Number:  05893  P   \n              Name:  GERHARD'S NAPA VALLEY SAUSAGE\n    \n            Imported Product (Y/N):  N       \n            Foreign Estab. Number:  N/A\n    \n            City:  NAPA    \n            State:  CA   \n            Country:  USA\n    \n            Product:  SMOKED CHICKEN SAUSAGE\n    \n            Problem:  BACTERIA   \n            Description: LISTERIA\n    \n            Total Pounds Recalled:  2,894   \n            Pounds Recovered:  2,894",
 'hazard_category': 'biological',
 'product_category': 'meat, egg and dairy products',
 'hazard': 'listeria monocytogenes',
 'product': 'smoked sausage'}

In [13]:
# Snippet rendered once per retrieved record and placed in the prompt CONTEXT.
# Only the structured fields are included; the full incident `text` is not.
entry_template = """
 'title': {title}
 'hazard_category': {hazard_category}
 'product_category': {product_category}
 'hazard': {hazard}
 'product': {product}
""".strip()

def build_prompt(query, search_results):
    """Assemble the LLM prompt for a question and its retrieved records.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of record dicts. Each must provide the keys
            named in ``entry_template``; extra keys are ignored.

    Returns:
        The instruction text, the question and one rendered entry per record,
        with surrounding whitespace stripped.
    """
    # Every rendered entry is followed by a blank line; the trailing one is
    # removed by the final strip() below.
    context_block = "".join(
        entry_template.format(**record) + "\n\n" for record in search_results
    )

    instructions = """
You're a food hazard detection assistant. Answer the QUESTION based on the CONTEXT from the food-incident reports.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    return instructions.format(question=query, context=context_block).strip()

def llm(prompt, model='llama3.1'):
    """Send a prompt to the chat-completions endpoint and return the reply text.

    Args:
        prompt: Full prompt text; sent as the content of a single user message.
        model: Name of the model to query. Defaults to 'llama3.1', the value
            that used to be hard-coded here.

    Returns:
        The message content of the first choice in the response.
    """
    # `client` is the module-level OpenAI-compatible client created earlier.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [14]:
def rag(query):
    """Answer a question with retrieval-augmented generation.

    Retrieves records with ``search``, turns them into a prompt with
    ``build_prompt`` and returns what ``llm`` produces for that prompt.
    """
    return llm(build_prompt(query, search(query)))

In [None]:
# End-to-end check: allergen question. Needs the local model server to be running.
print(rag("Which allergen do biscuits contain?"))

In [None]:
# End-to-end check: biological-hazard question for a specific product.
print(rag("tell me the main biological hazard found in smoked sausage"))

In [None]:
# End-to-end check: broad question that is not tied to a single product.
print(rag("What types of foreign bodies were found in products?"))