In [1]:
import pandas as pd

# Retrieval evaluation

In [2]:
# Read every column of the training incidents as text, then rename the unnamed
# index column and the hyphenated category columns so that each record's keys
# can be used as str.format placeholders in the prompt template.
df = pd.read_csv(
    "../data/incidents_train.csv",
    index_col=False,
    dtype=str,
).rename(
    columns={
        "Unnamed: 0": "id",
        "hazard-category": "hazard_category",
        "product-category": "product_category",
    }
)
# One plain dict per incident row.
documents = df.to_dict(orient="records")

In [3]:
from openai import OpenAI

# Client for an OpenAI-compatible endpoint served on localhost:11434.
# NOTE(review): the key looks like a placeholder rather than a real
# credential -- confirm before pointing this at a hosted service.
client = OpenAI(api_key="ollama", base_url="http://localhost:11434/v1/")

In [4]:
documents[0]

{'id': '0',
 'year': '1994',
 'month': '1',
 'day': '7',
 'country': 'us',
 'title': 'Recall Notification: FSIS-024-94',
 'text': "Case Number: 024-94   \n            Date Opened: 07/01/1994   \n            Date Closed: 09/22/1994 \n    \n            Recall Class:  1   \n            Press Release (Y/N):  Y  \n    \n            Domestic Est. Number:  05893  P   \n              Name:  GERHARD'S NAPA VALLEY SAUSAGE\n    \n            Imported Product (Y/N):  N       \n            Foreign Estab. Number:  N/A\n    \n            City:  NAPA    \n            State:  CA   \n            Country:  USA\n    \n            Product:  SMOKED CHICKEN SAUSAGE\n    \n            Problem:  BACTERIA   \n            Description: LISTERIA\n    \n            Total Pounds Recalled:  2,894   \n            Pounds Recovered:  2,894",
 'hazard_category': 'biological',
 'product_category': 'meat, egg and dairy products',
 'hazard': 'listeria monocytogenes',
 'product': 'smoked sausage'}

In [5]:
# Prompt sent to the LLM for each incident record. The single-brace fields are
# filled from a record dict via str.format; the doubled braces on the schema
# line are format escapes that render as literal "{" and "}".
prompt_template = """
You emulate a user of a food hazard detection application.
Formulate 5 questions this user might ask based on a food-incident record. 
Make the questions specific to this record.

The record:

'title': {title}
'hazard_category': {hazard_category}
'product_category': {product_category}
'hazard': {hazard}
'product': {product}

Provide the output in parsable JSON based on the following schema:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [6]:
def generate_questions(doc, model='llama3.1', temperature=0.2):
    """Ask the chat model for user-style questions about one incident record.

    Args:
        doc: Mapping that provides every placeholder used by
            ``prompt_template`` (title, hazard_category, product_category,
            hazard, product). Additional keys are ignored by ``str.format``.
        model: Name of the model to query. Defaults to the value that was
            previously hard-coded, so existing calls behave the same.
        temperature: Sampling temperature forwarded to the chat API.

    Returns:
        The text content of the first completion choice, exactly as the
        model produced it (no JSON parsing is done here).
    """
    prompt = prompt_template.format(**doc)
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )

    return response.choices[0].message.content

In [7]:
from tqdm.auto import tqdm

In [8]:
import json

In [9]:
# Raw LLM reply keyed by document id; the generation loop consults this so
# that re-running it does not regenerate documents already handled.
results = dict()

In [10]:
# Tabs and line breaks inside a question would corrupt the one-row-per-document
# TSV written below, so they are replaced with spaces.
_TSV_UNSAFE = str.maketrans("\t\r\n", "   ")


def _parse_questions(raw):
    """Extract the list of question strings from a raw model reply.

    The first ``` fenced section is used when the reply contains one;
    otherwise the whole reply is searched. Inside that text the outermost
    ``{...}`` span is parsed as JSON, which tolerates a language tag after
    the fence or prose around the object.

    Items of the "questions" list may be plain strings or dicts carrying the
    question under a "text" key; dicts with a missing or empty "text" and items
    of any other type are dropped.

    Raises:
        ValueError: if no JSON object can be parsed, it has no "questions"
            list, or the list yields no usable question.
    """
    fence_parts = raw.split("```")
    text = fence_parts[1] if len(fence_parts) > 1 else raw

    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in the model reply")

    # json.JSONDecodeError subclasses ValueError, so malformed JSON surfaces
    # to the caller as the same exception type as the checks in this function.
    payload = json.loads(text[start:end + 1])
    items = payload.get("questions")
    if not isinstance(items, list):
        raise ValueError('model reply has no "questions" list')

    questions = []
    for item in items:
        if isinstance(item, dict):
            item = item.get("text")
            if not item:
                continue
        if isinstance(item, str):
            questions.append(item.translate(_TSV_UNSAFE))

    if not questions:
        raise ValueError("model reply contains no usable questions")
    return questions


for doc in tqdm(documents[300:400]):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    try:
        questions = _parse_questions(questions_raw)
    except ValueError as err:
        # Report and move on instead of aborting the whole run. The id is not
        # stored in `results`, so re-running this cell retries the document.
        print(f"Skipping document {doc_id}: {err}")
        continue

    # Append one row per document right away so progress survives an
    # interrupted run. UTF-8 matches what pd.read_csv assumes by default.
    with open("../data/generated_questions_300.tsv", "a", encoding="utf-8") as myfile:
        myfile.write("\t".join([doc_id, *questions]) + "\n")
    results[doc_id] = questions_raw

  0%|          | 0/100 [00:00<?, ?it/s]

In [11]:
# The generated-questions file has no header row: the first field is the
# document id and the remaining five are the questions.
df_final = pd.read_csv(
    "../data/generated_questions_300.tsv",
    sep="\t",
    header=None,
    names=["id", "q1", "q2", "q3", "q4", "q5"],
)

In [12]:
df_final.head()

Unnamed: 0,id,q1,q2,q3,q4,q5
0,300,What type of other hazard was associated with ...,Was the recall related to a contamination issu...,Can you provide more information about the fir...,Did the recall affect any specific cuts or typ...,Are there any updates or follow-up actions tak...
1,301,What is the nature of the alleged labelling/mi...,Can you provide more information about the pro...,How does Woolworths Limited's quality control ...,What are the potential health risks associated...,Are there any similar incidents of labelling/m...
2,302,What was the nature of the labelling/misdescri...,Was the potato salad recalled due to this inci...,Can you provide more information about the No ...,"How did the company respond to the incident, a...",Are there any similar incidents involving Fran...
3,303,What is the specific allergen present in Gold ...,Are there any other products from Gold Medal S...,How does the presence of cereals containing gl...,Are there any similar seafood products on the ...,What steps can consumers take to avoid consumi...
4,304,What is the current status of the recall for P...,Are there any similar products from other manu...,Has the Australian government issued any guide...,Can I get a list of all the countries where Pr...,Are there any updates on the investigation int...


In [13]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      100 non-null    int64 
 1   q1      100 non-null    object
 2   q2      100 non-null    object
 3   q3      100 non-null    object
 4   q4      100 non-null    object
 5   q5      100 non-null    object
dtypes: int64(1), object(5)
memory usage: 4.8+ KB


In [14]:
# One dict per document row (id, q1..q5).
# NOTE(review): this rebinds `results`, which the generation loop uses as a
# dict keyed by document id -- re-running that loop after this cell will fail.
results = df_final.to_dict("records")

In [15]:
results

[{'id': 300,
  'q1': 'What type of other hazard was associated with the recalled Angus beef product?',
  'q2': 'Was the recall related to a contamination issue or a non-food safety concern?',
  'q3': 'Can you provide more information about the firm that issued the recall on June 15, 2007?',
  'q4': 'Did the recall affect any specific cuts or types of Angus beef products?',
  'q5': 'Are there any updates or follow-up actions taken by regulatory agencies regarding this recall?'},
 {'id': 301,
  'q1': 'What is the nature of the alleged labelling/misdescription fraud involving Home Brand Scorched Almonds Milk Chocolate?',
  'q2': "Can you provide more information about the product category 'cocoa and cocoa preparations, coffee and tea' and how it relates to this incident?",
  'q3': "How does Woolworths Limited's quality control process fail to detect such mislabelling incidents?",
  'q4': 'What are the potential health risks associated with consuming milk chocolates that have been mislabel

In [16]:
# Flatten the wide per-document records into (id, question) pairs, one pair
# per generated question.
final_results = []

for entry in results:
    doc_id = entry["id"]
    for column in ("q1", "q2", "q3", "q4", "q5"):
        question = entry[column]
        # A row with fewer than five questions is padded with NaN (a float)
        # when the TSV is read back; skip those cells instead of failing on
        # .strip().
        if not isinstance(question, str):
            continue
        # Drop double quotes wrapping the question.
        final_results.append((doc_id, question.strip('"')))

In [17]:
final_results[0]

(300,
 'What type of other hazard was associated with the recalled Angus beef product?')

In [18]:
# Long-format ground truth: one row per (document id, question) pair.
df_results = pd.DataFrame(
    data=final_results,
    columns=["id", "question"],
)

In [20]:
# Persist the ground truth as a tab-separated file without the row index.
df_results.to_csv(
    '../data/ground-truth-retrieval_300.tsv',
    index=False,
    sep="\t",
)