In [1]:
import pandas as pd

# Retrieval evaluation

In [2]:
# Read every column as a string, give the hyphenated/unnamed columns
# identifier-friendly names, then expose the rows as a list of dicts.
df = (
    pd.read_csv("../data/incidents_train.csv", index_col=False, dtype=str)
    .rename(
        columns={
            "Unnamed: 0": "id",
            "hazard-category": "hazard_category",
            "product-category": "product_category",
        }
    )
)
documents = df.to_dict(orient="records")

In [3]:
from openai import OpenAI

# Chat client pointed at a server on localhost:11434.
# NOTE(review): presumably a local Ollama instance, with 'ollama' as a
# placeholder key rather than a real credential -- confirm.
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')

In [4]:
# Show the first record to check the renamed columns and the string-typed values.
documents[0]

{'id': '0',
 'year': '1994',
 'month': '1',
 'day': '7',
 'country': 'us',
 'title': 'Recall Notification: FSIS-024-94',
 'text': "Case Number: 024-94   \n            Date Opened: 07/01/1994   \n            Date Closed: 09/22/1994 \n    \n            Recall Class:  1   \n            Press Release (Y/N):  Y  \n    \n            Domestic Est. Number:  05893  P   \n              Name:  GERHARD'S NAPA VALLEY SAUSAGE\n    \n            Imported Product (Y/N):  N       \n            Foreign Estab. Number:  N/A\n    \n            City:  NAPA    \n            State:  CA   \n            Country:  USA\n    \n            Product:  SMOKED CHICKEN SAUSAGE\n    \n            Problem:  BACTERIA   \n            Description: LISTERIA\n    \n            Total Pounds Recalled:  2,894   \n            Pounds Recovered:  2,894",
 'hazard_category': 'biological',
 'product_category': 'meat, egg and dairy products',
 'hazard': 'listeria monocytogenes',
 'product': 'smoked sausage'}

In [5]:
# Prompt sent to the model for each incident record. The single-brace
# placeholders (title, hazard_category, product_category, hazard, product) are
# filled with str.format; the doubled braces around the JSON schema are
# escapes so that the schema reaches the model as literal braces.
prompt_template = """
You emulate a user of a food hazard detection application.
Formulate 5 questions this user might ask based on a food-incident record. 
Make the questions specific to this record.

The record:

'title': {title}
'hazard_category': {hazard_category}
'product_category': {product_category}
'hazard': {hazard}
'product': {product}

Provide the output in parsable JSON based on the following schema:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [6]:
def generate_questions(doc):
    """Ask the chat model for questions about one incident record.

    Fills ``prompt_template`` with the record's fields and sends the result
    to ``client`` as a single user message.

    Args:
        doc: Mapping that supplies every placeholder used by
            ``prompt_template``; extra keys are ignored by ``str.format``.

    Returns:
        The message content of the first choice in the response, unparsed.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}
    completion = client.chat.completions.create(
        model='llama3.1',
        messages=[user_message],
        temperature=0.2,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [7]:
from tqdm.auto import tqdm

In [8]:
import json

In [9]:
# Raw model reply per document id. The generation loop checks this dict to
# skip documents it has already processed while the kernel is alive.
results = {}

In [10]:
# Characters that would break the one-row-per-document TSV layout.
_TSV_UNSAFE = str.maketrans("\t\r\n", "   ")


def _extract_questions(raw_response):
    """Turn the model's raw reply into a flat list of TSV-safe questions.

    Accepts the JSON object either wrapped in a Markdown code fence (with or
    without a ``json`` language tag) or as the bare reply. Each entry of the
    ``"questions"`` array may be a plain string or a dict carrying the
    question under ``"text"``; entries with no text are skipped.

    Args:
        raw_response: Message content returned by the model.

    Returns:
        List of question strings with tabs and line breaks replaced by spaces.

    Raises:
        json.JSONDecodeError: If the payload is not valid JSON.
        KeyError: If the parsed object has no ``"questions"`` key.
    """
    payload = raw_response
    if "```" in raw_response:
        # Fenced reply: the JSON sits right after the first fence.
        payload = raw_response.split("```")[1]
    payload = payload.strip()
    # Drop the language tag as a prefix. str.lstrip("json") would instead
    # strip any run of the characters j/s/o/n and miss an upper-case tag.
    if payload[:4].lower() == "json":
        payload = payload[4:]

    questions = []
    for item in json.loads(payload)["questions"]:
        text = item.get("text") if isinstance(item, dict) else item
        if not text:
            continue
        questions.append(str(text).translate(_TSV_UNSAFE))
    return questions


for doc in tqdm(documents[240:300]):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = _extract_questions(questions_raw)

    # Append after every document so an interrupted run keeps its progress.
    # The explicit encoding keeps the file readable on any platform.
    with open("../data/generated_questions_100_2.tsv", "a", encoding="utf-8") as myfile:
        myfile.write("\t".join([doc_id] + questions))
        myfile.write("\n")
    results[doc_id] = questions_raw

  0%|          | 0/60 [00:00<?, ?it/s]

In [11]:
# The generated file has no header row: a document id followed by the questions.
question_columns = ["id", "q1", "q2", "q3", "q4", "q5"]
df_final = pd.read_csv(
    "../data/generated_questions_100_2.tsv",
    sep="\t",
    header=None,
    names=question_columns,
)

In [12]:
# Preview the first rows of the parsed question table.
df_final.head()

Unnamed: 0,id,q1,q2,q3,q4,q5
0,200,What is the nature of the hazard found in the ...,Are there any other products from Pork Farms t...,How many glass fragments were detected in the ...,Were the affected products distributed only wi...,What is the estimated cost of the product with...
1,201,What is the nature of the allergen hazard asso...,Are eggs and products thereof commonly used as...,Has Confectionery Trading Company recalled any...,Can consumers who are allergic to eggs safely ...,What steps can I take to inform customers abou...
2,202,What is the nature of the hazard identified in...,How does the presence of eggs in milk chocolat...,Is there any information on the quantity of co...,Were any recalls issued as a result of this in...,What steps can I take to ensure compliance wit...
3,203,What is the specific type of heavy metal conta...,Is there any information available about the s...,Has any recall been initiated for this product...,Are there any similar incidents reported invol...,What are the potential health risks associated...
4,204,What is Sudan 4 and what makes it a chemical h...,Is palm oil commonly used in food products tha...,Can you provide more information about the una...,How does this incident affect the safety of ot...,What are the next steps for recalling or re-la...


In [13]:
# Check row count, dtypes and non-null counts for the id and question columns.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      102 non-null    int64 
 1   q1      102 non-null    object
 2   q2      102 non-null    object
 3   q3      102 non-null    object
 4   q4      102 non-null    object
 5   q5      102 non-null    object
dtypes: int64(1), object(5)
memory usage: 4.9+ KB


In [14]:
# One dict per row of df_final, keyed by column name.
# NOTE(review): this rebinds `results`, which the generation loop uses as a
# dict keyed by document id, to a list. Re-running that loop after this cell
# would fail at `results[doc_id] = ...`; consider a distinct name here.
results = df_final.to_dict(orient='records')

In [15]:
# Show only the first records; echoing the whole list floods the notebook output.
results[:2]

[{'id': 200,
  'q1': 'What is the nature of the hazard found in the Scotch eggs and Savoury Bites products?',
  'q2': 'Are there any other products from Pork Farms that have been recalled due to foreign bodies?',
  'q3': 'How many glass fragments were detected in the affected batches of Precooked cooked pork meat products?',
  'q4': 'Were the affected products distributed only within the UK or also internationally?',
  'q5': 'What is the estimated cost of the product withdrawal and recall for Marks and Spencer and Somerfield?'},
 {'id': 201,
  'q1': 'What is the nature of the allergen hazard associated with Fabulously Fun Lolly Pops?',
  'q2': 'Are eggs and products thereof commonly used as ingredients in confectionery products like lollipops?',
  'q3': 'Has Confectionery Trading Company recalled any other products due to similar allergen hazards?',
  'q4': 'Can consumers who are allergic to eggs safely consume Fabulously Fun Lolly Pops?',
  'q5': 'What steps can I take to inform custo

In [16]:
# Flatten the wide per-document rows into one (id, question) pair per question.
QUESTION_COLUMNS = ("q1", "q2", "q3", "q4", "q5")

final_results = []

for entry in results:
    doc_id = entry["id"]
    for column in QUESTION_COLUMNS:
        question = entry[column]
        # read_csv fills short rows with NaN (a float); calling .strip on it
        # would raise AttributeError, so skip anything that is not text.
        if not isinstance(question, str):
            continue
        # Remove stray double quotes wrapping the question.
        final_results.append((doc_id, question.strip('"')))

In [17]:
# Show the first flattened (id, question) pair.
final_results[0]

(200,
 'What is the nature of the hazard found in the Scotch eggs and Savoury Bites products?')

In [18]:
# One row per (document id, question) pair.
df_results = pd.DataFrame(
    data=final_results,
    columns=["id", "question"],
)

In [20]:
# Write the question table as a tab-separated file without the index column.
ground_truth_path = '../data/ground-truth-retrieval_100_2.tsv'
df_results.to_csv(ground_truth_path, sep="\t", index=False)