In [None]:
import pandas as pd

# Retrieval evaluation

In [None]:
df = pd.read_csv("data/incidents_train.csv", index_col=False, dtype=str)
df = df.rename(columns={"Unnamed: 0": "id", "hazard-category": "hazard_category", "product-category": "product_category"})
documents = df.to_dict(orient="records")

In [None]:
from openai import OpenAI

client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [None]:
documents[0]

In [72]:
prompt_template = """
You emulate a user of a food hazard detection application.
Formulate 5 questions this user might ask based on a food-incident record. 
Make the questions specific to this record.

The record:

'title': {title}
'hazard_category': {hazard_category}
'product_category': {product_category}
'hazard': {hazard}
'product': {product}

Provide the output in parsable JSON based on the following schema:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [80]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    #print(prompt)
    response = client.chat.completions.create(
        model='llama3.1',
        messages=[{"role": "user", "content": prompt}],
        temperature=-0.2
    )

    return response.choices[0].message.content

In [None]:
from tqdm.auto import tqdm

In [None]:
results = {}

In [None]:
import json

In [81]:
for doc in tqdm(documents[1031:1500]): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw.split("```")[1].lstrip("json"))
    with open("data/generated_questions_raw_llama_json.tsv", "a+") as myfile:
        myfile.write("\t".join([doc_id, "\t".join(questions['questions'])]))
        myfile.write("\n")
    results[doc_id] = questions_raw

  0%|          | 0/469 [00:00<?, ?it/s]

Here are 5 questions this user might ask based on the food-incident record, provided as a JSON object according to the specified schema:

```
{
  "questions": [
    "What is the specific product within semolina and products thereof that was withdrawn due to incorrect allergen information?",
    "Are there any other types of cereals or bakery products from Worsbrough Mill that may also contain gluten and have incorrect allergen labels?",
    "Has Worsbrough Mill issued a recall notice for these affected products, and if so, what is the current status of the recall?",
    "What steps can consumers take to identify whether they have purchased any of the withdrawn semolina or related products from Worsbrough Mill?",
    "Are there any regulatory actions being taken against Worsbrough Mill as a result of this incident, such as fines or penalties?"
  ]
}
```
{'questions': ['What is the specific product within semolina and products thereof that was withdrawn due to incorrect allergen informat

JSONDecodeError: Expecting ',' delimiter: line 17 column 14 (char 458)

In [3]:
import pandas as pd

In [60]:
df_final = pd.read_csv("../data/generated_questions_raw_llama_json.tsv", delimiter="\t", names=["id","q1","q2","q3","q4","q5"], header=None)

In [61]:
df_final.head()

Unnamed: 0,id,q1,q2,q3,q4,q5
0,1013,What specific batches of vacuum-packed Organic...,Are there any reported cases of botulism assoc...,How did the company's poor or insufficient con...,Has the relevant regulatory agency investigate...,What steps does The Engine Shed plan to take t...
1,1014,What type of food product requires recall due ...,Which food allergy is associated with this Col...,Are nuts the primary hazard associated with th...,Why did the authorities extend a recall relate...,Does Coles Brand Simply GlutenFree Nut Free Ce...
2,1015,What is the specific reason for the withdrawal...,Are there any other products from the same man...,Can I get a list of countries where these cere...,How can consumers who may have purchased these...,Is there an investigation underway to determin...
3,1016,What is the nature of the allergen (milk and p...,Is there a risk of cross-contamination with mi...,Can consumers who are lactose intolerant safel...,What steps has the manufacturer taken to preve...,Are there any updates on the recall status (e....
4,1017,What is the nature of the biological hazard in...,Is there a risk of cross-contamination with ot...,Can you provide more information about the vol...,Are there any specific guidelines or recommend...,Has the FDA taken any enforcement actions agai...


In [62]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467 entries, 0 to 466
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      467 non-null    int64 
 1   q1      467 non-null    object
 2   q2      467 non-null    object
 3   q3      467 non-null    object
 4   q4      467 non-null    object
 5   q5      467 non-null    object
dtypes: int64(1), object(5)
memory usage: 22.0+ KB


In [63]:
results = df_final.to_dict(orient='records')

In [64]:
results

[{'id': 1013,
  'q1': 'What specific batches of vacuum-packed Organic Tofu were recalled by The Engine Shed?',
  'q2': 'Are there any reported cases of botulism associated with consumption of the affected Organic Tofu?',
  'q3': "How did the company's poor or insufficient controls lead to a potential risk of botulism?",
  'q4': 'Has the relevant regulatory agency investigated this incident and taken any actions against The Engine Shed?',
  'q5': 'What steps does The Engine Shed plan to take to prevent similar incidents in the future?'},
 {'id': 1014,
  'q1': 'What type of food product requires recall due to nut contamination?',
  'q2': 'Which food allergy is associated with this Coles Supermarkets recall?',
  'q3': 'Are nuts the primary hazard associated with this extended recall notice?',
  'q4': 'Why did the authorities extend a recall related to cereal from Coles Supermarkets?',
  'q5': 'Does Coles Brand Simply GlutenFree Nut Free Cereal pose a risk due to its non-gluten-free and nu

In [65]:
final_results = []

for entry in results:
    doc_id=entry["id"]
    final_results.append((doc_id, entry["q1"].strip('"')))
    final_results.append((doc_id, entry["q2"].strip('"')))
    final_results.append((doc_id, entry["q3"].strip('"')))
    final_results.append((doc_id, entry["q4"].strip('"')))
    final_results.append((doc_id, entry["q5"].strip('"')))

In [66]:
final_results[0]

(1013,
 'What specific batches of vacuum-packed Organic Tofu were recalled by The Engine Shed?')

In [67]:
df_results = pd.DataFrame(final_results, columns=["id", "question"])

In [69]:
df_results.to_csv('../data/ground-truth-retrieval.tsv', delimiter="\t", index=False)

TypeError: NDFrame.to_csv() got an unexpected keyword argument 'delimiter'