In [6]:
import hashlib
import pandas as pd
import json
from openai import OpenAI
import os
from dotenv import load_dotenv
from tqdm.auto import tqdm
import pickle
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed





In [7]:
path = '../Data_csvs/data_v1.csv'
df = pd.read_csv(path)
df

Unnamed: 0,id,topic,Question,Answer
0,1,cancer,What is (are) Non-Small Cell Lung Cancer ?,Key Points Non-small cell lung cancer is a dis...
1,2,cancer,Who is at risk for Non-Small Cell Lung Cancer? ?,Smoking is the major risk factor for non-small...
2,3,cancer,What are the symptoms of Non-Small Cell Lung C...,Signs of non-small cell lung cancer include a ...
3,4,cancer,How to diagnose Non-Small Cell Lung Cancer ?,Tests that examine the lungs are used to detec...
4,5,cancer,What is the outlook for Non-Small Cell Lung Ca...,Certain factors affect prognosis (chance of re...
...,...,...,...,...
15995,15996,Other,What is (are) COPD ?,COPD (chronic obstructive pulmonary disease) m...
15996,15997,Other,What is (are) Complex Regional Pain Syndrome ?,Complex regional pain syndrome (CRPS) is a chr...
15997,15998,Other,What is (are) Kidney Stones ?,A kidney stone is a solid piece of material th...
15998,15999,Other,What is (are) Meniere's Disease ?,Meniere's disease is a disorder of the inner e...


In [8]:
documents = df.to_dict(orient='records')

In [80]:
df.columns

Index(['id', 'topic', 'Question', 'Answer'], dtype='object')

## Genereting ids

In [9]:
## we depend on the content not the order
def generate_document_id(doc):
    combined = f"{doc['topic']}-{doc['Question']}-{doc['Answer'][:15]}"

    # Create MD5 hash object
    hash_object = hashlib.md5(combined.encode())

    # Get hexadecimal representation
    hash_hex = hash_object.hexdigest()

    # Return first 8 characters as document ID
    document_id = hash_hex[:10]

    return document_id

In [10]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

## Cheching if the ids are completely unique, we had hash collition: (Spoiler they are duplicates)

In [11]:
from collections import defaultdict

hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [13]:
len(hashes)

15953

In [12]:
for k, values in hashes.items ():
    if len(values) > 1:
        print(k, len(values) )

ffa3a8b71c 3
78cb73d0e8 2
bac6b4c5cc 3
402da42b54 2
4d9599c0a6 3
eb953c3a0a 2
071cee1adf 2
5024bacd64 2
d84d8c0b69 4
a8fc39ee4c 2
d57185cf28 2
98883b65de 2
d24a30cd61 2
754531c78f 2
a9d9c856a6 2
10fddff4bd 2
cbd3fe713f 2
f1c8b2d56f 2
8c49fa0b9a 2
ad5ef2f7f6 2
b1853c8b5a 2
8eddc22223 2
00f9762457 2
c0597e960a 2
7fe24cc54a 2
32aeb461e6 2
d4e7108828 2
710c85021c 2
6594c432e2 2
a7764efb5d 2
1054780676 2
a45c9fcfdd 2
a33a04683b 2
5b431f381b 2
8e48e57ab8 2
cba2706fdb 2
94c686fbe3 2
205a157d1c 2
d4e4cea242 2
368e4a94e5 2
647b7e3c64 2
246b0dfc78 2


In [8]:
hashes['ffa3a8b71c']

[{'id': 'ffa3a8b71c',
  'topic': 'cancer',
  'Question': 'What is (are) Endometrial Cancer ?',
  'Answer': 'Key Points Endometrial cancer is a disease in which malignant (cancer) cells form in the tissues of the endometrium. Endometrial cancer is the most common invasive cancer of the female reproductive system. Endometrial cancer is a disease in which malignant (cancer) cells form in the tissues of the endometrium. The endometrium is the lining of the uterus. The uterus is part of the female reproductive system. It is a hollow, pear-shaped, muscular organ in the pelvis, where a fetus grows. Cancer of the endometrium is different from cancer of the muscle of the uterus, which is called sarcoma of the uterus. See the PDQ summary on Uterine Sarcoma Treatment for more information. See the following PDQ summaries for more information about endometrial cancer: Endometrial Cancer Screening Endometrial Cancer Treatment Endometrial cancer is the most common invasive cancer of the female reprod

In [88]:
with open('documents-json-hash.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)


In [89]:
!head documents-json-hash.json

[
  {
    "id": "5e34bcbaa8",
    "topic": "cancer",
    "Question": "What is (are) Non-Small Cell Lung Cancer ?",
    "Answer": "Key Points Non-small cell lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung. There are several types of non-small cell lung cancer. Smoking is the major risk factor for non-small cell lung cancer. Signs of non-small cell lung cancer include a cough that doesn't go away and shortness of breath. Tests that examine the lungs are used to detect (find), diagnose, and stage non-small cell lung cancer. Certain factors affect prognosis (chance of recovery) and treatment options. For most patients with non-small cell lung cancer, current treatments do not cure the cancer. Non-small cell lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung. The lungs are a pair of cone-shaped breathing organs in the chest. The lungs bring oxygen into the body as you breathe in. They release carbon diox

## Using a llm to create the questions to evaluate retrieval

In [13]:
prompt_template = """
You are helping create evaluation questions for a medical question-answering system based on authoritative NIH health information.

Generate 5 diverse questions that patients, medical students, or healthcare seekers might realistically ask,
where the provided medical record contains the answer.

REQUIREMENTS:
- Questions should be complete, natural, and varied in phrasing
- Each question should be answerable using the information in the record
- Use different perspectives (patient concerns, educational queries, practical questions)
- Avoid copying exact phrases from the record - rephrase naturally
- Questions should reflect real-world medical information seeking behavior

The medical record:

Topic: {topic}
Question: {Question}
Answer: {Answer}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", "question3", "question4", "question5"]
""".strip()

In [14]:
doc = documents[5]
prompt = prompt_template.format(**doc)
print(prompt)

You are helping create evaluation questions for a medical question-answering system based on authoritative NIH health information.

Generate 5 diverse questions that patients, medical students, or healthcare seekers might realistically ask,
where the provided medical record contains the answer.

REQUIREMENTS:
- Questions should be complete, natural, and varied in phrasing
- Each question should be answerable using the information in the record
- Use different perspectives (patient concerns, educational queries, practical questions)
- Avoid copying exact phrases from the record - rephrase naturally
- Questions should reflect real-world medical information seeking behavior

The medical record:

Topic: cancer
Question: what research (or clinical trials) is being done for Non-Small Cell Lung Cancer ?
Answer: New types of treatment are being tested in clinical trials. This summary section describes treatments that are being studied in clinical trials. It may not mention every new treatment 

In [23]:
load_dotenv(override=True)  # ← Force reload

# Get the API key
api_key = os.getenv('OPENAI_API_KEY')
print(f"API Key found: {api_key is not None}")

API Key found: True


## Running ollama through docker AI hub

In [22]:
import requests

# Check model info
response = requests.get('http://127.0.0.1:12434/v1/chat/completions')
print(response.text)

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=12434): Max retries exceeded with url: /v1/chat/completions (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x332c5ea50>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [33]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    # Use 127.0.0.1 instead of localhost
    url = "http://127.0.0.1:12434/v1/chat/completions"

    data = {
        "model": "granite-4.0-h-tiny",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ]
    }

    response = requests.post(url, json=data, timeout=120)
    response.raise_for_status()

    result = response.json()
    json_response = result["choices"][0]["message"]["content"]
    return json_response

In [29]:
import requests

# Test if service is available
print("🔍 Testing Docker Model Runner...")
try:
    response = requests.get('http://localhost:12434/')
    print("✅ Docker Model Runner is running!")
    print(response.text[:200])
except Exception as e:
    print(f"❌ Not running: {e}")

# Use OpenAI-compatible endpoint
url = "http://localhost:12434/engines/llama.cpp/v1/chat/completions"

data = {
    "model": "ai/granite-4.0-h-tiny:latest",
    "messages": [
        {
            "role": "user",
            "content": prompt
        }
    ]
}

try:
    response = requests.post(url, json=data, timeout=60)
    response.raise_for_status()

    result = response.json()
    print("\n✅ Chat successful!")
    print(result["choices"][0]["message"]["content"])

except Exception as e:
    print(f"\n❌ Error: {e}")

🔍 Testing Docker Model Runner...
❌ Not running: HTTPConnectionPool(host='localhost', port=12434): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x33d771df0>: Failed to resolve 'localhost' ([Errno 8] nodename nor servname provided, or not known)"))
The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.

❌ Error: HTTPConnectionPool(host='localhost', port=12434): Max retries exceeded with url: /engines/llama.cpp/v1/chat/completions (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x33d771010>: Failed to resolve 'localhost' ([Errno 8] nodename nor servname provided, or not known)"))


### To slow so we move to ollama

In [12]:
url = "http://localhost:11434/api/generate"

data = {
    "model": "gpt-oss:20b",
    "prompt": prompt,
    "stream": False
}

try:
    response = requests.post(url, json=data, timeout=60)
    response.raise_for_status()

    result = response.json()
    print("\n✅ Ollama is working!")
    print(result["response"])

except Exception as e:
    print(f"\n❌ Error: {e}")


❌ Error: HTTPConnectionPool(host='localhost', port=11434): Max retries exceeded with url: /api/generate (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11e7db4a0>: Failed to establish a new connection: [Errno 61] Connection refused'))


In [24]:
api_key = os.getenv('OPENAI_API_KEY')

client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=api_key,
)


In [27]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='openai/gpt-oss-120b',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response


# def generate_questions(doc):
#     prompt = prompt_template.format(**doc)
#
#     # OpenAI-compatible endpoint
#     url = "http://localhost:12434/engines/llama.cpp/v1/chat/completions"
#
#     data = {
#         "model": "ai/granite-4.0-h-tiny:latest",
#         "messages": [
#             {
#                 "role": "user",
#                 "content": prompt
#             }
#         ]
#     }
#
#     response = requests.post(url, json=data, timeout=120)
#     response.raise_for_status()
#
#     result = response.json()
#     json_response = result["choices"][0]["message"]["content"]
#     return json_response

In [28]:
generate_questions(documents[0])

'["What exactly is non‑small cell lung cancer and how does it differ from other lung cancers?","Which factors increase my risk of developing non‑small cell lung cancer, and what symptoms should I watch for?","Can you explain the main subtypes of non‑small cell lung cancer and how they are identified under a microscope?","How is non‑small cell lung cancer diagnosed and staged, and what tests are typically used?","What are the current treatment goals for patients with non‑small cell lung cancer, and why is cure rarely achieved?"]'

In [None]:
results= {}
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

# results = {}
# max_workers = 4
#
# with ThreadPoolExecutor(max_workers=max_workers) as executor:
#     future_to_doc = {
#         executor.submit(generate_questions, doc): doc
#         for doc in documents
#         if doc['id'] not in results
#     }
#
#     # Process completed tasks with progress bar
#     for future in tqdm(as_completed(future_to_doc), total=len(future_to_doc)):
#         doc_id, questions = future.result()
#         results[doc_id] = questions
#
# print(f"✅ Processed {len(results)} documents")

  0%|          | 4/16000 [00:10<11:47:50,  2.66s/it]

In [25]:
results1 = {}
failed = []

# Calculate 6% of total documents
total_docs = len(documents)
start_index = int(total_docs * 0.06)

# Create progress bar starting at 6%
pbar = tqdm(total=total_docs, initial=start_index, desc="Generating questions")

for doc in documents:
    doc_id = doc['id']

    # Skip if already processed
    if doc_id in results1:
        pbar.update(1)
        continue

    try:
        questions = generate_questions(doc)
        results1[doc_id] = questions
        pbar.update(1)

    except Exception as e:
        print(f"\nFailed on doc {doc_id}: {e}")
        failed.append({"id": doc_id, "error": str(e)})
        pbar.update(1)
        continue

pbar.close()

print(f"\nCompleted: {len(results1)}/{total_docs}")
print(f"Failed: {len(failed)}")

Generating questions:   6%|▌         | 961/16000 [02:00<501:19:10, 120.00s/it]


Failed on doc 5e34bcbaa8: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 962/16000 [04:00<501:17:59, 120.01s/it]


Failed on doc c2845efbbc: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 963/16000 [06:00<501:16:08, 120.01s/it]


Failed on doc d1bc99107d: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 964/16000 [08:00<501:13:59, 120.01s/it]


Failed on doc abdbe99dd2: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 965/16000 [10:00<501:12:05, 120.01s/it]


Failed on doc 597a81526a: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 966/16000 [12:00<501:10:07, 120.01s/it]


Failed on doc 4979172719: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 967/16000 [14:00<501:08:15, 120.01s/it]


Failed on doc d563c7d596: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 968/16000 [16:00<501:06:12, 120.01s/it]


Failed on doc 02577e3169: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 969/16000 [18:00<501:04:18, 120.01s/it]


Failed on doc d11834721f: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 970/16000 [20:00<501:02:05, 120.01s/it]


Failed on doc e109719c38: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 971/16000 [22:00<501:00:07, 120.01s/it]


Failed on doc 6becabff6a: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 972/16000 [24:00<500:58:01, 120.01s/it]


Failed on doc 252662340d: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 973/16000 [26:00<500:56:16, 120.01s/it]


Failed on doc 4faee827b1: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 974/16000 [28:00<500:57:11, 120.02s/it]


Failed on doc ee6941d3af: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 975/16000 [30:00<500:54:53, 120.02s/it]


Failed on doc 0867df07b6: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 976/16000 [32:00<500:52:15, 120.02s/it]


Failed on doc d84a78215e: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 977/16000 [34:00<500:49:54, 120.02s/it]


Failed on doc 7cdefbf7d7: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 978/16000 [36:00<500:47:28, 120.01s/it]


Failed on doc ef17a8af15: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 979/16000 [38:00<500:45:17, 120.01s/it]


Failed on doc 72dbfb80d6: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 980/16000 [40:00<500:43:04, 120.01s/it]


Failed on doc ffa3a8b71c: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 981/16000 [42:00<500:41:01, 120.01s/it]


Failed on doc 5571edf550: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 982/16000 [44:00<500:38:56, 120.01s/it]


Failed on doc ffa3a8b71c: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 983/16000 [46:00<500:37:06, 120.01s/it]


Failed on doc 088527fd30: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 984/16000 [48:00<500:35:09, 120.01s/it]


Failed on doc de8d113e10: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 985/16000 [50:00<500:33:03, 120.01s/it]


Failed on doc 6382ed5025: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 986/16000 [52:00<500:30:48, 120.01s/it]


Failed on doc 58a1402850: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 987/16000 [3:02:06<10116:34:52, 2425.88s/it]


Failed on doc 621ffb7c2b: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 988/16000 [5:21:07<17516:31:41, 4200.61s/it]


Failed on doc 8f023665d0: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


Generating questions:   6%|▌         | 989/16000 [6:28:06<17288:06:13, 4146.10s/it]


Failed on doc 9cdbb2c4d0: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


KeyboardInterrupt: 

In [27]:
results3 = {}
failed = []

# Calculate 6% of total documents
total_docs = len(documents)
start_index = int(total_docs * 0.26)

# Create progress bar starting at 6%
pbar = tqdm(total=total_docs, initial=start_index, desc="Generating questions")

for doc in documents:
    doc_id = doc['id']

    # Skip if already processed
    if doc_id in results1:
        pbar.update(1)
        continue

    try:
        questions = generate_questions(doc)
        results1[doc_id] = questions
        pbar.update(1)

    except Exception as e:
        print(f"\nFailed on doc {doc_id}: {e}")
        failed.append({"id": doc_id, "error": str(e)})
        pbar.update(1)
        continue

pbar.close()

print(f"\nCompleted: {len(results1)}/{total_docs}")
print(f"Failed: {len(failed)}")


Generating questions:   6%|▌         | 989/16000 [6:31:19<3375:55:10, 809.63s/it]  

Generating questions:  26%|██▌       | 4161/16000 [02:00<394:39:02, 120.01s/it][A


Failed on doc 5e34bcbaa8: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4162/16000 [04:00<394:37:48, 120.01s/it][A


Failed on doc c2845efbbc: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4163/16000 [06:00<394:36:26, 120.01s/it][A


Failed on doc d1bc99107d: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4164/16000 [08:00<394:34:28, 120.01s/it][A


Failed on doc abdbe99dd2: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4165/16000 [10:00<394:32:12, 120.01s/it][A


Failed on doc 597a81526a: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4166/16000 [12:00<394:30:12, 120.01s/it][A


Failed on doc 4979172719: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4167/16000 [14:00<394:28:19, 120.01s/it][A


Failed on doc d563c7d596: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4168/16000 [16:00<394:26:09, 120.01s/it][A


Failed on doc 02577e3169: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4169/16000 [18:00<394:24:17, 120.01s/it][A


Failed on doc d11834721f: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4170/16000 [20:00<394:22:18, 120.01s/it][A


Failed on doc e109719c38: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4171/16000 [22:00<394:20:19, 120.01s/it][A


Failed on doc 6becabff6a: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4172/16000 [24:00<394:18:37, 120.01s/it][A


Failed on doc 252662340d: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4173/16000 [26:00<394:16:34, 120.01s/it][A


Failed on doc 4faee827b1: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4174/16000 [28:00<394:14:29, 120.01s/it][A


Failed on doc ee6941d3af: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4175/16000 [30:00<394:12:37, 120.01s/it][A


Failed on doc 0867df07b6: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)



Generating questions:  26%|██▌       | 4176/16000 [2:16:08<6571:30:39, 2000.80s/it][A


Failed on doc d84a78215e: HTTPConnectionPool(host='localhost', port=12434): Read timed out. (read timeout=120)


KeyboardInterrupt: 

In [None]:
print(f"Failed: {len(failed)}")
results3 = {}
failed = []

# Calculate 6% of total documents
total_docs = len(documents)
start_index = int(total_docs * 0.36)

# Create progress bar starting at 6%
pbar = tqdm(total=total_docs, initial=start_index, desc="Generating questions")

for doc in documents:
    doc_id = doc['id']

    # Skip if already processed
    if doc_id in results1:
        pbar.update(1)
        continue

    try:
        questions = generate_questions(doc)
        results1[doc_id] = questions
        pbar.update(1)

    except Exception as e:
        print(f"\nFailed on doc {doc_id}: {e}")
        failed.append({"id": doc_id, "error": str(e)})
        pbar.update(1)
        continue

pbar.close()

print(f"\nCompleted: {len(results1)}/{total_docs}")
print(f"Failed: {len(failed)}")

In [None]:
results3

In [None]:
import json
with open('results3_emergency_save.json', 'w') as f:
    json.dump(results3, f, indent=2)

In [None]:
results3 = {}
failed = []

# Calculate 6% of total documents
total_docs = len(documents)
start_index = int(total_docs * 0.66)

# Create progress bar starting at 6%
pbar = tqdm(total=total_docs, initial=start_index, desc="Generating questions")

for doc in documents:
    doc_id = doc['id']

    # Skip if already processed
    if doc_id in results1:
        pbar.update(1)
        continue

    try:
        questions = generate_questions(doc)
        results1[doc_id] = questions
        pbar.update(1)

    except Exception as e:
        print(f"\nFailed on doc {doc_id}: {e}")
        failed.append({"id": doc_id, "error": str(e)})
        pbar.update(1)
        continue

pbar.close()

print(f"\nCompleted: {len(results1)}/{total_docs}")
print(f"Failed: {len(failed)}")

In [20]:
print(results)



In [147]:
json.loads(results['5e34bcbaa8'])

['What are the early signs that might indicate the presence of non-small cell lung cancer?',
 'How does the type of cell affect the progression and treatment of non-small cell lung cancer?',
 'Why is smoking considered a major risk factor for developing non-small cell lung cancer?',
 'How do clinical trials contribute to the advancement of treatment for different stages of non-small cell lung cancer?',
 'Can you explain the differences between non-small cell lung cancer and small cell lung cancer?']

In [None]:
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

In [None]:
results['1f6520ca']


In [None]:
parsed_resulst = {}

for doc_id, json_questions in results.items():
    parsed_resulst[doc_id] = json.loads(json_questions)

In [None]:
doc_index = {d['id']: d for d in documents}


In [None]:
final_results = []

for doc_id, questions in parsed_resulst.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, course, doc_id))

In [None]:
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])


In [None]:
df.to_csv('ground-truth-data.csv', index=False)


In [None]:
!head ground-truth-data.csv
