In [1]:
import hashlib
import pandas as pd
import json
from openai import OpenAI
import os
from dotenv import load_dotenv
from tqdm.auto import tqdm
import pickle
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

import minsearch
from fastembed import TextEmbedding




  from .autonotebook import tqdm as notebook_tqdm


# Generating dataset

In [118]:
path = '../Data_csvs/data_v1.csv'
df = pd.read_csv(path)
df = df.rename(columns={'id': 'doc_id'})
df

Unnamed: 0,doc_id,topic,Question,Answer
0,1,cancer,What is (are) Non-Small Cell Lung Cancer ?,Key Points Non-small cell lung cancer is a dis...
1,2,cancer,Who is at risk for Non-Small Cell Lung Cancer? ?,Smoking is the major risk factor for non-small...
2,3,cancer,What are the symptoms of Non-Small Cell Lung C...,Signs of non-small cell lung cancer include a ...
3,4,cancer,How to diagnose Non-Small Cell Lung Cancer ?,Tests that examine the lungs are used to detec...
4,5,cancer,What is the outlook for Non-Small Cell Lung Ca...,Certain factors affect prognosis (chance of re...
...,...,...,...,...
15995,15996,Other,What is (are) COPD ?,COPD (chronic obstructive pulmonary disease) m...
15996,15997,Other,What is (are) Complex Regional Pain Syndrome ?,Complex regional pain syndrome (CRPS) is a chr...
15997,15998,Other,What is (are) Kidney Stones ?,A kidney stone is a solid piece of material th...
15998,15999,Other,What is (are) Meniere's Disease ?,Meniere's disease is a disorder of the inner e...


In [117]:
df

In [119]:
documents = df.to_dict(orient='records')

In [120]:
df.columns

Index(['doc_id', 'topic', 'Question', 'Answer'], dtype='object')

## Genereting ids

In [121]:
## we depend on the content not the order
def generate_document_id(doc):
    combined = f"{doc['topic']}-{doc['Question']}-{doc['Answer'][:15]}"

    # Create MD5 hash object
    hash_object = hashlib.md5(combined.encode())

    # Get hexadecimal representation
    hash_hex = hash_object.hexdigest()

    # Return first 8 characters as document ID
    document_id = hash_hex[:10]

    return document_id

In [122]:
for doc in documents:
    doc['doc_id'] = generate_document_id(doc)

## Cheching if the ids are completely unique, we had hash collition: (Spoiler they are duplicates)

In [123]:
documents

[{'doc_id': '5e34bcbaa8',
  'topic': 'cancer',
  'Question': 'What is (are) Non-Small Cell Lung Cancer ?',
  'Answer': "Key Points Non-small cell lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung. There are several types of non-small cell lung cancer. Smoking is the major risk factor for non-small cell lung cancer. Signs of non-small cell lung cancer include a cough that doesn't go away and shortness of breath. Tests that examine the lungs are used to detect (find), diagnose, and stage non-small cell lung cancer. Certain factors affect prognosis (chance of recovery) and treatment options. For most patients with non-small cell lung cancer, current treatments do not cure the cancer. Non-small cell lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung. The lungs are a pair of cone-shaped breathing organs in the chest. The lungs bring oxygen into the body as you breathe in. They release carbon dioxide, a waste pr

In [124]:
from collections import defaultdict

hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['doc_id']
    hashes[doc_id].append(doc)

In [125]:
len(hashes)

15953

In [103]:
for k, values in hashes.items ():
    if len(values) > 1:
        print(k, len(values) )

ffa3a8b71c 3
78cb73d0e8 2
bac6b4c5cc 3
402da42b54 2
4d9599c0a6 3
eb953c3a0a 2
071cee1adf 2
5024bacd64 2
d84d8c0b69 4
a8fc39ee4c 2
d57185cf28 2
98883b65de 2
d24a30cd61 2
754531c78f 2
a9d9c856a6 2
10fddff4bd 2
cbd3fe713f 2
f1c8b2d56f 2
8c49fa0b9a 2
ad5ef2f7f6 2
b1853c8b5a 2
8eddc22223 2
00f9762457 2
c0597e960a 2
7fe24cc54a 2
32aeb461e6 2
d4e7108828 2
710c85021c 2
6594c432e2 2
a7764efb5d 2
1054780676 2
a45c9fcfdd 2
a33a04683b 2
5b431f381b 2
8e48e57ab8 2
cba2706fdb 2
94c686fbe3 2
205a157d1c 2
d4e4cea242 2
368e4a94e5 2
647b7e3c64 2
246b0dfc78 2


In [135]:
hashes['0867df07b6']

[{'doc_id': '0867df07b6',
  'topic': 'cancer',
  'Question': 'What are the symptoms of Extragonadal Germ Cell Tumors ?',
  'Answer': 'Signs and symptoms of extragonadal germ cell tumors include breathing problems and chest pain. Malignant extragonadal germ cell tumors may cause signs and symptoms as they grow into nearby areas. Other conditions may cause the same signs and symptoms. Check with your doctor if you have any of the following: Chest pain. Breathing problems. Cough. Fever. Headache. Change in bowel habits. Feeling very tired. Trouble walking. Trouble in seeing or moving the eyes.'}]

In [105]:
with open('../Data_csvs/documents-json-hash.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)


In [107]:
!head ../Data_csvs/documents-json-hash.json

[
  {
    "id": 1,
    "topic": "cancer",
    "Question": "What is (are) Non-Small Cell Lung Cancer ?",
    "Answer": "Key Points Non-small cell lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung. There are several types of non-small cell lung cancer. Smoking is the major risk factor for non-small cell lung cancer. Signs of non-small cell lung cancer include a cough that doesn't go away and shortness of breath. Tests that examine the lungs are used to detect (find), diagnose, and stage non-small cell lung cancer. Certain factors affect prognosis (chance of recovery) and treatment options. For most patients with non-small cell lung cancer, current treatments do not cure the cancer. Non-small cell lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung. The lungs are a pair of cone-shaped breathing organs in the chest. The lungs bring oxygen into the body as you breathe in. They release carbon dioxide, a wast

## Using a llm to create the questions to evaluate retrieval

In [13]:
prompt_template = """
You are helping create evaluation questions for a medical question-answering system based on authoritative NIH health information.

Generate 5 diverse questions that patients, medical students, or healthcare seekers might realistically ask,
where the provided medical record contains the answer.

REQUIREMENTS:
- Questions should be complete, natural, and varied in phrasing
- Each question should be answerable using the information in the record
- Use different perspectives (patient concerns, educational queries, practical questions)
- Avoid copying exact phrases from the record - rephrase naturally
- Questions should reflect real-world medical information seeking behavior

The medical record:

Topic: {topic}
Question: {Question}
Answer: {Answer}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", "question3", "question4", "question5"]
""".strip()

In [14]:
doc = documents[5]
prompt = prompt_template.format(**doc)
print(prompt)

You are helping create evaluation questions for a medical question-answering system based on authoritative NIH health information.

Generate 5 diverse questions that patients, medical students, or healthcare seekers might realistically ask,
where the provided medical record contains the answer.

REQUIREMENTS:
- Questions should be complete, natural, and varied in phrasing
- Each question should be answerable using the information in the record
- Use different perspectives (patient concerns, educational queries, practical questions)
- Avoid copying exact phrases from the record - rephrase naturally
- Questions should reflect real-world medical information seeking behavior

The medical record:

Topic: cancer
Question: what research (or clinical trials) is being done for Non-Small Cell Lung Cancer ?
Answer: New types of treatment are being tested in clinical trials. This summary section describes treatments that are being studied in clinical trials. It may not mention every new treatment 

In [23]:
load_dotenv(override=True)  # ← Force reload

# Get the API key
api_key = os.getenv('OPENAI_API_KEY')
print(f"API Key found: {api_key is not None}")

API Key found: True


## Running ollama through docker AI hub

In [22]:
import requests

# Check model info
response = requests.get('http://127.0.0.1:12434/v1/chat/completions')
print(response.text)

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=12434): Max retries exceeded with url: /v1/chat/completions (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x332c5ea50>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [33]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    # Use 127.0.0.1 instead of localhost
    url = "http://127.0.0.1:12434/v1/chat/completions"

    data = {
        "model": "granite-4.0-h-tiny",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ]
    }

    response = requests.post(url, json=data, timeout=120)
    response.raise_for_status()

    result = response.json()
    json_response = result["choices"][0]["message"]["content"]
    return json_response

In [29]:
import requests

# Test if service is available
print("🔍 Testing Docker Model Runner...")
try:
    response = requests.get('http://localhost:12434/')
    print("✅ Docker Model Runner is running!")
    print(response.text[:200])
except Exception as e:
    print(f"❌ Not running: {e}")

# Use OpenAI-compatible endpoint
url = "http://localhost:12434/engines/llama.cpp/v1/chat/completions"

data = {
    "model": "ai/granite-4.0-h-tiny:latest",
    "messages": [
        {
            "role": "user",
            "content": prompt
        }
    ]
}

try:
    response = requests.post(url, json=data, timeout=60)
    response.raise_for_status()

    result = response.json()
    print("\n✅ Chat successful!")
    print(result["choices"][0]["message"]["content"])

except Exception as e:
    print(f"\n❌ Error: {e}")

🔍 Testing Docker Model Runner...
❌ Not running: HTTPConnectionPool(host='localhost', port=12434): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x33d771df0>: Failed to resolve 'localhost' ([Errno 8] nodename nor servname provided, or not known)"))
The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.

❌ Error: HTTPConnectionPool(host='localhost', port=12434): Max retries exceeded with url: /engines/llama.cpp/v1/chat/completions (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x33d771010>: Failed to resolve 'localhost' ([Errno 8] nodename nor servname provided, or not known)"))


### To slow so we move to ollama

In [12]:
url = "http://localhost:11434/api/generate"

data = {
    "model": "gpt-oss:20b",
    "prompt": prompt,
    "stream": False
}

try:
    response = requests.post(url, json=data, timeout=60)
    response.raise_for_status()

    result = response.json()
    print("\n✅ Ollama is working!")
    print(result["response"])

except Exception as e:
    print(f"\n❌ Error: {e}")


❌ Error: HTTPConnectionPool(host='localhost', port=11434): Max retries exceeded with url: /api/generate (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11e7db4a0>: Failed to establish a new connection: [Errno 61] Connection refused'))


In [24]:
api_key = os.getenv('OPENAI_API_KEY')

client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=api_key,
)


In [27]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='openai/gpt-oss-120b',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response


# def generate_questions(doc):
#     prompt = prompt_template.format(**doc)
#
#     # OpenAI-compatible endpoint
#     url = "http://localhost:12434/engines/llama.cpp/v1/chat/completions"
#
#     data = {
#         "model": "ai/granite-4.0-h-tiny:latest",
#         "messages": [
#             {
#                 "role": "user",
#                 "content": prompt
#             }
#         ]
#     }
#
#     response = requests.post(url, json=data, timeout=120)
#     response.raise_for_status()
#
#     result = response.json()
#     json_response = result["choices"][0]["message"]["content"]
#     return json_response

In [28]:
generate_questions(documents[0])

'["What exactly is non‑small cell lung cancer and how does it differ from other lung cancers?","Which factors increase my risk of developing non‑small cell lung cancer, and what symptoms should I watch for?","Can you explain the main subtypes of non‑small cell lung cancer and how they are identified under a microscope?","How is non‑small cell lung cancer diagnosed and staged, and what tests are typically used?","What are the current treatment goals for patients with non‑small cell lung cancer, and why is cure rarely achieved?"]'

In [30]:
results= {}
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

# results = {}
# max_workers = 4
#
# with ThreadPoolExecutor(max_workers=max_workers) as executor:
#     future_to_doc = {
#         executor.submit(generate_questions, doc): doc
#         for doc in documents
#         if doc['id'] not in results
#     }
#
#     # Process completed tasks with progress bar
#     for future in tqdm(as_completed(future_to_doc), total=len(future_to_doc)):
#         doc_id, questions = future.result()
#         results[doc_id] = questions
#
# print(f"✅ Processed {len(results)} documents")

 16%|█▌        | 2580/16000 [2:16:57<11:52:23,  3.19s/it]


KeyboardInterrupt: 

In [59]:
results

{'5e34bcbaa8': '["What are the main types of non‑small cell lung cancer and how are they identified under a microscope?","Can smoking increase my risk of developing non‑small cell lung cancer, and what symptoms should prompt me to see a doctor?","How is non‑small cell lung cancer diagnosed and staged using lung tests?","Why is non‑small cell lung cancer generally not curable with current treatments, and what options are available for patients?","Are there clinical trials for non‑small cell lung cancer, and how can I find information about enrolling in one?"]',
 'c2845efbbc': '["What lifestyle habits increase my chances of developing non‑small cell lung cancer?","Can secondhand smoke exposure or workplace chemicals contribute to my risk of getting lung cancer?","How does starting to smoke at a young age affect my likelihood of non‑small cell lung cancer later on?","Are older adults automatically at higher risk for lung cancer even if they never smoked?","If I have a family history of lu

In [32]:
import json
with open('../Data_csvs/ground_truth.json', 'w') as f:
    json.dump(results, f, indent=2)

In [33]:
json.loads(results['5e34bcbaa8'])

['What are the main types of non‑small cell lung cancer and how are they identified under a microscope?',
 'Can smoking increase my risk of developing non‑small cell lung cancer, and what symptoms should prompt me to see a doctor?',
 'How is non‑small cell lung cancer diagnosed and staged using lung tests?',
 'Why is non‑small cell lung cancer generally not curable with current treatments, and what options are available for patients?',
 'Are there clinical trials for non‑small cell lung cancer, and how can I find information about enrolling in one?']

In [36]:
with open('../Data_csvs/ground_truth.json', 'r') as f:
    results = json.load(f)


### Fixing malformed questions

In [45]:
import json
import re

def fix_missing_commas_and_quotes(json_string):
    """Fix missing commas and quotes in LLM-generated question lists"""

    fixed = json_string

    # Pattern 1: ","Word → ","Word (missing opening quote after comma)
    fixed = re.sub(r'",([A-Z])', r'","\1', fixed)

    # Pattern 2: word" Word → word","Word (missing comma and opening quote)
    fixed = re.sub(r'([?.!])" ([A-Z])', r'\1","\2', fixed)

    # Pattern 3: word "Word → word","Word (missing closing quote, comma, opening quote)
    fixed = re.sub(r'([a-z?!.]) "([A-Z])', r'\1","\2', fixed)

    # NEW Pattern 4: ?" Word → ?","Word (question mark followed by quote, space, capital)
    fixed = re.sub(r'\?" ([A-Z])', r'?","\1', fixed)

    # NEW Pattern 5: word?\n" → word?",\n" (newline without comma)
    fixed = re.sub(r'\?"\s*\n\s*"', r'?",\n"', fixed)

    # NEW Pattern 6: Missing opening bracket
    if not fixed.startswith('['):
        fixed = '[' + fixed

    # NEW Pattern 7: Missing closing bracket
    if not fixed.endswith(']'):
        fixed = fixed + ']'

    # NEW Pattern 8: Unterminated string at end - add closing quote
    # Check if last quote is opening quote
    if fixed.count('"') % 2 != 0:
        # Odd number of quotes - missing a closing quote
        fixed = fixed.rstrip(']') + '"]'

    return fixed

# More aggressive fallback
def extract_questions_robust(json_string):
    """Extract questions by finding all question patterns"""

    # Find all strings that end with ?
    questions = re.findall(r'"([^"]*\?)"', json_string)

    if not questions:
        # Try without requiring quotes - find text ending with ?
        # Split by patterns and clean
        parts = re.split(r'["\[\],\n]+', json_string)
        questions = [p.strip() for p in parts if p.strip().endswith('?')]

    return questions if questions else None

# Apply to your data
parsed_results = {}
failed_details = []

for doc_id, json_questions in results.items():
    try:
        # Try original first
        parsed_results[doc_id] = json.loads(json_questions)
    except json.JSONDecodeError:
        # Apply fixes
        fixed_json = fix_missing_commas_and_quotes(json_questions)

        try:
            parsed_results[doc_id] = json.loads(fixed_json)
            print(f"✅ Fixed: {doc_id}")
        except json.JSONDecodeError as e:
            # Try extraction method
            extracted = extract_questions_robust(json_questions)

            if extracted:
                parsed_results[doc_id] = extracted
                print(f"⚠️  Extracted: {doc_id} ({len(extracted)} questions)")
            else:
                print(f"❌ Failed: {doc_id}")
                failed_details.append({
                    'doc_id': doc_id,
                    'original': json_questions[:200],
                    'fixed': fixed_json[:200],
                    'error': str(e)
                })

# Remove None values
parsed_results = {k: v for k, v in parsed_results.items() if v is not None}

print(f"\n📊 Results:")
print(f"   Total: {len(results)}")
print(f"   Parsed: {len(parsed_results)}")
print(f"   Failed: {len(failed_details)}")

# Show details of failures
if failed_details:
    print(f"\n🔍 Failed items details:")
    for item in failed_details:
        print(f"\n   doc_id: {item['doc_id']}")
        print(f"   Original: {item['original']}...")
        print(f"   Error: {item['error']}")


✅ Fixed: 8264eb7221
✅ Fixed: 1781f4f201
✅ Fixed: 8eb2316c8c
✅ Fixed: c24bf4f5d1
✅ Fixed: 39055ded32
✅ Fixed: 5fd1553350
⚠️  Extracted: fc0b4a4410 (4 questions)
⚠️  Extracted: c2654f2163 (4 questions)
⚠️  Extracted: 2c06e55326 (4 questions)
⚠️  Extracted: 28012ea00d (4 questions)

📊 Results:
   Total: 2556
   Parsed: 2556
   Failed: 0


In [60]:
parsed_results


{'5e34bcbaa8': ['What are the main types of non‑small cell lung cancer and how are they identified under a microscope?',
  'Can smoking increase my risk of developing non‑small cell lung cancer, and what symptoms should prompt me to see a doctor?',
  'How is non‑small cell lung cancer diagnosed and staged using lung tests?',
  'Why is non‑small cell lung cancer generally not curable with current treatments, and what options are available for patients?',
  'Are there clinical trials for non‑small cell lung cancer, and how can I find information about enrolling in one?'],
 'c2845efbbc': ['What lifestyle habits increase my chances of developing non‑small cell lung cancer?',
  'Can secondhand smoke exposure or workplace chemicals contribute to my risk of getting lung cancer?',
  'How does starting to smoke at a young age affect my likelihood of non‑small cell lung cancer later on?',
  'Are older adults automatically at higher risk for lung cancer even if they never smoked?',
  'If I have a

In [67]:
data = []
for doc_id, questions in parsed_results.items():
    for question in questions:
        data.append({
            'question': question,
            'doc_id': doc_id
        })

df = pd.DataFrame(data)

In [68]:
df

Unnamed: 0,question,doc_id
0,What are the main types of non‑small cell lung...,5e34bcbaa8
1,Can smoking increase my risk of developing non...,5e34bcbaa8
2,How is non‑small cell lung cancer diagnosed an...,5e34bcbaa8
3,Why is non‑small cell lung cancer generally no...,5e34bcbaa8
4,Are there clinical trials for non‑small cell l...,5e34bcbaa8
...,...,...
625,What are the typical warning signs that might ...,2c627474a5
626,Which locations within the eye are associated ...,2c627474a5
627,Do doctors normally have to take a tissue biop...,2c627474a5
628,Is being older and having a fair complexion co...,2c627474a5


In [69]:
df.to_csv('ground-truth-data.csv', index=False)


# Evaluating

### Minsearch

In [126]:
df_ground_truth = pd.read_csv('ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient = 'records')
ground_truth


[{'question': 'What are the main types of non‑small cell lung cancer and how are they identified under a microscope?',
  'doc_id': '5e34bcbaa8'},
 {'question': 'Can smoking increase my risk of developing non‑small cell lung cancer, and what symptoms should prompt me to see a doctor?',
  'doc_id': '5e34bcbaa8'},
 {'question': 'How is non‑small cell lung cancer diagnosed and staged using lung tests?',
  'doc_id': '5e34bcbaa8'},
 {'question': 'Why is non‑small cell lung cancer generally not curable with current treatments, and what options are available for patients?',
  'doc_id': '5e34bcbaa8'},
 {'question': 'Are there clinical trials for non‑small cell lung cancer, and how can I find information about enrolling in one?',
  'doc_id': '5e34bcbaa8'},
 {'question': 'What lifestyle habits increase my chances of developing non‑small cell lung cancer?',
  'doc_id': 'c2845efbbc'},
 {'question': 'Can secondhand smoke exposure or workplace chemicals contribute to my risk of getting lung cancer?',

In [151]:
index = minsearch.Index(
    text_fields=['topic','Question', 'Answer' ],
    keyword_fields=['doc_id']
)

index.fit(documents)


<minsearch.minsearch.Index at 0x349537f50>

In [152]:
def search(query):
    boost = {}
    results = index.search(
        query = query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )
    return results

In [143]:
ground_truth[0]['question']

'What are the main types of non‑small cell lung cancer and how are they identified under a microscope?'

In [153]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['doc_id']
    results = search(query= q['question'])
    relevance = [d['doc_id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 630/630 [00:05<00:00, 125.63it/s]


In [155]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [156]:
hit_rate(relevance_total), mrr(relevance_total)

(0.7047619047619048, 0.44645502645502616)

## Elastic search

In [160]:
from elasticsearch import Elasticsearch



In [174]:
es = Elasticsearch(
    "http://localhost:9200",
)
es. info()

ObjectApiResponse({'name': '2644975cab24', 'cluster_name': 'docker-cluster', 'cluster_uuid': '8pP0ViywRdafR3xLfWObhg', 'version': {'number': '9.1.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '5e94055934defa56e454868b7783b2a3b683785e', 'build_date': '2025-08-05T01:07:31.959947279Z', 'build_snapshot': False, 'lucene_version': '10.2.2', 'minimum_wire_compatibility_version': '8.19.0', 'minimum_index_compatibility_version': '8.0.0'}, 'tagline': 'You Know, for Search'})

In [175]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "doc_id": {"type": "keyword"},
            "topic": {"type": "keyword"},
            "Question": {"type": "text"},
            "Answer": {"type": "text"}
        }
    }
}

index_name = "medial-questions-answers"

es.indices.delete(index=index_name, ignore_unavailable=True)
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'medial-questions-answers'})

In [176]:
for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

100%|██████████| 16000/16000 [00:19<00:00, 821.43it/s]


In [177]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["Question^3", "Answer", "topic"],
                        "type": "best_fields"
                    }
                },

            }
        }
    }

    response = es.search(index=index_name, body=search_query)

    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [179]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['doc_id']
    results = elastic_search(query= q['question'])
    relevance = [d['doc_id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 630/630 [00:01<00:00, 461.56it/s]


In [181]:
hit_rate(relevance_total), mrr(relevance_total)


(0.5650793650793651, 0.3624867724867725)

## Semantic search/hybrid search with ElasticSearch

In [None]:
EMBEDDING_DIMENSIONALITY = 512

for model in TextEmbedding.list_supported_models():
    if model["dim"] == EMBEDDING_DIMENSIONALITY:
        print(json.dumps(model, indent=2))

model_handle = "jinaai/jina-embeddings-v2-small-en"

In [184]:
from huggingface_hub import login

load_dotenv()
token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
login(token=token)

In [185]:
embedding = []
model_handle = TextEmbedding("jinaai/jina-embeddings-v2-small-en")

for doc in tqdm(documents):
    embeddings = list(model_handle.embed([doc['Answer']]))
    doc['Answer_embedded'] = embeddings[0]
    embedding.append(doc)

print(embedding)

Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00,  3.36it/s]
100%|██████████| 16000/16000 [04:34<00:00, 58.34it/s] 
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [186]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "doc_id": {"type": "keyword"},
            "topic": {"type": "keyword"},
            "Question": {"type": "text"},
            "Answer": {"type": "text"},
            "Answer_embedded": {
                "type": "dense_vector",
                "dims": 512,
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}
index_name= 'embedded_text'
es.indices.delete(index=index_name, ignore_unavailable=True)
es.indices.create(index=index_name, body=index_settings)
## adding embeddings to documents
for doc in embedding:
    try:
        es.index(index=index_name, document=doc)
    except Exception as e:
        print (e)

In [187]:
def hybrid_search(query, k=5, text_weight=0.5, vector_weight=0.5):
    # Generate query embedding
    query_embedding = list(model_handle.embed([query]))[0]

    search_query = {
        "size": k,
        "query": {
            "script_score": {
                "query": {
                    "bool": {
                        "should": [
                            {
                                "multi_match": {
                                    "query": query,
                                    "fields": ["Question^3", "Answer", "topic"]
                                }
                            }
                        ]
                    }
                },
                "script": {
                    "source": f"""
                        double text_score = _score * {text_weight};
                        double vector_score = cosineSimilarity(params.query_vector, 'Answer_embedded') * {vector_weight};
                        return text_score + vector_score;
                    """,
                    "params": {
                        "query_vector": query_embedding
                    }
                }
            }
        }
    }

    response = es.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

# Test it
results = hybrid_search("What are brain tumor symptoms?")
print(f"Found {len(results)} results")
for i, doc in enumerate(results, 1):
    print(f"{i}. {doc['Question'][:80]}...")

Found 5 results
1. What are the symptoms of Rhabdoid tumor ?...
2. What are the symptoms of Klatskin tumor ?...
3. What are the symptoms of Warthin tumor ?...
4. What are the symptoms of Desmoid tumor ?...
5. What are the symptoms of Glomus tympanicum tumor ?...


In [188]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['doc_id']
    results = hybrid_search(query= q['question'])
    relevance = [d['doc_id'] == doc_id for d in results]
    relevance_total.append(relevance)

hit_rate(relevance_total), mrr(relevance_total)


100%|██████████| 630/630 [00:06<00:00, 93.25it/s]


(0.5857142857142857, 0.3673544973544974)

### QDrant

In [189]:
from qdrant_client import QdrantClient, models


In [194]:
from qdrant_client import QdrantClient, models
from tqdm import tqdm

# Your collection setup
collection_name = "med-rag"
EMBEDDING_DIMENSIONALITY = 512  # jina-embeddings-v2-small-en

client = QdrantClient(url="http://localhost:6333")

# Delete and recreate collection
client.delete_collection(collection_name=collection_name)

client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

# Prepare and upload points
points = []

for idx, doc in enumerate(tqdm(documents, desc="Creating points")):
    # Generate embedding for the Answer field
    embedding = list(model_handle.embed([doc['Answer']]))[0]

    point = models.PointStruct(
        id=idx,
        vector=embedding,
        payload={
            "doc_id": doc['doc_id'],
            "topic": doc['topic'],
            "Question": doc['Question'],
            "Answer": doc['Answer']
        }
    )
    points.append(point)

# Upload to Qdrant in batches
batch_size = 100
for i in tqdm(range(0, len(points), batch_size), desc="Uploading batches"):
    batch = points[i:i+batch_size]
    client.upsert(
        collection_name=collection_name,
        points=batch
    )

print(f"Uploaded {len(points)} points to collection '{collection_name}'")


Creating points: 100%|██████████| 16000/16000 [04:18<00:00, 61.85it/s] 
Uploading batches: 100%|██████████| 160/160 [00:04<00:00, 34.33it/s]

Uploaded 16000 points to collection 'med-rag'





In [195]:
def qdrant_search(query, k=5):
    # Generate query embedding
    query_embedding = list(model_handle.embed([query]))[0]

    # Search in Qdrant
    search_results = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=k
    )

    # Extract results
    results = []
    for result in search_results:
        results.append(result.payload)

    return results

# Test it
results = qdrant_search("What are brain tumor symptoms?")
for i, doc in enumerate(results, 1):
    print(f"{i}. {doc['Question'][:80]}...")

1. What is (are) Brain Tumors ?...
2. What are the symptoms of Adult Central Nervous System Tumors ?...
3. What are the symptoms of Glioblastoma ?...
4. What are the symptoms of Gangliocytoma ?...
5. What are the symptoms of Childhood Brain and Spinal Cord Tumors ?...


  search_results = client.search(


In [197]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['doc_id']
    results = qdrant_search(query= q['question'])
    relevance = [d['doc_id'] == doc_id for d in results]
    relevance_total.append(relevance)

hit_rate(relevance_total), mrr(relevance_total)

  search_results = client.search(
100%|██████████| 630/630 [00:02<00:00, 217.58it/s]


(0.773015873015873, 0.6320370370370364)

# Finding the best params

In [None]:
df_validation = df_ground_truth[:100]
df_test = df_ground_truth[100:]

In [None]:
import random


def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)

        # Evaluate the objective function
        current_score = objective_function(current_params)

        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params

    return best_params, best_score


gt_val = df_validation.to_dict(orient='records')


In [None]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [None]:
param_ranges = {
    'exercise_name': (0.0, 3.0),
    'type_of_activity': (0.0, 3.0),
    'type_of_equipment': (0.0, 3.0),
    'body_part': (0.0, 3.0),
    'type': (0.0, 3.0),
    'muscle_groups_activated': (0.0, 3.0),
    'instructions': (0.0, 3.0),
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [None]:
simple_optimize(param_ranges, objective, n_iterations=20)