In [1]:
import pandas as pd

## Ingestion

In [2]:
# Load the raw health-facility records from the local CSV export.
df = pd.read_csv('kenya_health_facilities.csv')

In [3]:
# Inspect column names, dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8932 entries, 0 to 8931
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Code                  8932 non-null   int64  
 1   Name                  8932 non-null   object 
 2   Registration_number   119 non-null    object 
 3   Keph level            8920 non-null   object 
 4   Facility type         8932 non-null   object 
 5   Owner                 8932 non-null   object 
 6   Regulatory body       8604 non-null   object 
 7   Beds                  8932 non-null   int64  
 8   Cots                  8932 non-null   int64  
 9   County                8932 non-null   object 
 10  Constituency          8932 non-null   object 
 11  Sub county            8932 non-null   object 
 12  Ward                  8932 non-null   object 
 13  Operation status      8932 non-null   object 
 14  Open_whole_day        8932 non-null   object 
 15  Open_public_holidays 

In [4]:
# Summary statistics for the numeric columns of the raw frame.
df.describe()

Unnamed: 0,Code,Beds,Cots,Service_names
count,8932.0,8932.0,8932.0,0.0
mean,16030.920958,6.094828,0.713726,
std,3489.301803,31.561165,6.121864,
min,10001.0,0.0,0.0,
25%,13110.75,0.0,0.0,
50%,15769.5,0.0,0.0,
75%,18904.0,2.0,0.0,
max,22998.0,1455.0,427.0,


In [5]:
# Drop duplicates
# Keeps the first row for each distinct `Name` (pandas' default keep='first').
# NOTE(review): rows are matched on `Name` alone, so distinct facilities that
# happen to share a name would also be removed — confirm this is intended.
df = df.drop_duplicates(subset='Name')

In [6]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-10-23 12:35:27--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py.3’


2024-10-23 12:35:27 (145 MB/s) - ‘minsearch.py.3’ saved [3832/3832]



In [7]:
# Normalise column names: lowercase, with spaces replaced by underscores
# (e.g. 'Keph level' becomes 'keph_level').
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [8]:
# Columns removed before indexing. Per the info()/describe() output earlier in
# the notebook, registration_number is filled for only 119 rows and
# service_names has no non-null values.
columns_to_drop = ['registration_number', 'service_names']

df = df.drop(labels=columns_to_drop, axis='columns')

In [9]:
# Keep only rows where both `regulatory_body` and `keph_level` are present.
df = df.dropna(subset=['regulatory_body', 'keph_level'])

In [10]:
# Re-check row count, columns and null counts after the cleaning steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8598 entries, 2 to 8931
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   code                  8598 non-null   int64 
 1   name                  8598 non-null   object
 2   keph_level            8598 non-null   object
 3   facility_type         8598 non-null   object
 4   owner                 8598 non-null   object
 5   regulatory_body       8598 non-null   object
 6   beds                  8598 non-null   int64 
 7   cots                  8598 non-null   int64 
 8   county                8598 non-null   object
 9   constituency          8598 non-null   object
 10  sub_county            8598 non-null   object
 11  ward                  8598 non-null   object
 12  operation_status      8598 non-null   object
 13  open_whole_day        8598 non-null   object
 14  open_public_holidays  8598 non-null   object
 15  open_weekends         8598 non-null   objec

In [11]:
# Expose the current row label as an explicit `id` column in first position.
# Rows were dropped earlier without resetting the index, so these ids are not
# contiguous (the info() output above starts at label 2).
# Not idempotent: running this cell a second time raises ValueError because
# the `id` column already exists.
df.insert(0, 'id', df.index)

In [12]:
# Confirm the final column set and order.
df.columns

Index(['id', 'code', 'name', 'keph_level', 'facility_type', 'owner',
       'regulatory_body', 'beds', 'cots', 'county', 'constituency',
       'sub_county', 'ward', 'operation_status', 'open_whole_day',
       'open_public_holidays', 'open_weekends', 'open_late_night', 'approved',
       'public_visible', 'closed'],
      dtype='object')

In [13]:
# Persist the cleaned table. index=False omits the row labels, which are
# already stored in the `id` column.
df.to_csv('kenya_health_facilities_clean.csv', index=False)

In [14]:
# Reload the cleaned table from the file written above.
df = pd.read_csv('kenya_health_facilities_clean.csv')

In [16]:
# One dict per row (column name -> value); this list is what gets indexed.
documents = df.to_dict(orient='records')

In [17]:
import minsearch

In [18]:
# Search index over the facility records. The descriptive columns are
# registered as text fields and `id` as a keyword field; `code`, `beds` and
# `cots` are not registered with the index.
index = minsearch.Index(
    text_fields=[
        'name',
        'keph_level',
        'facility_type',
        'owner',
        'regulatory_body',
        'county',
        'constituency',
        'sub_county',
        'ward',
        'operation_status',
        'open_whole_day',
        'open_public_holidays',
        'open_weekends',
        'open_late_night',
        'approved',
        'public_visible',
        'closed',
    ],
    keyword_fields=['id'],
)

In [19]:
# Fit the index on the facility documents (the call returns the index itself).
index.fit(documents)

<minsearch.Index at 0x7f3972407fd0>

## RAG Flow

In [20]:
from openai import OpenAI
import os

In [21]:
def search(query):
    """Return the index's top 10 matches for ``query``.

    The lookup goes through the module-level ``index`` with an empty filter
    and an empty boost mapping.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [23]:
# OpenAI client pointed at the Azure-hosted inference endpoint.
# The API key is read from the GITHUB_TOKEN environment variable; if it is
# not set, the lookup raises KeyError in this cell.
client = OpenAI(
    base_url="https://models.inference.ai.azure.com",
    api_key=os.environ["GITHUB_TOKEN"]
)

In [24]:

# Outer prompt: instructions followed by the user's question and the retrieved
# context. Written line by line so the exact whitespace is explicit.
prompt_template = (
    "You're a course healthcare information assistant. Answer the QUESTION based on the CONTEXT from the healthcare database. \n"
    "Use only the facts from the CONTEXT when answering the QUESTION.\n"
    "\n"
    "QUESTION: {question}\n"
    "\n"
    "CONTEXT: {context}"
)

# One retrieved facility rendered as "field: value" lines. It starts and ends
# with a newline, and the `ward` line carries a trailing space.
entry_template = (
    "\n"
    "name: {name}\n"
    "keph_level: {keph_level}\n"
    "facility_type: {facility_type}\n"
    "owner: {owner}\n"
    "regulatory_body: {regulatory_body}\n"
    "beds: {beds}\n"
    "cots: {cots}\n"
    "county: {county}\n"
    "constituency: {constituency}\n"
    "sub_county: {sub_county}\n"
    "ward: {ward} \n"
    "operation_status: {operation_status}\n"
    "open_whole_day: {open_whole_day}\n"
    "open_public_holidays: {open_public_holidays}\n"
    "open_weekends: {open_weekends}\n"
    "open_late_night: {open_late_night}\n"
    "approved: {approved}\n"
    "public_visible: {public_visible}\n"
    "closed: {closed}\n"
)


def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved documents.

    Each document in ``search_results`` is a mapping that must provide every
    placeholder used by ``entry_template``; extra keys are ignored. Entries
    are concatenated, each followed by two newlines, and placed in the
    CONTEXT slot of ``prompt_template``. The result is stripped of leading
    and trailing whitespace.
    """
    rendered_entries = [
        entry_template.format(**doc) + "\n\n" for doc in search_results
    ]
    context_block = "".join(rendered_entries)
    filled = prompt_template.format(question=query, context=context_block)
    return filled.strip()

In [67]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``client``; ``model`` selects the chat model. The
    returned value is the message content of the first choice in the response.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=model,
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [26]:
def rag(query):
    """Answer ``query`` end to end: retrieve, build the prompt, ask the LLM.

    Retrieval goes through ``search``, the prompt is assembled by
    ``build_prompt`` and the text returned by ``llm`` is passed back unchanged.
    """
    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs))

## Retrieval Evaluation 

In [28]:
# Load the ground-truth questions used for retrieval evaluation.
df_questions = pd.read_csv('ground_truth_retrieval.csv')

In [29]:
# Preview the first rows (columns: `id`, `question`).
df_questions.head()

Unnamed: 0,id,question
0,3,What is the name of the secondary care hospita...
1,3,How many beds are available at the facility in...
2,3,Is the hospital in Umoja open on public holidays?
3,3,In which constituency is the hospital located?
4,3,Is the hospital in Umoja visible to the public?


In [30]:
# One dict per question row; this is the list the evaluation helpers iterate over.
ground_truth = df_questions.to_dict(orient='records')

In [32]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one hit.

    Args:
        relevance_total: one sequence per query holding a boolean per
            returned result (True where the result is the expected document).

    Returns:
        A float between 0 and 1. An empty ``relevance_total`` yields 0.0
        instead of raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: one sequence per query holding a boolean per
            returned result, in ranking order.

    Returns:
        The average over all queries of 1 / rank of the first True entry
        (a query with no True entry contributes 0). An empty
        ``relevance_total`` yields 0.0 instead of raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first hit counts; without this break several
                # relevant results in one ranking would be summed and the
                # score could exceed 1.
                break

    return total_score / len(relevance_total)

In [33]:
def minsearch_search(query, boost=None):
    """Return the index's top 10 matches for ``query``.

    Args:
        query: free-text query passed to the module-level ``index``.
        boost: optional mapping of field name to weight, forwarded as
            ``boost_dict``. When omitted an empty mapping is used, which is
            what this function always did before the parameter existed.

    The optional ``boost`` matches the signature of the later redefinition of
    this function in the notebook, so callers get the same behaviour
    whichever of the two cells was executed last.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={} if boost is None else boost,
        num_results=10,
    )

In [34]:
from tqdm import tqdm

In [35]:
def evaluate(ground_truth, search_function):
    """Score ``search_function`` against the ground-truth records.

    Each record must carry an ``id``; ``search_function`` receives the whole
    record and returns result dicts that also carry an ``id``. A result is
    counted as relevant when the two ids are equal.

    Returns a dict with the ``hit_rate`` and ``mrr`` of the collected
    relevance lists.
    """
    def relevance_for(record):
        expected_id = record['id']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [36]:
# Baseline: search without boosts, scored on all ground-truth questions.
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

100%|██████████| 130/130 [00:01<00:00, 95.85it/s]


{'hit_rate': 0.5230769230769231, 'mrr': 0.42673992673992683}

In [37]:
# Positional split of the questions: the first 80 rows form the validation
# set used for boost tuning, the remaining rows are set aside as a test set.
# NOTE(review): `df_test` is not referenced again in the cells visible here.
df_validation = df_questions[:80]
df_test = df_questions[80:]

In [38]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll import scope

In [39]:
import random 

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximises ``objective_function``.

    Args:
        param_ranges: mapping of parameter name to ``(min_val, max_val)``.
            When both bounds are ints the parameter is sampled with
            ``randint`` (bounds inclusive); otherwise with ``uniform``.
        objective_function: callable taking the sampled parameter dict and
            returning a score; higher is better.
        n_iterations: number of random parameter sets to try.
        seed: optional seed. When given, sampling uses a private
            ``random.Random(seed)`` so repeated calls return the same result.
            When omitted, the module-level ``random`` generator is used, as
            before.

    Returns:
        ``(best_params, best_score)``. With ``n_iterations=0`` this is
        ``(None, float('-inf'))``.
    """
    # A private generator makes seeded runs reproducible without touching the
    # global random state.
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximising; use float('inf') and `<` to minimise

    for _ in range(n_iterations):
        # Draw one candidate value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Strict comparison: on ties the earliest candidate is kept.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [40]:
# Validation questions as a list of dicts, the shape `evaluate` iterates over.
gt_validation = df_validation.to_dict(orient='records')

In [41]:
# Unboosted baseline on the validation questions only.
evaluate(gt_validation, lambda q: minsearch_search(q['question']))

100%|██████████| 80/80 [00:00<00:00, 96.89it/s]


{'hit_rate': 0.6125, 'mrr': 0.5013690476190477}

In [42]:
def minsearch_search(query, boost=None):
    """Query the module-level ``index`` and return its top 10 results.

    ``boost`` is an optional mapping of field name to weight that is passed
    through as ``boost_dict``; leaving it out searches with an empty mapping.
    No filter is applied.
    """
    field_weights = {} if boost is None else boost

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=field_weights,
        num_results=10,
    )

In [43]:
# Search space for the boost tuning: every listed field is sampled from the
# same float interval (0.0, 3.0).
param_ranges = dict.fromkeys(
    [
        'name',
        'keph_level',
        'facility_type',
        'owner',
        'regulatory_body',
        'beds',
        'cots',
        'county',
        'constituency',
        'sub_county',
        'ward',
        'operation_status',
        'open_whole_day',
        'open_public_holidays',
        'open_weekends',
        'open_late_night',
        'approved',
        'public_visible',
        'closed',
    ],
    (0.0, 3.0),
)

def objective(boost_params):
    """Validation MRR obtained when searching with ``boost_params``.

    Runs ``evaluate`` over ``gt_validation`` with ``minsearch_search`` using
    the given field boosts and returns the ``mrr`` entry of the result.
    """
    scores = evaluate(
        gt_validation,
        lambda q: minsearch_search(q['question'], boost_params),
    )
    return scores['mrr']


In [45]:
# Random search over the boost ranges: 20 sampled boost sets, each scored by
# validation MRR. Returns (best boost dict, best MRR).
# NOTE(review): the sampling is not seeded here, so the result varies per run.
simple_optimize(param_ranges, objective, n_iterations=20)

100%|██████████| 80/80 [00:00<00:00, 93.91it/s]
100%|██████████| 80/80 [00:00<00:00, 97.20it/s]
100%|██████████| 80/80 [00:00<00:00, 93.93it/s]
100%|██████████| 80/80 [00:00<00:00, 97.41it/s]
100%|██████████| 80/80 [00:00<00:00, 96.43it/s]
100%|██████████| 80/80 [00:00<00:00, 96.18it/s] 
100%|██████████| 80/80 [00:00<00:00, 96.25it/s]
100%|██████████| 80/80 [00:00<00:00, 96.43it/s]
100%|██████████| 80/80 [00:00<00:00, 95.79it/s]
100%|██████████| 80/80 [00:00<00:00, 95.63it/s]
100%|██████████| 80/80 [00:00<00:00, 99.08it/s] 
100%|██████████| 80/80 [00:00<00:00, 95.23it/s]
100%|██████████| 80/80 [00:00<00:00, 94.07it/s]
100%|██████████| 80/80 [00:00<00:00, 93.86it/s]
100%|██████████| 80/80 [00:00<00:00, 100.06it/s]
100%|██████████| 80/80 [00:00<00:00, 97.46it/s]
100%|██████████| 80/80 [00:00<00:00, 95.05it/s]
100%|██████████| 80/80 [00:00<00:00, 95.96it/s]
100%|██████████| 80/80 [00:00<00:00, 98.09it/s]
100%|██████████| 80/80 [00:00<00:00, 99.92it/s] 


({'name': 2.7950534105260223,
  'keph_level': 2.7489453465667903,
  'facility_type': 1.1977931582083436,
  'owner': 0.81696566726373,
  'regulatory_body': 0.4931183900402659,
  'beds': 1.6647728378062763,
  'cots': 2.1129189199525023,
  'county': 0.1915983672418855,
  'constituency': 0.04662222203409139,
  'sub_county': 2.3364436616565283,
  'ward': 1.4940398577402014,
  'operation_status': 2.3978788328309935,
  'open_whole_day': 0.5873009214691289,
  'open_public_holidays': 2.26294800458026,
  'open_weekends': 0.8827119704348237,
  'open_late_night': 1.2935230325904739,
  'approved': 1.6423404738758443,
  'public_visible': 0.335607000093775,
  'closed': 0.8436428861937306},
 0.7685069444444445)

In [46]:
def minsearch_improved(query):
    """Return the index's top 10 matches for ``query`` using fixed field boosts.

    The weights are hard-coded. NOTE(review): they do not match the boost
    values recorded in the output of the random-search cell above, so they
    presumably come from a different run — confirm their origin.
    NOTE(review): 'beds' and 'cots' are not among the text fields the index
    is built with; presumably those two weights have no effect — verify
    against minsearch.
    """
    field_boosts = dict(
        name=2.50,
        keph_level=2.76,
        facility_type=0.28,
        owner=0.79,
        regulatory_body=0.45,
        beds=2.57,
        cots=2.16,
        county=2.06,
        constituency=1.62,
        sub_county=0.63,
        ward=0.13,
        operation_status=1.76,
        open_whole_day=0.45,
        open_public_holidays=0.29,
        open_weekends=0.36,
        open_late_night=0.60,
        approved=0.96,
        public_visible=0.91,
        closed=0.01,
    )

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=field_boosts,
        num_results=10,
    )

In [47]:
# Score the boosted search on the full ground-truth set.
# NOTE(review): this set contains the first 80 questions that form the
# validation split; if the boosts were tuned on that split, scoring on
# `df_test` alone would give a held-out estimate.
evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

100%|██████████| 130/130 [00:01<00:00, 96.94it/s]


{'hit_rate': 0.6923076923076923, 'mrr': 0.6619871794871794}

## RAG Evaluation

In [48]:
# Judge prompt: asks the model to grade how relevant a generated answer is to
# its question and to reply with a JSON object holding "Relevance" and
# "Explanation". The doubled braces are literal braces for str.format.
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [57]:
import json

In [None]:
# Run every ground-truth question through the RAG pipeline and have the LLM
# grade the answer. Each entry of `evaluations` is a
# (record, generated answer, parsed verdict dict) tuple.
#
# The loop no longer carries the `if id in evaluations: continue` check: it
# compared an integer id against a list of tuples, so it could never skip
# anything, and it shadowed the builtin `id`.
evaluations = []

for record in tqdm(ground_truth):
    question = record['question']
    answer_llm = rag(question)

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm,
    )
    # The judge is asked for bare JSON; json.loads raises if the reply is not
    # valid JSON, which stops the loop with the results collected so far
    # still available in `evaluations`.
    evaluation = json.loads(llm(prompt))

    evaluations.append((record, answer_llm, evaluation))

In [None]:
# Tabulate the (record, answer, evaluation) tuples, one row per question.
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

In [None]:
# Flatten the nested dicts into plain columns: `id` and `question` come from
# the ground-truth record, `relevance` and `explanation` from the judge reply.
df_eval['id'] = df_eval['record'].map(lambda rec: rec['id'])
df_eval['question'] = df_eval['record'].map(lambda rec: rec['question'])

df_eval['relevance'] = df_eval['evaluation'].map(lambda verdict: verdict['Relevance'])
df_eval['explanation'] = df_eval['evaluation'].map(lambda verdict: verdict['Explanation'])

In [None]:
# Remove the raw nested columns in place. Running this cell a second time
# raises KeyError because the columns are already gone.
del df_eval['record']
del df_eval['evaluation']

In [66]:
# Distribution of the judge's relevance labels.
df_eval.relevance.value_counts()

23