In [1]:
import pandas as pd
import minsearch
import json

from tqdm.auto import tqdm
from openai import OpenAI
from elasticsearch import Elasticsearch

  from .autonotebook import tqdm as notebook_tqdm


## Load Documents

In [3]:
# Load the FAQ claims dataset; the path is relative to the kernel's working directory.
# NOTE(review): a later cell reloads the data from 'notebooks/claims.csv' — confirm which path is canonical.
df = pd.read_csv('../notebooks/data/claims.csv')

In [4]:
# One dict per CSV row, keyed by column name (category, question, answer, section).
documents = df.to_dict(orient='records')

In [5]:
# Spot-check a single record to see the available fields.
documents[190]

{'category': 'Low income',
 'question': 'What is the Help to Save scheme?',
 'answer': 'Help to Save is a government savings account for people on low incomes offering bonuses on your savings.',
 'section': 'general claim benefits'}

In [7]:
# Ground-truth questions used by the final cells of the notebook
# (`rag(ground_truth[10])` and `ground_truth[10]`); without this load those
# cells raise NameError on a fresh kernel.
# Each record is expected to carry 'question', 'section' and 'document' keys.
# NOTE(review): the path is relative to the kernel's working directory — confirm it resolves.
df_ground_truth = pd.read_csv('notebooks/ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [8]:
# ground_truth[190]

## Minsearch

In [9]:
# Build an in-memory minsearch index over the FAQ records.
# question/answer/category are registered as text fields; section is registered
# as a keyword field and is used as a filter in `search`.
index = minsearch.Index(
    text_fields=["question", "answer", "category"],
    keyword_fields=["section"]
)
# fit() is the last expression, so the cell echoes the Index object it returns.
index.fit(documents)

<minsearch.Index at 0x7dc72c1b5640>

In [10]:
# Sample question reused by the minsearch demo cell.
q = 'What is the Help to Save scheme?'

## Implement an LLM that accepts a query

In [11]:
# client = OpenAI()
# response = client.chat.completions.create(
#     model='gpt-4o',
#     messages=[{"role": "user", "content": q}]
# )

# response.choices[0].message.content

## Perform a search of the documents

In [12]:
def search(query, section='general claim benefits', num_results=10):
    """Run a boosted minsearch lookup restricted to a single section.

    Args:
        query: Free-text user question.
        section: Value the `section` keyword field is filtered on. Defaults to
            'general claim benefits', the value that used to be hard-coded.
        num_results: Maximum number of records requested from the index
            (default 10, the previous fixed value).

    Returns:
        Whatever `index.search` returns; in this notebook that is a list of
        matching record dicts.
    """
    # Question matches are boosted (3.0) and answer matches down-weighted (0.5);
    # category gets no explicit boost.
    boost = {'question': 3.0, 'answer': 0.5}

    return index.search(
        query=query,
        filter_dict={'section': section},
        boost_dict=boost,
        num_results=num_results,
    )

In [13]:
# Try the minsearch retrieval on the sample question.
search(q)

[{'category': 'Low income',
  'question': 'What is the Help to Save scheme?',
  'answer': 'Help to Save is a government savings account for people on low incomes offering bonuses on your savings.',
  'section': 'general claim benefits'},
 {'category': 'Looking for work',
  'question': 'What is the Restart Scheme?',
  'answer': 'The Restart Scheme offers tailored support to help long-term unemployed people find work.',
  'section': 'general claim benefits'},
 {'category': 'Looking for work',
  'question': 'What is the Kickstart Scheme?',
  'answer': 'The Kickstart Scheme provides funding to employers to create jobs for young people.',
  'section': 'general claim benefits'},
 {'category': 'Disabled or health condition',
  'question': 'What is the Motability Scheme?',
  'answer': 'The Motability Scheme helps disabled people lease a car scooter or powered wheelchair.',
  'section': 'general claim benefits'},
 {'category': 'Disabled or health condition',
  'question': 'What is the Blue Badg

## RAG flow
- Building a prompt

In [14]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and the retrieved FAQ records.

    Every record in `search_results` must provide 'category', 'question',
    'answer' and 'section'. Each record is rendered as a four-line entry
    followed by a blank line, and the entries are placed under CONTEXT in the
    order given. Surrounding whitespace is stripped from the finished prompt.
    """
    instructions = (
        "You are an expert in United Kingdom Benefit Claims and Medical Negligence Claims. "
        "Answer the QUESTION based on the CONTEXT from \n"
        "the FAQ databases of Benefits database and NHS claims management. \n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    entries = []
    for record in search_results:
        entries.append(
            f"category: {record['category']}\n"
            f"question: {record['question']}\n"
            f"answer: {record['answer']}\n"
            f"section: {record['section']}\n\n"
        )

    return instructions.format(question=query, context="".join(entries)).strip()

In [15]:
# Shared OpenAI client used by `llm`; no credentials are passed explicitly here.
# NOTE(review): presumably the API key is read from the environment — confirm.
client = OpenAI()

def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text (e.g. the output of `build_prompt`).
        model: Chat model name. Defaults to 'gpt-4o-mini', the value that was
            previously hard-coded, so existing `llm(prompt)` calls are unchanged.

    Returns:
        The `content` of the first choice in the chat completion response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [16]:
# Sample question passed to `rag` in the following cell.
query = "What is the Help to Save scheme?"

def rag(query):
    """Answer `query` end to end: minsearch retrieval, prompt building, LLM call."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [None]:
# Run the minsearch-backed RAG pipeline on the sample question (calls the OpenAI API).
rag(query)

## Text Search: Elasticsearch

#### Indexing and mapping in Elasticsearch

In [55]:
# Client for the Elasticsearch instance expected to be listening on localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

In [56]:
# Index definition for keyword/text search: one shard, no replicas.
# answer/category/question are full-text fields; section is a keyword field
# so it can be used in the `term` filter of `elastic_search`.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "answer": {"type": "text"},
            "category": {"type": "text"},
            "question": {"type": "text"},
            "section": {"type": "keyword"} 
        }
    }
}

index_name = "benefit-claims"

# Delete any existing index with this name before creating it, so the cell can be re-run;
# ignore_unavailable=True is passed so a missing index is not treated as an error.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'benefit-claims'})

In [57]:
# Index the records one request at a time; tqdm renders the progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 425/425 [00:08<00:00, 50.33it/s]


In [58]:
def elastic_search(query, section="general claim benefits", size=5):
    """Full-text search over the Elasticsearch index, restricted to one section.

    Args:
        query: Free-text user question.
        section: Value used in the `term` filter on the `section` field.
            Defaults to 'general claim benefits', the value that used to be
            hard-coded.
        size: Maximum number of hits requested (default 5, the previous fixed value).

    Returns:
        A list with the `_source` dict of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # `^3` boosts matches in the question field.
                        "fields": ["question^3", "answer", "category"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "section": section
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [59]:
def rag(query):
    """Answer `query` using Elasticsearch text search for retrieval.

    Redefines the earlier minsearch-based `rag`: the hits from
    `elastic_search` are turned into a prompt and sent to the LLM.
    """
    return llm(build_prompt(query, elastic_search(query)))

In [60]:
# Same sample question, now answered through the Elasticsearch-backed `rag`.
query = "What is the Help to Save scheme?"
rag(query)

'The Help to Save scheme is a government savings account designed for people on low incomes, offering bonuses on your savings.'

## Vector Search 

In [61]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode both the indexed records and the incoming questions.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [62]:
# Length of one embedding; the dense_vector mapping declares the same size (384).
len(model.encode("Getting size of model dim"))

384

In [63]:
# Reload the CSV and rebuild `documents`, replacing the objects loaded at the top of the notebook.
# NOTE(review): this path ('notebooks/claims.csv') differs from the one used by the first
# load ('../notebooks/data/claims.csv') — confirm both resolve from the same working directory.
df = pd.read_csv('notebooks/claims.csv')
documents = df.to_dict(orient='records')

## Mapping and Index

In [64]:
# Index definition for vector search: the text-search fields plus an `id` keyword
# and a 384-dimensional dense vector compared with cosine similarity.
# NOTE(review): the records displayed in this notebook have no 'id' key — confirm it is populated.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "answer": {"type": "text"},
            "category": {"type": "text"},
            "question": {"type": "text"},
            "section": {"type": "keyword"},
            "id": {"type": "keyword"},
            # Holds the embedding of "question + ' ' + answer" written by the indexing loop.
            "question_answer_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

# Same index name as the text-search index: that index is deleted and recreated here.
index_name = "benefit-claims"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'benefit-claims'})

In [65]:
# Embed each record (question and answer joined by a space) and index it.
# Note: this adds 'question_answer_vector' to the dicts in `documents` in place.
for doc in tqdm(documents):
    question = doc['question']
    answer = doc['answer']
    doc['question_answer_vector'] = model.encode(question + ' ' + answer)

    es_client.index(index=index_name, document=doc)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 425/425 [00:16<00:00, 25.66it/s]


In [66]:
def elastic_search_knn(field, vector, section):
    """kNN search on a dense-vector field, filtered to one section.

    Args:
        field: Name of the vector field to search.
        vector: Query embedding passed as `query_vector`.
        section: Value used in the `term` filter on the `section` field.

    Returns:
        A list with the `_source` dict of every hit (limited to the answer,
        section, question, category and id fields), in response order.
    """
    section_filter = {"term": {"section": section}}

    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "filter": section_filter,
        },
        "_source": ["answer", "section", "question", "category", "id"],
    }

    response = es_client.search(index=index_name, body=request_body)

    return [hit['_source'] for hit in response['hits']['hits']]

In [67]:
def question_answer_vector_knn(q):
    """Embed the question in `q` and run a kNN search within its section.

    `q` must provide 'question' and 'section'. The question text is encoded
    with `model` and searched against the 'question_answer_vector' field.
    """
    question_text, section_name = q['question'], q['section']
    query_embedding = model.encode(question_text)
    return elastic_search_knn('question_answer_vector', query_embedding, section_name)

## Perform a Vector Search

In [68]:
# Try the vector search with a question/section pair shaped like the records `rag` receives.
question_answer_vector_knn(dict(
    question='What is the Local Welfare Assistance scheme?',
    section='general claim benefits'
))

[{'question': 'What is the Local Welfare Assistance scheme?',
  'answer': 'The Local Welfare Assistance scheme provides emergency financial help often for things like food clothing and utilities.',
  'section': 'general claim benefits',
  'category': 'Low income'},
 {'question': 'What is the Help to Save scheme?',
  'answer': 'Help to Save is a government savings account for people on low incomes offering bonuses on your savings.',
  'section': 'general claim benefits',
  'category': 'Low income'},
 {'question': 'What is the Hardship Fund?',
  'answer': 'The Hardship Fund provides financial assistance to those in urgent need often administered by local councils.',
  'section': 'general claim benefits',
  'category': 'Low income'},
 {'question': 'What is the Restart Scheme?',
  'answer': 'The Restart Scheme offers tailored support to help long-term unemployed people find work.',
  'section': 'general claim benefits',
  'category': 'Looking for work'},
 {'question': 'What is the Healthy 

## RAG flow with Vector Search

In [69]:
def build_prompt(query, search_results):
    """Render the question and the retrieved records into the final LLM prompt.

    Note: this redefinition shadows the `build_prompt` defined earlier in the
    notebook and produces the same text.

    Each record in `search_results` must contain 'category', 'question',
    'answer' and 'section'. Records are listed under CONTEXT in the given
    order, and the result is stripped of surrounding whitespace.
    """
    template_lines = [
        "You are an expert in United Kingdom Benefit Claims and Medical Negligence Claims. Answer the QUESTION based on the CONTEXT from ",
        "the FAQ databases of Benefits database and NHS claims management. ",
        "Use only the facts from the CONTEXT when answering the QUESTION.",
        "",
        "QUESTION: {question}",
        "",
        "CONTEXT: ",
        "{context}",
    ]
    prompt_template = "\n".join(template_lines)

    context = "".join(
        f"category: {hit['category']}\nquestion: {hit['question']}\n"
        f"answer: {hit['answer']}\nsection: {hit['section']}\n\n"
        for hit in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [70]:
def llm(prompt, model='gpt-4o-mini'):
    """Ask the chat model a single-turn question and return its text reply.

    `prompt` is sent as the only user message; `model` selects the chat model.
    The content of the first returned choice is handed back to the caller.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [71]:
def rag(query: dict, model='gpt-4o-mini') -> str:
    """Answer a question record using vector search for retrieval.

    `query` must provide 'question' and 'section' (both are read by
    `question_answer_vector_knn`). The retrieved records and the question text
    are turned into a prompt, which is sent to the chat model named by `model`.
    """
    retrieved_docs = question_answer_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved_docs), model=model)

In [72]:
# Requires `ground_truth` (a list of dicts with 'question' and 'section' keys) to be defined in the kernel.
rag(ground_truth[10])

'Yes, it is possible to appeal a decision regarding benefit claims. You can appeal within one month of the decision notice by requesting a mandatory reconsideration. If necessary, you can then appeal to a tribunal, particularly for decisions such as Employment and Support Allowance (ESA). If your application is rejected, you also have the option to request a mandatory reconsideration or appeal the decision.'

In [74]:
# Show the ground-truth record that was just answered, for comparison with the reply.
ground_truth[10]

{'question': 'Is it possible to appeal?',
 'section': 'general claim benefits',
 'document': '8d000ade-6c2b-571c-aa61-5d38eb463cf8'}