In [1]:
import json
import os
from dotenv import load_dotenv
load_dotenv()
PROJECT_ID = os.getenv("PROJECT_ID")
SPACE_ID = os.getenv("SPACE_ID")
IBM_CLOUD_APIKEY = os.getenv("IBM_CLOUD_APIKEY")
IBM_CLOUD_REGION = os.getenv("IBM_CLOUD_REGION")
COS_ASSET_ID = os.getenv("COS_ASSET_ID")
COS_BUCKET_NAME = os.getenv("COS_BUCKET_NAME")
ESUSER = os.getenv("ESUSER")
ESPASSWORD = os.getenv("ESPASSWORD")
ESHOST = os.getenv("ESHOST")
ESPORT = os.getenv("ESPORT")

import ibm_watsonx_ai
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai import Credentials

credentials = Credentials(
                   url = "https://us-south.ml.cloud.ibm.com",
                   api_key = IBM_CLOUD_APIKEY,
                  )

client = APIClient(credentials)
client.set.default_space(SPACE_ID)
client.set.default_project(PROJECT_ID)

Unsetting the space_id ...


'SUCCESS'

In [None]:

#client.foundation_models.get_embeddings_model_specs()
conns = client.connections
conn_list = conns.list()
conns.get_details(connection_id=None)


In [3]:
from elasticsearch import AsyncElasticsearch
from llama_index.core import VectorStoreIndex, PromptTemplate, Settings
from llama_index.vector_stores.elasticsearch import ElasticsearchStore

In [4]:
import re
from ibm_watsonx_ai.helpers.connections import DataConnection, S3Location
from langchain_core.documents import Document

def preprocess_text(lines):
    processed_lines = []
    length = len(lines)
    
    for i in range(length):
        line = lines[i].strip()
        if line:  # check if the line is not empty
            if i + 1 < length and lines[i + 1].strip():  # Next line exists and is not empty
                if not re.search(r'[.!?]$', line):  # If line does not end with punctuation
                    line += ','
            elif not re.search(r'[.!?]$', line):  # Next line does not exist or is empty
                line += '.'
        processed_lines.append(line)
    
    return '\n'.join(processed_lines)

docs_to_retrieve = ["homepage", "methotrexate", "epipen", "folate", "ibuprofen", "prescriptionrefill", "appointment", "ra"]
metadata = [{"name": "homepage", "source": "https://corp.nhg.com.sg/Pages/Digital%20Version%202022-2023/directory.pdf"},
            {"name": "methotrexate", "source": "https://www.ttsh.com.sg/Patients-and-Visitors/Medical-Services/Pharmacy/Documents/Pharmacy/PIL/PIL_by_Drug/Methotrexate_RAI_.pdf"},
            {"name": "epipen", "source": "https://www.ttsh.com.sg/Patients-and-Visitors/Medical-Services/Pharmacy/Documents/Pharmacy/PIL/PIL_by_Drug/Adrenaline_Autoinjector.pdf"},
            {"name": "folate", "source": "https://www.ttsh.com.sg/Patients-and-Visitors/Medical-Services/Pharmacy/Documents/Pharmacy/PIL/PIL_by_Drug/Folic_Acid.pdf"},
            {"name": "ibuprofen", "source": "https://www.healthdirect.gov.au/ibuprofen"},
            {"name": "prescriptionrefill", "source": "https://www.ttshpharmezy.nhg.com.sg/prescription"},
            {"name": "appointment", "source": "https://www.ttsh.com.sg/Patients-and-Visitors/Your-Clinic-Visit/Pages/Appointments.aspx"},
            {"name": "ra", "source":"https://www.ttsh.com.sg/Patients-and-Visitors/Pages/Find-Conditions-and-Treatments-Details.aspx?condition=Rheumatoid-Arthritis"}]
dataset_list = []
for i in range(len(docs_to_retrieve)):
    training_data_references = DataConnection(
        connection_asset_id=COS_ASSET_ID,
        location=S3Location(bucket=COS_BUCKET_NAME,   # note: COS bucket name where training dataset is located
                            path=docs_to_retrieve[i]+".csv",  # note: path within bucket where your training dataset is located
                            )
    )
    training_data_references.set_client(client)
    dataset = training_data_references.read(encoding='utf-8')
    documents = dataset['0'].fillna('\n').tolist()
    processed_text = preprocess_text(documents)
    document =Document(metadata=metadata[i], page_content=processed_text)
    print(document)
    dataset_list.append(document)


page_content='TAN TOCK SENG HOSPITAL PTE LTD,\nAddress : 11 Jalan Tan Tock Seng Singapore 308433,\nTelephone: 62566011,\nWebsite: https://www.ttsh.com.sg.\n\nTan Tock Seng Hospital (TTSH) is the \uf0dfagship hospital of NHG and part of Singapore’s public healthcare system. As a pioneering hospital with strong roots in the community for over 175 years, TTSH is recognised as the People’s Hospital, serving a resident population of 1.4 million living in Central Singapore.\nTogether, with 70 community partners and 80 community health posts, it brings care beyond the hospital into,\nthe community as an Integrated Care Organisation — Central Health.\nAs one of the largest multidisciplinary hospitals in Singapore, TTSH operates more than 1,700 beds,\nwith centres of excellence including the National Centre for Infectious Diseases (NCID), Institute for Geriatrics & Active Ageing (IGA), NHG Eye Institute (NHGEI), TTSH Rehabilitation Centre, and Ang Mo Kio Specialist Centre (AMKSC).\nTTSH’s 600-b

In [None]:
# #turn ttsh.txt into a csv file
import pandas as pd
#for every fil in /documents folder
#read in the file
#split the file into lines
#write the lines to a csv file
#write the csv file to the data connection
training_data_references.set_client(client)

for filename in os.listdir('documents'):
    
    filename = filename.split('.')[0]
    with open('documents/'+filename + ".txt", 'r') as file:
        data = file.read()
    data = data.split('\n')
    df = pd.DataFrame(data)
    df.to_csv('documents/'+filename + '.csv', index=False)
    print(filename)
    training_data_references.write(data='documents/'+filename + '.csv', remote_name= filename + '.csv') #writing works



document splitting

In [5]:
dataset_list

[Document(page_content='TAN TOCK SENG HOSPITAL PTE LTD,\nAddress : 11 Jalan Tan Tock Seng Singapore 308433,\nTelephone: 62566011,\nWebsite: https://www.ttsh.com.sg.\n\nTan Tock Seng Hospital (TTSH) is the \uf0dfagship hospital of NHG and part of Singapore’s public healthcare system. As a pioneering hospital with strong roots in the community for over 175 years, TTSH is recognised as the People’s Hospital, serving a resident population of 1.4 million living in Central Singapore.\nTogether, with 70 community partners and 80 community health posts, it brings care beyond the hospital into,\nthe community as an Integrated Care Organisation — Central Health.\nAs one of the largest multidisciplinary hospitals in Singapore, TTSH operates more than 1,700 beds,\nwith centres of excellence including the National Centre for Infectious Diseases (NCID), Institute for Geriatrics & Active Ageing (IGA), NHG Eye Institute (NHGEI), TTSH Rehabilitation Centre, and Ang Mo Kio Specialist Centre (AMKSC).\nTT

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50) #how to make splitting better semantically?
texts = text_splitter.split_documents(dataset_list)
#get page_content from each text

In [7]:
print(texts[0])

page_content='TAN TOCK SENG HOSPITAL PTE LTD,\nAddress : 11 Jalan Tan Tock Seng Singapore 308433,\nTelephone: 62566011,\nWebsite: https://www.ttsh.com.sg.' metadata={'name': 'homepage', 'source': 'https://corp.nhg.com.sg/Pages/Digital%20Version%202022-2023/directory.pdf'}


In [None]:
client.connections.get_details(connection_id=COS_ASSET_ID)

In [None]:
esuser =  ESUSER
espassword = ESPASSWORD
eshost = ESHOST
esport = ESPORT
es_ssl_fingerprint = !openssl s_client -connect $eshost:$esport  -showcerts </dev/null 2>/dev/null | openssl x509 -fingerprint -sha256 -noout -in /dev/stdin
es_ssl_fingerprint = es_ssl_fingerprint[0].split("=")[1]
from elasticsearch import Elasticsearch

es_client = Elasticsearch(
    f'https://{eshost}:{esport}',
    basic_auth=(esuser, espassword),
    verify_certs=False,
    request_timeout=3600,
    ssl_assert_fingerprint=es_ssl_fingerprint
)
es_client.info()

In [11]:
model_id = '.elser_model_2'
model_schema = {"input": {"field_names": ['text']}}
es_client.ml.get_trained_models(
            model_id='.elser_model_2')

deployment_id = model_id
existing_deployments = (
    es_client.ml.get_trained_models_stats(model_id=model_id)
    .body["trained_model_stats"][0]
    .get("deployment_stats")
)
if (
    existing_deployments
    and existing_deployments.get("deployment_id") == deployment_id
):
    print(f"{model_id} model deployment with the same name already exists.")
else:
    print(
        f"Creating {model_id} model deployment with deployment id {deployment_id}..."
    )
    es_client.ml.start_trained_model_deployment(
        model_id=model_id, deployment_id=deployment_id
    )

# es_client.ingest.put_pipeline(
#     id='elser-ingest-pipeline',
#     processors=[
#         {
#             'inference': {
#                 'model_id': model_id,
#                 'input_output': [
#                     {
#                         'input_field': 'text',
#                         'output_field': 'elser_embedding',
#                     }
#                 ]
#             }
#         }
#     ]
# )
# try:
#     es_client.indices.delete(index='my_documents', ignore_unavailable=True)
#     es_client.indices.create(
#             index='my_documents',
#             mappings={
#                 'properties': {
#                     'embedding': {
#                         'type': 'dense_vector',
#                     },
#                     'elser_embedding': {
#                         'type': 'sparse_vector',
#                     },
#                 }
#             },
#             settings={
#                 'index': {
#                     'default_pipeline': 'elser-ingest-pipeline'
#                 }
#             }
#         )
#     print("Index created successfully.")
# except:
#     print("Index already exists.")


# operations = []
# for document in documents:
#     operations.append({'index': {'_index': 'my_documents'}})
#     operations.append({'text': document})
# es_client.bulk(operations=operations, index='my_documents', pipeline='elser-ingest-pipeline', refresh=True)


# print("Document ingested successfully.")

.elser_model_2 model deployment with the same name already exists.


In [12]:
es_client.ingest.put_pipeline(
    id='elser-ingest-pipeline',
    processors=[
        {
            'inference': {
                'model_id': model_id,
                'input_output': [
                    {
                        'input_field': 'page_content',
                        'output_field': 'elser_embedding',
                    }
                ]
            }
        }
    ]
)
try:
    es_client.indices.delete(index='my_documents1', ignore_unavailable=True)
    es_client.indices.create(
        index='my_documents1',
        body={
            'mappings': {
                'properties': {
                    'embedding': {
                        'type': 'dense_vector',
                    },
                    'elser_embedding': {
                        'type': 'sparse_vector',
                    },
                    'metadata': {
                        'properties': {
                            'name': {
                                'type': 'keyword'
                            },
                            'source': {
                                'type': 'keyword'
                            }
                        }
                    }
                }
            },
            'settings': {
                'index': {
                    'default_pipeline': 'elser-ingest-pipeline'
                }
            }
        }
    )
    print("Index created successfully.")
except Exception as e:
    print(f"Error creating index: {e}")


# Prepare the bulk indexing operations
operations = []
for doc in texts:
    operations.append({'index': {'_index': 'my_documents1'}})
    operations.append({
        'page_content': doc.page_content,
        'metadata': doc.metadata
    })

# Perform the bulk indexing
es_client.bulk(operations=operations, index='my_documents1', pipeline='elser-ingest-pipeline', refresh=True)


Index created successfully.


ObjectApiResponse({'errors': False, 'took': 513, 'ingest_took': 34096, 'items': [{'index': {'_index': 'my_documents1', '_id': 'xNYwkZABuavOP9ZInwcg', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 0, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_documents1', '_id': 'xdYwkZABuavOP9ZInwcg', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 1, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_documents1', '_id': 'xtYwkZABuavOP9ZInwcg', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 2, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_documents1', '_id': 'x9YwkZABuavOP9ZInwcg', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 3, '_primary_term':

In [13]:
parsed_query = "how do i book an appointment?"
response = es_client.search(
        index='my_documents1', 
        body={
            'query': {
                'bool': {
                    'must': [
                        {
                            'text_expansion': {
                                'elser_embedding': {
                                    'model_id': '.elser_model_2',
                                    'model_text': parsed_query,
                                }
                            },
                        }
                    ],
                }
            },
            'aggs': {
                'category-agg': {
                    'terms': {
                        'field': 'category.keyword',
                    }
                },
            },
            'size': 5,
            "min_score": 8 ,
        },
        
)

In [14]:
response.body

{'took': 224,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2, 'relation': 'eq'},
  'max_score': 12.266482,
  'hits': [{'_index': 'my_documents1',
    '_id': 'AdYwkZABuavOP9ZInwgh',
    '_score': 12.266482,
    '_ignored': ['page_content.keyword'],
    '_source': {'elser_embedding': {'cancel': 1.5640881,
      '00': 0.20025246,
      '##b': 1.0955287,
      'weekends': 0.26215824,
      '03': 0.025296943,
      'pt': 0.18588853,
      '##h': 0.817397,
      'bring': 0.19405667,
      'your': 0.07379847,
      '##p': 0.0504971,
      'identification': 0.9596832,
      'jan': 0.48785138,
      'pension': 0.03968984,
      'zealand': 0.62268114,
      'tel': 0.2840549,
      'welcome': 0.14226456,
      'mauritius': 0.68640035,
      'app': 0.78509635,
      'appointments': 0.9720861,
      'australia': 0.38164002,
      'benefit': 1.1089985,
      'morning': 0.08472396,
      'badge': 0.22395556,
      'cancellatio

In [19]:
response.get('hits').get('hits')[0].get('_source').get('metadata')['source']

'https://www.ttsh.com.sg/Patients-and-Visitors/Your-Clinic-Visit/Pages/Appointments.aspx'

In [15]:
for i in response.get('hits').get('hits'):
    print(i.get('_source').get('page_content'))
    print(i.get('_source').get('metadata')['source'])
    print('---')
    i.append(list(rag_info[0]+"\nSource:\n"+rag_info[1]))

-	Referral letter (if any),
-	Any medical benefit identification documents (e.g. Civil Service Card, Medical Benefits Card, etc),
-	Update or cancel appointment,
-	To cancel or update your appointment, please call 6357 7000 or email Contact@ttsh.com.sg . Calls are available during these times Mondays to Fridays 8.00 am to 5.00 and Saturdays 8.00 am to 12.00 pm.
-	You may download HealthHub app to access health information and services conveniently from your mobile device.
https://www.ttsh.com.sg/Patients-and-Visitors/Your-Clinic-Visit/Pages/Appointments.aspx
---


AttributeError: 'dict' object has no attribute 'append'

In [30]:
# import pandas as pd
# import re
# import json

# df = pd.read_csv('medDataset_processed.csv')
# # Assuming df has 'Input' and 'Output' columns

# # Randomly select 10000 rows
# df = df.sample(n=10000, random_state=1)  # using random_state for reproducibility

# # Drop index
# df = df.reset_index(drop=True)

# # Drop NA values
# df = df.dropna()

# # Drop duplicates
# df = df.drop_duplicates()

# # Drop rows with empty strings
# df = df[(df['input'] != '') & (df['output'] != '')]

# # Check characters are identifiable
# df = df[df['input'].apply(lambda x: all(char.isprintable() for char in x))]

# # Check characters are letters, numbers, punctuation, whitespace or \n only
# pattern = re.compile(r'^[\w\s.,;:!?()\"\'\-\n]+$')

# def check_characters(text):
#     return bool(pattern.match(text))

# df = df[df['input'].apply(check_characters)]

# # Convert the DataFrame to a list of dictionaries
# data = df.to_dict(orient='records')

# # Save the list of dictionaries as a JSON file
# with open('medDataset_processed.json', 'w') as f:
#     json.dump(data, f, indent=2)

# print("JSON file created successfully.")


JSON file created successfully.


In [20]:
from ibmcloudant.cloudant_v1 import CloudantV1, Document
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_cloud_sdk_core import ApiException

import os
from dotenv import load_dotenv
load_dotenv()
IBM_CLOUD_APIKEY = os.getenv("IBM_CLOUD_APIKEY")
CLOUDANT_URL = os.getenv("CLOUDANT_URL")
url = CLOUDANT_URL

authenticator = IAMAuthenticator(IBM_CLOUD_APIKEY)

service = CloudantV1(authenticator=authenticator)

service.set_service_url(url)


In [21]:
# Create the patient_info database if it doesn't exist
db_name = "patient_info"

try:
    put_database_result = service.put_database(db=db_name).get_result()
    if put_database_result["ok"]:
        print(f'"{db_name}" database created.')
except ApiException as ae:
    if ae.status_code == 412:
        print(f'Cannot create "{db_name}" database, it already exists.')

# Create a sample document for patient info
patient_id = 1
visit_id = 1

patient_document = {
    "patient_id": patient_id,
    "nric": "S1234567A",
    "first_name": "Jane",
    "last_name": "Doe",
    "email": "janeDoe@gmail.com",
    "additional_info": ["smoker", "seafood allergy", "nut allergy", "pregnant"],
    "visits": [
        {
            "visit_id": visit_id,
            "prescription_info": [
                "Methotrexate, Dosage: 15 mg, Frequency: once a week, Instructions: Start with a low dose and monitor for side effects",
                "Ibuprofen, Dosage: 400 mg, Frequency: as needed, Instructions: for pain and swelling, not to exceed three times a day",
                "Folic acid, Dosage: 1 mg, Frequency: daily, Instructions: except on the day of Methotrexate, to reduce side effects"
            ],
            "visit_info": [
                "Rheumatoid Arthritis: Patient complains of persistent joint pain and stiffness, particularly in the hands and wrists, worsening in the morning lasting more than one hour. Doctors Remarks: Examination shows symmetrical joint swelling and tenderness in the wrists and MCP (metacarpophalangeal) joints. Symptoms and physical findings suggest an inflammatory process. Rheumatoid factor and anti-CCP (anti-cyclic citrullinated peptide) antibodies tests ordered to confirm diagnosis, along with CRP (C-reactive protein) and ESR (erythrocyte sedimentation rate) to assess inflammation levels."
            ]
        }
    ]
}

# Save the document in the patient_info database
create_document_response = service.post_document(
    db=db_name,
    document=patient_document
).get_result()

print(f"Document inserted: {create_document_response['id']}")


"patient_info" database created.
Document inserted: ab55193c6d2f7ae673b5afb63464195d


In [22]:
# Query the patient_info database
selector = {
    "patient_id": patient_id,
    "visits": {
        "$elemMatch": {
            "visit_id": visit_id
        }
    }
}

response = service.post_find(
    db=db_name,
    selector=selector
).get_result()

if response['docs']:
    patient_doc = response['docs'][0]
    visit_info = next(visit for visit in patient_doc['visits'] if visit['visit_id'] == visit_id)
    print(f"""
    visit_id = {visit_id}
    - prescription_info[]: {visit_info['prescription_info']}
    - visit_info[]: {visit_info['visit_info']}

    patient_id = {patient_doc['patient_id']}
    - NRIC: {patient_doc['nric']}
    - first_name: {patient_doc['first_name']}
    - last_name: {patient_doc['last_name']}
    - email: {patient_doc['email']}
    - additional_info[]: {patient_doc['additional_info']}
    """)
else:
    print("No documents found.")



    visit_id = 1
    - prescription_info[]: ['Methotrexate, Dosage: 15 mg, Frequency: once a week, Instructions: Start with a low dose and monitor for side effects', 'Ibuprofen, Dosage: 400 mg, Frequency: as needed, Instructions: for pain and swelling, not to exceed three times a day', 'Folic acid, Dosage: 1 mg, Frequency: daily, Instructions: except on the day of Methotrexate, to reduce side effects']
    - visit_info[]: ['Rheumatoid Arthritis: Patient complains of persistent joint pain and stiffness, particularly in the hands and wrists, worsening in the morning lasting more than one hour. Doctors Remarks: Examination shows symmetrical joint swelling and tenderness in the wrists and MCP (metacarpophalangeal) joints. Symptoms and physical findings suggest an inflammatory process. Rheumatoid factor and anti-CCP (anti-cyclic citrullinated peptide) antibodies tests ordered to confirm diagnosis, along with CRP (C-reactive protein) and ESR (erythrocyte sedimentation rate) to assess inflam

In [None]:
"""
visit_id = 1
- prescription_info[]: [
  "Methotrexate, Dosage: 15 mg, Frequency: once a week, Instructions: Start with a low dose and monitor for side effects",
  "Ibuprofen, Dosage: 400 mg, Frequency: as needed, Instructions: for pain and swelling, not to exceed three times a day",
  "Folic acid, Dosage: 1 mg, Frequency: daily, Instructions: except on the day of Methotrexate, to reduce side effects"
]
- visit_info[]: [
  "Rheumatoid Arthritis: Patient complains of persistent joint pain and stiffness, particularly in the hands and wrists, worsening in the morning lasting more than one hour. Doctors Remarks: Examination shows symmetrical joint swelling and tenderness in the wrists and MCP (metacarpophalangeal) joints. Symptoms and physical findings suggest an inflammatory process. Rheumatoid factor and anti-CCP (anti-cyclic citrullinated peptide) antibodies tests ordered to confirm diagnosis, along with CRP (C-reactive protein) and ESR (erythrocyte sedimentation rate) to assess inflammation levels."
]

patient_id = 1
- NRIC: "S1234567A"
- first_name: "Jane"
- last_name: "Doe"
- email: "janeDoe@gmail.com"
- additional_info[] : [  "smoker",  "seafood allergy",  "nut allergy",  "pregnant"]

"""