In [1]:
import numpy as np
import pandas as pd


In [2]:
!pip install -qU \
    langchain \
    openai \
    datasets \
    pinecone-client \
    tiktoken

In [3]:
!pip install -qU langchain-openai

In [4]:
!pip install -qU PyPDF2
import PyPDF2

In [5]:
import os
from langchain_openai import ChatOpenAI
from google.colab import userdata
from langchain.chat_models import ChatOpenAI
from datetime import datetime
import hashlib

In [6]:
# Get the current drive path.
drive_path = "/content/drive/MyDrive"

# Set the current directory to the specified path.
os.chdir(os.path.join(drive_path, "Colab Notebooks/MIMIC/mimicdata/mimic3"))

curr_path = os.getcwd()


In [7]:
# Load

df_disch = pd.read_csv(os.path.join(curr_path,"train_50.csv"))
df_disch.head()



Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,LABELS,length
0,7908,182396,admission date discharge date date of birth se...,287.5;45.13;584.9,105
1,11231,183363,admission date discharge date date of birth se...,272.4;96.71;401.9,106
2,3184,144347,admission date discharge date date of birth se...,530.81,117
3,24427,177066,admission date discharge date date of birth se...,V58.61;96.71;96.04;276.2,148
4,1262,183373,admission date discharge date service neurolog...,V58.61;96.71;401.9;414.01;244.9;427.31,156


In [8]:
def reformat(code, is_diag):
    """
        Put a period in the right place because the MIMIC-3 data files exclude them.
        Generally, procedure codes have dots after the first two digits,
        while diagnosis codes have dots after the first three digits.
    """
    code = ''.join(code.split('.'))
    if is_diag:
        if code.startswith('E'):
            if len(code) > 4:
                code = code[:4] + '.' + code[4:]
        else:
            if len(code) > 3:
                code = code[:3] + '.' + code[3:]
    else:
        code = code[:2] + '.' + code[2:]
    return code


In [9]:

df_diagnosis = pd.read_csv(os.path.join(drive_path, "Colab Notebooks/Capstone/CMS32_DESC_LONG_SHORT_DIAGNOSIS.csv"), usecols=[0, 1, 2])
df_diagnosis['DIAGNOSIS_CODE'] = df_diagnosis.apply(lambda row: reformat(row['DIAGNOSIS_CODE'], True), axis=1)
df_diagnosis.head()


Unnamed: 0,DIAGNOSIS_CODE,LONG_DESCRIPTION,SHORT_DESCRIPTION
0,1.0,Cholera due to vibrio cholerae,Cholera d/t vib cholerae
1,1.1,Cholera due to vibrio cholerae el tor,Cholera d/t vib el tor
2,1.9,"Cholera, unspecified",Cholera NOS
3,2.0,Typhoid fever,Typhoid fever
4,2.1,Paratyphoid fever A,Paratyphoid fever a


In [10]:
# prompt: filter df_diagnosis where LONG_DESCRIPTION contains appendix

df_appendix = df_diagnosis[df_diagnosis['LONG_DESCRIPTION'].str.contains('appendicitis', case=False, na=False)]
df_appendix

Unnamed: 0,DIAGNOSIS_CODE,LONG_DESCRIPTION,SHORT_DESCRIPTION
5684,540.0,Acute appendicitis with generalized peritonitis,Ac append w peritonitis
5685,540.1,Acute appendicitis with peritoneal abscess,Abscess of appendix
5686,540.9,Acute appendicitis without mention of peritonitis,Acute appendicitis NOS
5687,541.0,"Appendicitis, unqualified",Appendicitis NOS
5688,542.0,Other appendicitis,Other appendicitis


In [11]:
df_procedures = pd.read_csv(os.path.join(drive_path, "Colab Notebooks/Capstone/CMS32_DESC_LONG_SHORT_PROCEDURES.csv"), usecols=[0, 1, 2])
df_procedures['PROCEDURE_CODE'] = df_procedures['PROCEDURE_CODE'].astype(str)
df_procedures['PROCEDURE_CODE'] = df_procedures.apply(lambda row: reformat(row['PROCEDURE_CODE'], False), axis=1)
df_procedures.head()

Unnamed: 0,PROCEDURE_CODE,LONG_DESCRIPTION,SHORT_DESCRIPTION
0,1.0,Therapeutic ultrasound of vessels of head and ...,Ther ult head & neck ves
1,2.0,Therapeutic ultrasound of heart,Ther ultrasound of heart
2,3.0,Therapeutic ultrasound of peripheral vascular ...,Ther ult peripheral ves
3,9.0,Other therapeutic ultrasound,Other therapeutic ultsnd
4,10.0,Implantation of chemotherapeutic agent,Implant chemothera agent


In [12]:
from tqdm import tqdm

# Open the PDF file in read-binary mode
with open('/content/drive/MyDrive/Colab Notebooks/Capstone/icd9cm_guidelines_2024.pdf', 'rb') as file:

    # Create a PDF reader object
    reader = PyPDF2.PdfReader(file)

    # Initialize an empty string to hold the text
    icd_guidelines = ''
    # Loop through each page in the PDF file
    for page in tqdm(reader.pages):
        text = page.extract_text()
        icd_guidelines += '\n\n' + text[text.find(' ]')+2:]

# Print the final string containing all the text from the PDF file
icd_guidelines = icd_guidelines.strip()

print(len(icd_guidelines))

100%|██████████| 102/102 [00:03<00:00, 28.82it/s]

219227





In [13]:
# Importing the tiktoken library
import tiktoken

# Initializing a tokenizer for the 'cl100k_base' model
# This tokenizer is designed to work with the 'ada-002' embedding model
tokenizer = tiktoken.get_encoding("cl100k_base")

# Using the tokenizer to encode the text 'hey there'
# The resulting output is a list of integers representing the encoded text
# This is the input format required for embedding using the 'ada-002' model
tokenizer.encode('hey there')

[36661, 1070]

In [14]:
import re

# Function to split the text into chunks of a maximum number of tokens. Inspired by OpenAI
def overlapping_chunks(text, max_tokens = 500, overlapping_factor = 5):
    '''
    max_tokens: tokens we want per chunk
    overlapping_factor: number of sentences to start each chunk with that overlaps with the previous chunk
    '''

    # Split the text using punctuation
    sentences = re.split(r'[.?!]', text)

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks, tokens_so_far, chunk = [], 0, []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            if overlapping_factor > 0:
                chunk = chunk[-overlapping_factor:]
                tokens_so_far = sum([len(tokenizer.encode(c)) for c in chunk])
            else:
                chunk = []
                tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks

In [15]:
split = overlapping_chunks(icd_guidelines, overlapping_factor=0)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'non-overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')


non-overlapping chunking approach has 109 documents with average length 476.7 tokens


In [16]:
split = overlapping_chunks(icd_guidelines)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

overlapping chunking approach has 151 documents with average length 496.5 tokens


In [17]:
print(split[3])

9.  “Multiple coding for a single conditi on”.  
7.  “And”  
The word “and” should be interpreted to mean either “and” or “or” when it 
appears in a title.   


ICD-9-CM Official Guidelines for Coding and Reporting  
Effective October 1, 2011  
Page 9 of 107 8.  “With”  
The word “with” should be interpreted to mean “associated with” or “due to” 
when it appears in a code title, the Alphabetic Index, or an instruc tional note 
in the Tabular List.   
 
The word “with” in the alphabetic index is sequenced immediately following 
the main term, not in alphabetical order.   
9.  “See” and “See Also”  
The “see” instruction following a main term in the index indicates that another 
term should be referenced.   It is necessary to go to the main term referenced 
with the “see” note to locate the correct code.    
 
A “see also” instruction following a main term in the index instructs that there 
is another main term that may also be referenced that may provide additional 
index entries that m

**Chatbot build out**

We will be relying heavily on the LangChain library to bring together the different components needed for our chatbot. To begin, we'll create a simple chatbot without any retrieval augmentation

In [18]:
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

chat = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model='gpt-3.5-turbo'
)

  warn_deprecated(


In [19]:
from langchain_openai import OpenAIEmbeddings


ENGINE = userdata.get('ENGINE')

embed_model = OpenAIEmbeddings(model=ENGINE)

In [20]:
disch_text = """
brief hospital course he was admitted to the trauma team orthopedics was consulted for the femur fracture
and he was taken to the operating room on and for repair of these injuries he is receiving heparin subcutaneously
for dvt prophylaxis he is on a regular diet and taking oral narcotics for pain postoperatively he has been slow to
progress he was evaluated by physical therapy and is being recommended for rehab medications on admission synthroid
"""

example_text = """
[Example Response]: 540.1 - Abscess of appendix
     47.01 - Lap appendectomy """


combined_text = "[Discharge Note]:" + "\n\n"  + disch_text + "\n\n"

In [23]:
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)

combined_text = "[Discharge Note]:" + "\n\n"  + disch_text + "\n\n"
messages = [
    SystemMessage(content="""You are a medical coder, consider the discharge note and assign the appropriate ICD-9 codes, responding
    with their exact descriptions. Follow the example in the example precisely.
    [Example Response]: 540.1 - Abscess of appendix
     47.01 - Lap appendectomy """),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    HumanMessage(content=combined_text)
]




In [76]:
messages3 = [
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    SystemMessage(content="You are a helpful assistant who will be coding for ICD-9 procedure and diagnosis codes given a patients clincial discharge summary."),
    HumanMessage(content="Can you help me with identifying ICD-9 procedure codes and diagnosis codes for the discharge summary below? \n\n"),
    HumanMessage(content=combined_text)
]
#HumanMessage(content="Can you provide ICD-9 procedure codes and diagnosis codes for the  discharge summary given as part of the query?")


In [77]:
# send to OpenAI
res = chat(messages3)

In [27]:
#disch_text = df_disch.iloc[0]['TEXT']
#disch_text

In [None]:
# add latest AI response to messages
messages.append(res)

# now create a new user prompt
prompt = HumanMessage(
    content=disch_text)

# add to messages
messages.append(prompt)

# send to OpenAI
res = chat(messages)

In [78]:
print(res.content)

To determine the appropriate ICD-9 procedure and diagnosis codes based on the provided discharge summary, we need to identify the key diagnoses and procedures mentioned in the text. 

Key Diagnosis: 
1. Femur fracture
2. DVT prophylaxis
3. Postoperative pain management
4. Slow progress postoperatively
5. Evaluation by physical therapy
6. Recommended for rehab
7. Medication: Synthroid

Key Procedures:
1. Repair of femur fracture
2. Heparin subcutaneous injection for DVT prophylaxis

Based on the information provided, here are some possible ICD-9 codes that may be relevant:
- Femur fracture: ICD-9-CM diagnosis code 821.01 (Closed fracture of neck of femur)
- DVT prophylaxis: ICD-9-CM diagnosis code V58.61 (Long-term (current) use of anticoagulants)
- Postoperative pain management: ICD-9-CM diagnosis code 338.29 (Other chronic postoperative pain)
- Slow progress postoperatively: ICD-9-CM diagnosis code 998.89 (Other specified complications of procedures, not elsewhere classified)
- Evalua

In [None]:
messages.append(res)

prompt = HumanMessage(
    content="Yes, provide ICD9 procedure codes and diagnosis codes from the discharge clinical summary notes ")

messages.append(prompt)

res = chat(messages)


In [None]:
print(res.content)

I'm sorry, but I am unable to access external databases or specific medical records to provide ICD-9 procedure codes and diagnosis codes from discharge clinical summary notes. I recommend consulting with a healthcare provider or medical coder who has access to the patient's medical records in order to obtain the accurate ICD-9 codes for the procedures and diagnoses mentioned in the discharge summary.


In [None]:
# get the ICD-9 codes for the discharge note and pass it with the prompt
first_row_labels_value = df_disch.LABELS.iloc[0]
print(first_row_labels_value)


287.5;45.13;584.9


In [None]:

# add latest AI response to messages
messages.append(res)

# now create a new user prompt

prompt = HumanMessage(
    content="Here is the ICD9 diagnosis and procedure codes for the discharge summary notes \
    seperated by semicolons: " + first_row_labels_value)

# add to messages
messages.append(prompt)

# send to OpenAI
res = chat(messages)


In [None]:
# initialize connection to pinecone (get API key at app.pinecone.io)

print(res.content)

Thank you for providing the ICD-9 diagnosis and procedure codes. Here they are:

Diagnosis codes:
- 287.5 (Thrombocytopenia, unspecified)
- 45.13 (Injection of anesthetic into extradural space)
- 584.9 (Acute kidney failure, unspecified)

These codes represent the diagnoses and procedures mentioned in the discharge summary notes. If you have any more information or need further assistance, feel free to let me know.


**Build the clinical discharge knowledge base**


---



We now have a dataset that can serve as our chatbot knowledge base. Our next task is to transform that dataset into the knowledge base that our chatbot can use. To do this we must use an embedding model and vector database.

In [31]:
def my_hash(s):
    # Return the MD5 hash of the input string as a hexadecimal string
    return hashlib.md5(s.encode()).hexdigest()

In [25]:
from pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
os.environ["PINE_CONE_KEY"] = userdata.get("PINE_CONE_KEY")

api_key = os.getenv("PINE_CONE_KEY")

# configure client
pc = Pinecone(api_key=api_key)



In [26]:
from pinecone import ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-west-2"
)

In [29]:
def prepare_for_pinecone(texts, engine=ENGINE):
    # Get the current UTC date and time
    now = datetime.utcnow()

    # Generate vector embeddings for each string in the input list, using the specified engine
    embeddings = embed_model.embed_documents(texts)

    # Create tuples of (hash, embedding, metadata) for each input string and its corresponding vector embedding
    # The my_hash() function is used to generate a unique hash for each string, and the datetime.utcnow() function is used to generate the current UTC date and time
    return [
        (
            my_hash(text),  # A unique ID for each string, generated using the my_hash() function
            embedding,  # The vector embedding of the string
            dict(text=text, date_uploaded=now)  # A dictionary of metadata, including the original text and the current UTC date and time
        )
        for text, embedding in zip(texts, embeddings)  # Iterate over each input string and its corresponding vector embedding
    ]

In [32]:
def upload_texts_to_pinecone(texts, batch_size=None, show_progress_bar=False):
    # Call the prepare_for_pinecone function to prepare the input texts for indexing
    total_upserted = 0
    if not batch_size:
        batch_size = len(texts)

    _range = range(0, len(texts), batch_size)
    for i in tqdm(_range) if show_progress_bar else _range:
        batch = texts[i: i + batch_size]
        prepared_texts = prepare_for_pinecone(batch)

        # Use the upsert() method of the index object to upload the prepared texts to Pinecone
        total_upserted += index.upsert(
            prepared_texts
        )['upserted_count']

    return total_upserted

In [32]:
import time

index_name = 'guideline-index'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 151}},
 'total_vector_count': 151}

In [33]:
_id, embedding, metadata = prepare_for_pinecone(split)[0]

print('ID:  ',_id, '\nLEN: ', len(embedding), '\nMETA:', metadata)

ID:   14a670503a7b2159359a1fe9d2c100ce 
LEN:  1536 
META: {'text': 'Brackets are used in the tabular list to enclose synonyms, alternative \nwording or explanatory phrases.   Brackets are used in the index to \nidentify manifestation codes.   \n(See Section I. A. 6.  “Etiology/manifestations”)  \n \n( ) Parentheses are used in both the index and tabular to enclose \nsupplementary words that may be present or absent in the statement of a \ndisease or procedure without affecting the code number to which it is \n\n\nICD-9-CM Official Guidelines for Coding and Reporting  \nEffective October 1, 2011  \nPage 7 of 107 assigned.   The terms within the parentheses are ref erred to as \nnonessential modifiers.   \n: Colons are used in the Tabular list after an incomplete term which needs \none or more of the modifiers following the colon to make it assignable to \na given category.   \n4.  Includes and Excludes Notes and Inclusion terms  \nInclu des: This note appears immediately under a three -

In [34]:
vector_count = index.describe_index_stats().total_vector_count

In [35]:
# Call the upload_texts_to_pinecone() function with the input texts

if vector_count == 0:
  upload_texts_to_pinecone(split)

In [36]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 151}},
 'total_vector_count': 151}

In [None]:

# load the guidelines into pinecone
batch_size = 100

data = pd.DataFrame(split)

for i in tqdm(range(0, len(data), batch_size)):
    now = datetime.utcnow()
    i_end = min(len(data), i+batch_size)
    # get batch of data
    batch = data.iloc[i:i_end]
    # generate unique ids for each chunk
    ids = [my_hash(x[0]) for i, x in batch.iterrows()]
    # get text to embed
    texts = [x[0] for _, x in batch.iterrows()]
    # embed text
    #embeds = embed_text(texts)

    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'guideline_id': ids,
         'guideline_note': x[0]} for i, x in batch.iterrows()
    ]
    # add to Pinecone
    index.upsert(vectors=zip(ids, embeds, metadata))

  0%|          | 0/2 [00:00<?, ?it/s]

In [37]:
# prompt: create a function that will accept a query string, embed it using the same embedding model and qury the pinecone index

def query_from_pinecone(query, index, top_k=3):
    # get embedding from THE SAME embedder as the documents
    query_embedding = embed_model.embed_documents([query])[0]


    return index.query(
      vector=query_embedding,
      top_k=top_k,
      include_metadata=True   # gets the metadata (dates, text, etc)
    ).get('matches')


In [38]:
#results_from_pinecone = query_from_pinecone("A patient with acute appendicitis who underwent laparoscopic appendectomy.",index,5)

results_from_pinecone = query_from_pinecone(disch_text,index,5)

for result_from_pinecone in results_from_pinecone:
    print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['text'][:100]}")



63f9175150241c1f6f8bbfe8113daed5	0.79	1, V54. 8, or V54. 9) for encounters after the patient 
has completed active treatment of the fractu
cc0834831307280bf8416afc8d3ee1fd	0.79	   
1) Superficial injuries  
Superficial injuries such as abrasions or contusions are not 
coded wh
37da06f95a06de430465b5ae6d143a35	0.79	, buttock, hee l, shoulder) and each pressure ulcer is 
documented as being at different stages (e. 
697f6cfc3394fff28f20ef8af92ce097	0.79	C17. f. 2 .  for information on the coding 
of organ transplant complications.   
 
Assign code V45.
c995dfcb9306f791f71d31a745db2bf2	0.79	g. , patient fell out of hospital bed 
during hospital stay, patient experienced an adverse reaction


In [39]:
index_name = 'medcode-index'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index_medcode = pc.Index(index_name)
time.sleep(1)
# view index stats
index_medcode.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1573}},
 'total_vector_count': 1573}

In [40]:

#df_all_notes = pd.read_csv(os.path.join(curr_path,"notes_labeled.csv"))
#df_all_notes.head()


df_disch = pd.read_csv(os.path.join(curr_path,"dev_50.csv"))
df_disch.head()


Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,LABELS,length
0,86006,111912,admission date discharge date date of birth se...,V58.61;96.71;401.9;414.01;427.31,230
1,85950,189769,admission date discharge date service neurosur...,403.90;96.71;V45.81;585.9;250.00,304
2,88025,180431,admission date discharge date date of birth se...,96.71;38.93;518.81,359
3,83776,152868,admission date discharge date date of birth se...,272.4;96.71;401.9;518.81,408
4,85055,169373,admission date discharge date date of birth se...,99.04;96.71,409


In [41]:
df_disch.shape

(1573, 5)

In [42]:
# testing the dimensions of the embeddings
texts = disch_text
#texts = "A patient with acute appendicitis who underwent laparoscopic appendectomy."
res = embed_model.embed_documents(texts)
len(res), len(res[0])

(453, 1536)

In [43]:
from tqdm.auto import tqdm  # for progress bar

In [122]:
# load the discharge summaries into pinecone
batch_size = 90

data = df_disch


for i in tqdm(range(0, len(data), batch_size)):
    now = datetime.utcnow()
    i_end = min(len(data), i+batch_size)
    # get batch of data
    batch = data.iloc[i:i_end]
    # generate unique ids for each chunk
    ids = [my_hash(x[2]) for i, x in batch.iterrows()]
    # get text to embed
    texts = [x[2] for _, x in batch.iterrows()]
    # embed text
    #embeds = embed_text(texts)

    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'disch_id': ids,
         'disch_note': x[2],
         'disch_codes': x[3]} for i, x in batch.iterrows()
    ]
    # add to Pinecone
    index_medcode.upsert(vectors=zip(ids, embeds, metadata))

  0%|          | 0/18 [00:00<?, ?it/s]

In [44]:
index_medcode.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1573}},
 'total_vector_count': 1573}

In [None]:

index2_name = 'diag-index'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index2_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index2_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index2_name).status['ready']:
        time.sleep(1)

# connect to index
index2 = pc.Index(index2_name)
time.sleep(1)
# view index stats
index2.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 14567}},
 'total_vector_count': 14567}

In [None]:
index3_name = 'proc-index'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index3_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index3_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index3_name).status['ready']:
        time.sleep(1)

# connect to index
index3 = pc.Index(index3_name)
time.sleep(1)
# view index stats
index3.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [45]:
#create cosine index in pinecone for diagnosis
diag_index_c = 'diag-index-c'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if diag_index_c not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        diag_index_c,
        dimension=1536,  # dimensionality of ada 002
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(diag_index_c).status['ready']:
        time.sleep(1)

# connect to index
diag_index_c = pc.Index(diag_index_c)
time.sleep(1)
# view index stats
diag_index_c.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 14567}},
 'total_vector_count': 14567}

In [46]:
proc_index_c = 'proc-index-c'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if proc_index_c not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        proc_index_c,
        dimension=1536,  # dimensionality of ada 002
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index3_name).status['ready']:
        time.sleep(1)

# connect to index
proc_index_c = pc.Index(proc_index_c)
time.sleep(1)
# view index stats
proc_index_c.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 3809}},
 'total_vector_count': 3809}

In [47]:
df_diagnosis.head()

Unnamed: 0,DIAGNOSIS_CODE,LONG_DESCRIPTION,SHORT_DESCRIPTION
0,1.0,Cholera due to vibrio cholerae,Cholera d/t vib cholerae
1,1.1,Cholera due to vibrio cholerae el tor,Cholera d/t vib el tor
2,1.9,"Cholera, unspecified",Cholera NOS
3,2.0,Typhoid fever,Typhoid fever
4,2.1,Paratyphoid fever A,Paratyphoid fever a


In [None]:

batch_size = 100

for i in tqdm(range(0, len(df_diagnosis), batch_size)):
    i_end = min(len(df_diagnosis), i+batch_size)
    # get batch of data
    batch = df_diagnosis.iloc[i:i_end]
    # generate unique ids for each chunk
    ids = [str(x['DIAGNOSIS_CODE']) for i, x in batch.iterrows()]
    # get text to embed
    texts = [x['LONG_DESCRIPTION'] for _, x in batch.iterrows()]
    # embed text
    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'diagnosis_code': x['DIAGNOSIS_CODE'],
         'long_description': x['LONG_DESCRIPTION'],
         'short_description': x['SHORT_DESCRIPTION']} for i, x in batch.iterrows()
    ]
    # add to Pinecone
    index2.upsert(vectors=zip(ids, embeds, metadata))

  0%|          | 0/146 [00:00<?, ?it/s]

In [None]:
index2.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 14567}},
 'total_vector_count': 14567}

In [48]:
df_procedures.head()

Unnamed: 0,PROCEDURE_CODE,LONG_DESCRIPTION,SHORT_DESCRIPTION
0,1.0,Therapeutic ultrasound of vessels of head and ...,Ther ult head & neck ves
1,2.0,Therapeutic ultrasound of heart,Ther ultrasound of heart
2,3.0,Therapeutic ultrasound of peripheral vascular ...,Ther ult peripheral ves
3,9.0,Other therapeutic ultrasound,Other therapeutic ultsnd
4,10.0,Implantation of chemotherapeutic agent,Implant chemothera agent


In [None]:
batch_size = 100

for i in tqdm(range(0, len(df_procedures), batch_size)):
    i_end = min(len(df_procedures), i+batch_size)
    # get batch of data
    batch = df_procedures.iloc[i:i_end]
    # generate unique ids for each chunk
    ids = [str(x['PROCEDURE_CODE']) for i, x in batch.iterrows()]
    # get text to embed
    texts = [x['LONG_DESCRIPTION'] for _, x in batch.iterrows()]
    # embed text
    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'procedure_code': x['PROCEDURE_CODE'],
         'long_description': x['LONG_DESCRIPTION'],
         'short_description': x['SHORT_DESCRIPTION']} for i, x in batch.iterrows()
    ]
    # add to Pinecone
    index3.upsert(vectors=zip(ids, embeds, metadata))


  0%|          | 0/39 [00:00<?, ?it/s]

In [45]:
index3.describe_index_stats()

NameError: name 'index3' is not defined

In [None]:

batch_size = 100

for i in tqdm(range(0, len(df_procedures), batch_size)):
    i_end = min(len(df_procedures), i+batch_size)
    # get batch of data
    batch = df_procedures.iloc[i:i_end]
    # generate unique ids for each chunk
    ids = [str(x['PROCEDURE_CODE']) for i, x in batch.iterrows()]
    # get text to embed
    texts = [x['LONG_DESCRIPTION'] for _, x in batch.iterrows()]
    # embed text
    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'diagnosis_code': x['PROCEDURE_CODE'],
         'long_description': x['LONG_DESCRIPTION'],
         'short_description': x['SHORT_DESCRIPTION']} for i, x in batch.iterrows()
    ]
    # add to Pinecone
    proc_index_c.upsert(vectors=zip(ids, embeds, metadata))

100%|██████████| 39/39 [00:53<00:00,  1.37s/it]


In [49]:
proc_index_c.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 3809}},
 'total_vector_count': 3809}

In [50]:
disch_text

'\nbrief hospital course he was admitted to the trauma team orthopedics was consulted for the femur fracture\nand he was taken to the operating room on and for repair of these injuries he is receiving heparin subcutaneously\nfor dvt prophylaxis he is on a regular diet and taking oral narcotics for pain postoperatively he has been slow to\nprogress he was evaluated by physical therapy and is being recommended for rehab medications on admission synthroid \n'

In [51]:
results_from_pinecone = query_from_pinecone(disch_text,proc_index_c,5)

for result_from_pinecone in results_from_pinecone:
    print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['diagnosis_code']}\t{result_from_pinecone['metadata']['long_description'][:50]}")



39.62	0.82	39.62	Hypothermia (systemic) incidental to open heart su
79.35	0.81	79.35	Open reduction of fracture with internal fixation,
79.15	0.81	79.15	Closed reduction of fracture with internal fixatio
81.21	0.81	81.21	Arthrodesis of hip
79.65	0.81	79.65	Debridement of open fracture site, femur


In [52]:
results_from_pinecone = query_from_pinecone(disch_text,proc_index_c,5)

for result_from_pinecone in results_from_pinecone:
    print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['diagnosis_code']}\t{result_from_pinecone['metadata']['long_description'][:50]}")


39.62	0.82	39.62	Hypothermia (systemic) incidental to open heart su
79.35	0.81	79.35	Open reduction of fracture with internal fixation,
79.15	0.81	79.15	Closed reduction of fracture with internal fixatio
81.21	0.81	81.21	Arthrodesis of hip
79.65	0.81	79.65	Debridement of open fracture site, femur


In [None]:
#load diagnosis_c index

batch_size = 100

for i in tqdm(range(0, len(df_diagnosis), batch_size)):
    i_end = min(len(df_diagnosis), i+batch_size)
    # get batch of data
    batch = df_diagnosis.iloc[i:i_end]
    # generate unique ids for each chunk
    ids = [str(x['DIAGNOSIS_CODE']) for i, x in batch.iterrows()]
    # get text to embed
    texts = [x['LONG_DESCRIPTION'] for _, x in batch.iterrows()]
    # embed text
    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'diagnosis_code': x['DIAGNOSIS_CODE'],
         'long_description': x['LONG_DESCRIPTION'],
         'short_description': x['SHORT_DESCRIPTION']} for i, x in batch.iterrows()
    ]
    # add to Pinecone
    diag_index_c.upsert(vectors=zip(ids, embeds, metadata))

  0%|          | 0/146 [00:00<?, ?it/s]

In [53]:
diag_index_c.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 14567}},
 'total_vector_count': 14567}

In [54]:
results_from_pinecone = query_from_pinecone(disch_text,diag_index_c,5)

for result_from_pinecone in results_from_pinecone:
    print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['diagnosis_code']}\t{result_from_pinecone['metadata']['long_description'][:50]}")



V54.13	0.84	V54.13	Aftercare for healing traumatic fracture of hip
V54.14	0.83	V54.14	Aftercare for healing traumatic fracture of leg, u
V54.23	0.83	V54.23	Aftercare for healing pathologic fracture of hip
820.22	0.83	820.22	Closed fracture of subtrochanteric section of neck
244.0	0.83	244.0	Postsurgical hypothyroidism


**Retrieval Augmented Generation**



---



We've built a fully-fledged knowledge base. Now it's time to connect that knowledge base to our chatbot. To do that we'll be diving back into LangChain and reusing our template prompt from earlier.

In [55]:
!pip install -qU langchain_pinecone
from langchain.vectorstores import Pinecone


text_field = "text"  # the metadata field that contains our text

# initialize the vector store object
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

  warn_deprecated(


In [56]:
# diagnosis code vector store
text_field2 = "long_description"  # the metadata field that contains our text

# initialize the vector store object
vectorstore_d = Pinecone(
    diag_index_c, embed_model.embed_query, text_field2)


Cross Encoding while searching the vectorstore

In [55]:
!pip install -qU sentence-transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/156.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m153.6/156.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [57]:
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np
from torch import nn

In [58]:
# Pre-trained cross encoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')


config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [59]:
def get_results_from_pinecone(query, index, top_k=3, re_rank=False, verbose=True):

    results_from_pinecone = query_from_pinecone(query, index, top_k=top_k)
    if not results_from_pinecone:
        return []

    if verbose:
        print("Query:", query)


    final_results = []

    if re_rank:
        if verbose:
            print('Document ID (Hash)\t\tRetrieval Score\tCE Score\tText')

        sentence_combinations = [[query, result_from_pinecone['metadata']['text']] for result_from_pinecone in results_from_pinecone]

        # Compute the similarity scores for these combinations
        similarity_scores = cross_encoder.predict(sentence_combinations, activation_fct=nn.Sigmoid())

        # Sort the scores in decreasing order
        sim_scores_argsort = reversed(np.argsort(similarity_scores))

        # Print the scores
        for idx in sim_scores_argsort:
            result_from_pinecone = results_from_pinecone[idx]
            final_results.append(result_from_pinecone)
            if verbose:
                print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{similarity_scores[idx]:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")
        return final_results

    if verbose:
        print('Document ID (Hash)\t\tRetrieval Score\tText')
    for result_from_pinecone in results_from_pinecone:
        final_results.append(result_from_pinecone)
        if verbose:
            print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")

    return final_results


In [60]:
query = disch_text

In [61]:
final_results = get_results_from_pinecone(query, index ,20, True)

Query: 
brief hospital course he was admitted to the trauma team orthopedics was consulted for the femur fracture
and he was taken to the operating room on and for repair of these injuries he is receiving heparin subcutaneously
for dvt prophylaxis he is on a regular diet and taking oral narcotics for pain postoperatively he has been slow to
progress he was evaluated by physical therapy and is being recommended for rehab medications on admission synthroid 

Document ID (Hash)		Retrieval Score	CE Score	Text
b6f30a51febca5fffb7e222fb3ef5a8e	0.77	0.01	  
 
Fractures are coded using the aftercare codes
cc0834831307280bf8416afc8d3ee1fd	0.79	0.00	   
1) Superficial injuries  
Superficial injuries
63f9175150241c1f6f8bbfe8113daed5	0.79	0.00	1, V54. 8, or V54. 9) for encounters after the pat
6e620d94625b384162d918325c9e96b9	0.78	0.00	  
 
Note:  This guideline is applicable only to i
0a426ca5904a0782728d45fc507093c5	0.77	0.00	 Admission from Outpatient Surgery  
When a patien
4ee3a1296d6b41bc3f8

In [64]:
#query = disch_text
#vectorstore_d.similarity_search(query)

#query = "How should I code for a patient with diabetes mellitus type 2 and diabetic foot ulcer?"
results_from_pinecone_g = query_from_pinecone(query,index,3)

for result_from_pinecone_g in results_from_pinecone_g:
    print(f"{result_from_pinecone_g['id']}\t{result_from_pinecone_g['score']:.2f}\t{result_from_pinecone_g['metadata']['text']}")



63f9175150241c1f6f8bbfe8113daed5	0.79	1, V54. 8, or V54. 9) for encounters after the patient 
has completed active treatment of the fracture and is receiving 
routine care for the fracture during the heali ng or recovery 
phase.  Examples of fracture aftercare are: cast change or 
removal, removal of external or internal fixation device, 
medication adjustment, and follow up visits following fracture 
treatment.   
 
Care for complications of surgical treatment for f racture repairs 
during the healing or recovery phase should be coded with the 
appropriate complication codes.   


ICD-9-CM Official Guidelines for Coding and Reporting  
Effective October 1, 2011  
Page 58 of 107  
Care of complications of fractures, such as malunion and 
nonunion, should be reported with the appropriate codes.   
 
Pathologic fractures are not c oded in the 800 -829 range, but 
instead are assigned to subcategory 733. 1.   See Section I. C. 13. a 
for additional information.   
2) Multiple fractures of

In [141]:
query = "brief hospital course ct scan revealed very severe iph given her poor prognosis with fixed pupils and posturing patient was made cmo by family she expired shortly after arrival to hospital"

results_from_pinecone_ds = query_from_pinecone(query,index_medcode,1)

for result_from_pinecone_ds in results_from_pinecone_ds:
    print(f"{result_from_pinecone_ds['id']}\t{result_from_pinecone_ds['score']:.2f}\t{result_from_pinecone_ds['metadata']['disch_note']}\t{result_from_pinecone_ds['metadata']['disch_codes']}")




e88417bec626ecdee39a37e394e7326a	0.87	admission date discharge date date of birth sex f service surgery allergies patient recorded as having no known allergies to drugs attending first name3 lf chief complaint 60f on coumadin was found slightly drowsy tonight then fell down stairs paramedic found her unconscious and she was intubated w o any medication head ct shows multiple iph transferred to hospital1 for further eval major surgical or invasive procedure none past medical history her medical history is significant for hypertension osteoarthritis involving bilateral knee joints with a dependence on cane for ambulation chronic back pain she also has a history of a right lung cancer requiring right lobectomy in no metastasis was known and she has since recovered well and is considered cured social history unknown family history nc physical exam physical exam intubated non sedated received no paralytic medication no eye opening pupil rt mm lt mm both non reactive corneal bilat extends bo

In [59]:
#query = "Clinical discharge note has Chronic obstructive pulmonary disease and bronchiectasis. Can you help me identify diagnosis code?"

query = disch_text
#vectorstore_d.similarity_search(query)

results_from_pinecone_d = query_from_pinecone(query,diag_index_c,10)

for result_from_pinecone_d in results_from_pinecone_d:
    print(f"{result_from_pinecone_d['id']}\t{result_from_pinecone_d['score']:.2f}\t{result_from_pinecone_d['metadata']['diagnosis_code']}\t{result_from_pinecone_d['metadata']['long_description']}")




V54.13	0.84	V54.13	Aftercare for healing traumatic fracture of hip
V54.14	0.83	V54.14	Aftercare for healing traumatic fracture of leg, unspecified
V54.23	0.83	V54.23	Aftercare for healing pathologic fracture of hip
820.22	0.83	820.22	Closed fracture of subtrochanteric section of neck of femur
244.0	0.83	244.0	Postsurgical hypothyroidism
V58.43	0.83	V58.43	Aftercare following surgery for injury and trauma
821.11	0.83	821.11	Open fracture of shaft of femur
V54.17	0.83	V54.17	Aftercare for healing traumatic fracture of vertebrae
821.01	0.82	821.01	Closed fracture of shaft of femur
V54.24	0.82	V54.24	Aftercare for healing pathologic fracture of leg, unspecified


In [63]:
# procedure code vector store

text_field3 = "long_description"  # the metadata field that contains our text

# initialize the vector store object
vectorstore_p = Pinecone(
    index3, embed_model.embed_query, text_field3)


NameError: name 'index3' is not defined

In [66]:
results_from_pinecone_p = query_from_pinecone(query,proc_index_c,10)

for result_from_pinecone_p in results_from_pinecone_p:
    print(f"{result_from_pinecone_p['id']}\t{result_from_pinecone_p['score']:.2f}\t{result_from_pinecone_p['metadata']['diagnosis_code']}\t{result_from_pinecone_p['metadata']['long_description']}")



39.62	0.82	39.62	Hypothermia (systemic) incidental to open heart surgery
79.35	0.81	79.35	Open reduction of fracture with internal fixation, femur
79.15	0.81	79.15	Closed reduction of fracture with internal fixation, femur
81.21	0.81	81.21	Arthrodesis of hip
79.65	0.81	79.65	Debridement of open fracture site, femur
72.	0.81	72.	Revision of hip replacement, femoral component
81.53	0.81	81.53	Revision of hip replacement, not otherwise specified
65.2	0.81	65.2	Complete substernal thyroidectomy
35.3	0.81	35.3	Repair of vertebral fracture
79.30	0.80	79.30	Open reduction of fracture with internal fixation, unspecified site


In [None]:
#vectorstore_p.similarity_search(query)

[Document(page_content='Extracorporeal hepatic assistance', metadata={'procedure_code': '50.92', 'short_description': 'Extracorpor hepat Assis'}),
 Document(page_content='Hepatotomy', metadata={'procedure_code': '50.0', 'short_description': 'Hepatotomy'}),
 Document(page_content='Percutaneous hepatic cholangiogram', metadata={'procedure_code': '87.51', 'short_description': 'Perc hepat Cholangiogram'}),
 Document(page_content='Liver scan and radioisotope function study', metadata={'procedure_code': '92.02', 'short_description': 'Liver scan/isotope funct'})]

In [57]:
def augment_prompt(query: str):
    # get top 3 results from knowledge base
    #results = vectorstore.similarity_search(query, k=3)
    results_d = query_from_pinecone(query,diag_index_c,5)
    results_p = query_from_pinecone(query,proc_index_c,5)
    results_g = query_from_pinecone(query,index,3)
    results_ds = query_from_pinecone(query,index_medcode,1)

    # get the text from the results
    diagnosis_knowledge = "\n".join([x.metadata['long_description'] for x in results_d])
    procedure_knowledge = "\n".join([x.metadata['long_description'] for x in results_p])
    guidelines_knowledge = "\n".join([x.metadata['text'] for x in results_g])



    #get the medata from the results
    metadata_d = [x.metadata['diagnosis_code'] for x in results_d]
    metadata_p = [x.metadata['diagnosis_code'] for x in results_p]
    metadata_g = [x.metadata['text'] for x in results_g]
    metadata_ds = [x.metadata['disch_codes'] for x in results_ds]


    # feed into an augmented prompt
    augmented_prompt = f"""Using the context below, answer the query.


    Contexts:
    {guidelines_knowledge}

    Metadata:
    Diagnosis Codes: {metadata_d}
    Procedure Codes: {metadata_p}

    Discharge Summary Query:
    {query}"""

    return augmented_prompt

In [60]:


#query="How should I code for a patient with diabetes mellitus type 2 and diabetic foot ulcer?"
aug_text = augment_prompt(query)

In [61]:
query

'\nbrief hospital course he was admitted to the trauma team orthopedics was consulted for the femur fracture\nand he was taken to the operating room on and for repair of these injuries he is receiving heparin subcutaneously\nfor dvt prophylaxis he is on a regular diet and taking oral narcotics for pain postoperatively he has been slow to\nprogress he was evaluated by physical therapy and is being recommended for rehab medications on admission synthroid \n'

In [62]:
aug_text

"Using the context below, answer the query.\n\n\n    Contexts:\n    1, V54. 8, or V54. 9) for encounters after the patient \nhas completed active treatment of the fracture and is receiving \nroutine care for the fracture during the heali ng or recovery \nphase.  Examples of fracture aftercare are: cast change or \nremoval, removal of external or internal fixation device, \nmedication adjustment, and follow up visits following fracture \ntreatment.   \n \nCare for complications of surgical treatment for f racture repairs \nduring the healing or recovery phase should be coded with the \nappropriate complication codes.   \n\n\nICD-9-CM Official Guidelines for Coding and Reporting  \nEffective October 1, 2011  \nPage 58 of 107  \nCare of complications of fractures, such as malunion and \nnonunion, should be reported with the appropriate codes.   \n \nPathologic fractures are not c oded in the 800 -829 range, but \ninstead are assigned to subcategory 733. 1.   See Section I. C. 13. a \nfor 

In [126]:
messages2 = [
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    SystemMessage(content="You are a helpful assistant who will be coding for ICD-9 procedure and diagnosis codes given a patients clincial discharge summary."),
    HumanMessage(content="Can you help me with identifying ICD-9 procedure codes and diagnosis codes for the discharge summary below? \n\n")
]
#HumanMessage(content="Can you provide ICD-9 procedure codes and diagnosis codes for the  discharge summary given as part of the query?")


In [143]:
combined_text = "[Discharge Note]:" + "\n\n"  + query + "\n\n"
messages2 = [
     SystemMessage(content="""You are a medical coder, consider the discharge note and assign the appropriate ICD-9 codes, responding
    with their exact descriptions. Follow the example in the example precisely.
    [Example Response]: 540.1 - Abscess of appendix
     47.01 - Lap appendectomy """),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    HumanMessage(content=combined_text)
]


In [144]:
print(messages2)

[SystemMessage(content='You are a medical coder, consider the discharge note and assign the appropriate ICD-9 codes, responding \n    with their exact descriptions. Follow the example in the example precisely.\n    [Example Response]: 540.1 - Abscess of appendix\n     47.01 - Lap appendectomy '), HumanMessage(content='Hi AI, how are you today?'), AIMessage(content="I'm great thank you. How can I help you?"), HumanMessage(content='[Discharge Note]:\n\nbrief hospital course ct scan revealed very severe iph given her poor prognosis with fixed pupils and posturing patient was made cmo by family she expired shortly after arrival to hospital\n\n')]


In [145]:
# create a new user prompt
prompt = HumanMessage(
    content=augment_prompt(query)
)
# add to messages
messages2.append(prompt)

res = chat(messages2)

print(res.content)

I'm sorry, but I am unable to find the appropriate ICD-9 codes based on the provided Metadata and the given discharge summary. If you could provide more information or clarify your query, I would be happy to assist you further.


In [146]:
combined_text = "[Discharge Note]:" + "\n\n"  + query + "\n\n"
messages = [
    SystemMessage(content="""You are a medical coder, consider the discharge note and assign the appropriate ICD-9 codes, responding
    with their exact descriptions. Follow the example in the example precisely.
    [Example Response]: 540.1 - Abscess of appendix
     47.01 - Lap appendectomy """),
    HumanMessage(content="Hi AI, how are you today?"),
    AIMessage(content="I'm great thank you. How can I help you?"),
    HumanMessage(content=combined_text)
]

In [None]:
messages = []

def ask_and_augment():
  while True:
    query = input("Enter your query: ")
    if query == "exit":
      break
    #combined_text = "[Discharge Note]:" + "\n\n"  + query + "\n\n"
    messages = [
      SystemMessage(content="""You are a medical coder, consider the discharge note and assign the appropriate ICD-9 codes, responding
      with their exact descriptions. Follow theex example precisely.
      [Example Response]: 540.1 - Abscess of appendix
      47.01 - Lap appendectomy """),
      HumanMessage(content="Hi AI, how are you today?"),
      AIMessage(content="I'm great thank you. How can I help you?")
    ]
    #SystemMessage(content="You are a helpful assistant who will be coding for ICD-9 procedure and diagnosis codes given a patients clincial discharge notes.")
    # Convert query to a list
    query_list = [query]

    # Convert the query list to a string
    query_string = " ".join(query_list)
    augmented_prompt = augment_prompt(query_string)
    #print(augmented_prompt)
    messages.append(HumanMessage(content=augmented_prompt))
    #print(messages)

    res = chat(messages)
    print(res.content)


ask_and_augment()


Enter your query: admission date discharge date date of birth sex m service urology allergies patient recorded as having no known allergies to drugs attending first name3 lf chief complaint bladder ca major surgical or invasive procedure lap cystectomy open neobladder history of present illness 69m invasive bladder ca elected to proceed w lap cystectomy per dr last name stitle and open neobladder w dr first name stitle past medical history myocardial infarction yrs ago treated at hospital hospital patient reports receiving a clot busting medication tpa hospitalized x days and discharged did not take medications after discharge nor did he see a physician name initial pre no history of other hospitalizations or illnesses patient does report multiple abrasions minor lacerations etc that have occurred during his work as a self employed contractor social history married lives with wife in name ni pack year history of tobacco quit years ago does not and never has drunk alcohol confirmed with