In [None]:
! pip install anthropic
! pip install langchain

In [None]:
import boto3
from anthropic import Anthropic
from botocore.config import Config
import shutil
import os
import pandas as pd
import concurrent.futures
import pandas as pd
import json
import langchain

In [None]:
# Bedrock model ids: ask the Bedrock control-plane API (us-east-1) for its
# foundation models and display just their ids.
bedrock = boto3.client(service_name='bedrock', region_name='us-east-1')
model_summaries = bedrock.list_foundation_models()['modelSummaries']
[summary['modelId'] for summary in model_summaries]

In [None]:
# Create the bedrock runtime to invoke LLM
import boto3

# botocore client settings: 120 s read timeout, and retries capped at max_attempts = 5.
config = Config(read_timeout=120, retries={"max_attempts": 5})
bedrock_runtime = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-east-1',
    config=config,
)

## BULK DOC PROCESSING

In [None]:
!pip install pillow -U

### If you already have your files in s3 in pdf format, skip the next 4 cells

In [None]:
import shutil
import os

path = 'images'  # local path to medical image files

# Drop Jupyter's checkpoint folder so it is not listed as an input image.
shutil.rmtree(f'{path}/.ipynb_checkpoints', ignore_errors=True)

# Every directory entry, prefixed with the folder name, in lexicographic order.
sources = sorted(f"{path}/{entry}" for entry in os.listdir(path))

In [None]:
# Show the image paths that will be combined into the PDF; list order is the PDF page order.
sources

In [None]:
# CONVERT MULTIPLE IMAGE FILES TO A PDF
import os
from PIL import Image

bucket_name = "fairstone"  # Bucket to upload pdf files to for textract
# local path to save image as pdf
pdf_path = "pdf_image/pdf1.pdf"

if not sources:
    raise ValueError(f"No image files found under '{path}'; nothing to convert to PDF")

# The target folder is not created anywhere else in the notebook, so make sure it exists.
os.makedirs(os.path.dirname(pdf_path) or ".", exist_ok=True)

images = [Image.open(f) for f in sources]
try:
    # The first image becomes page 1; the remaining images are appended in `sources` order.
    images[0].save(
        pdf_path, "PDF", resolution=100.0, save_all=True, append_images=images[1:]
    )
finally:
    # Release the file handles held by the lazily-loaded images.
    for image in images:
        image.close()

In [None]:
## Upload file to s3
# Copies the local PDF (pdf_path) to the root of bucket_name using the AWS CLI;
# requires the CLI to be installed and credentials to be configured for the kernel's shell.
!aws s3 cp {pdf_path} s3://{bucket_name}/

In [None]:
# Asynchronous Textract call.
import os
import uuid

TEXTRACT = boto3.client('textract')
response = TEXTRACT.start_document_analysis(
    DocumentLocation={
        'S3Object': {
            'Bucket': bucket_name,  # Bucket name holding the documents
            # The upload cell copies the PDF to the bucket root, so the object key is
            # the file's base name regardless of how deep the local path is.
            'Name': os.path.basename(pdf_path),
        }
    },
    FeatureTypes=['LAYOUT', 'TABLES'],
    # A fresh idempotency token per run: reusing a fixed token makes Textract treat
    # later runs (e.g. with a new document) as repeats of the first request.
    ClientRequestToken=uuid.uuid4().hex,
    # Optional parameters not used here: JobTag, NotificationChannel (SNSTopicArn/RoleArn), KMSKeyId.
    OutputConfig={
        'S3Bucket': bucket_name,
        'S3Prefix': 'textract_output/'
    },
)
response

#### Helper Textract Functions

In [None]:
def merge_consecutive_runs_refactored(number_list):
    """Return the first element of every run of consecutive integers.

    An element starts a new run when it is the first element of the list or
    when it is not exactly one greater than the element before it, e.g.
    ``[1, 2, 3, 7, 8, 10] -> [1, 7, 10]``.
    """
    run_starts = []
    previous = None
    for position, value in enumerate(number_list):
        starts_new_run = position == 0 or value != previous + 1
        previous = value
        # None entries are never reported, mirroring the sentinel filtering of the result.
        if starts_new_run and value is not None:
            run_starts.append(value)
    return run_starts

In [None]:
import os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint


def get_rows_columns_map(table_result, blocks_map):
    """Index the cells of one TABLE block by row and column.

    Parameters
    ----------
    table_result : dict
        A TABLE block; its ``'Relationships'`` list is walked.
    blocks_map : dict
        ``{block id: block}`` lookup used to resolve relationship ids.

    Returns
    -------
    tuple
        ``(rows, scores, merged_cells)`` where ``rows[row][col]`` is
        ``{"text": <cell text>, "ids": <cell block id>}``, ``scores`` holds each
        cell's ``Confidence`` as a string in visiting order, and ``merged_cells``
        holds the ids listed in the first relationship of the *first* block
        referenced by each MERGED_CELL relationship.
    """
    grid = {}
    confidences = []
    merged_ids = []
    for rel in table_result['Relationships']:
        rel_type = rel['Type']
        if rel_type == 'MERGED_CELL':
            first_merged_block = blocks_map[rel['Ids'][0]]
            merged_ids.extend(first_merged_block['Relationships'][0]['Ids'])
        if rel_type == 'CHILD':
            for cell_id in rel['Ids']:
                block = blocks_map[cell_id]
                if block['BlockType'] != 'CELL':
                    continue
                row_key = block['RowIndex']
                col_key = block['ColumnIndex']
                row = grid.setdefault(row_key, {})
                # Confidence is kept as text because it is later concatenated into CSV output.
                confidences.append(str(block['Confidence']))
                row[col_key] = {"text": get_text(block, blocks_map), "ids": cell_id}
    return grid, confidences, merged_ids


def get_text(result, blocks_map):
    """Concatenate the text of a block's CHILD blocks.

    Each WORD child contributes its ``Text`` followed by a space; a word that
    contains commas and is numeric once the commas are removed is wrapped in
    double quotes. Each SELECTION_ELEMENT child whose ``SelectionStatus`` is
    ``'SELECTED'`` contributes ``'X '``. Returns ``''`` when ``result`` has no
    ``'Relationships'`` key.
    """
    if 'Relationships' not in result:
        return ''

    fragments = []
    for rel in result['Relationships']:
        if rel['Type'] != 'CHILD':
            continue
        for child_id in rel['Ids']:
            child = blocks_map[child_id]
            kind = child['BlockType']
            if kind == 'WORD':
                token = child['Text']
                # Quote comma-grouped numbers so the comma is not read as a CSV separator.
                if "," in token and token.replace(",", "").isnumeric():
                    token = '"' + token + '"'
                fragments.append(token + ' ')
            if kind == 'SELECTION_ELEMENT' and child['SelectionStatus'] == 'SELECTED':
                fragments.append('X ')
    return ''.join(fragments)


def get_table_csv_results(response, blocks_map=None):
    """Render every TABLE block of a Textract response as CSV-like text.

    Parameters
    ----------
    response : dict
        Textract response; only ``response['Blocks']`` is read.
    blocks_map : dict, optional
        Existing ``{block id: block}`` lookup. When omitted or empty, a fresh
        map is built here from all blocks of ``response``.

    Returns
    -------
    str or tuple
        The literal ``"<b> NO Table FOUND </b>"`` when the response contains no
        TABLE block; otherwise ``(csv_text, rwss, merged_cellss)`` with one
        entry per table. ``csv_text[i]`` is cumulative: it holds the text of
        tables ``0..i``, not of table ``i`` alone.
    """
    blocks = response['Blocks']

    # Decide once, before the loop, whether the map has to be built here.
    # Re-checking `not blocks_map` per block would stop after the first insert
    # and leave every later block unmapped.
    build_map = not blocks_map
    if build_map:
        blocks_map = {}

    table_blocks = []
    for block in blocks:
        if build_map:
            blocks_map[block['Id']] = block
        if block['BlockType'] == "TABLE":
            table_blocks.append(block)

    if not table_blocks:
        return "<b> NO Table FOUND </b>"

    rwss = []
    merged_cellss = []
    csv_text = []
    csv = ''
    for index, table in enumerate(table_blocks):
        # Tables are numbered from 1 in the generated text.
        word, rws, merged_cells = generate_table_csv(table, blocks_map, index + 1)
        csv += word
        csv += '\n\n'
        rwss.append(rws)
        merged_cellss.append(merged_cells)
        csv_text.append(csv)

    return csv_text, rwss, merged_cellss

def generate_table_csv(table_result, blocks_map, table_index):
    """Build the CSV-like text for one table plus its cell confidence scores.

    Returns ``(csv, rows, merged_cells)``: ``csv`` starts with a
    ``Table: Table_<table_index>`` heading, lists each row's cell texts
    separated by commas, then lists the confidence scores wrapped to the
    column count of the last row visited. ``rows`` and ``merged_cells`` are
    passed through from ``get_rows_columns_map``.
    """
    rows, scores, merged_cells = get_rows_columns_map(table_result, blocks_map)

    chunks = ['Table: {0}\n\n'.format('Table_' + str(table_index))]

    # Cell text, one output line per table row.
    for cells in rows.values():
        for cell in cells.values():
            col_indices = len(cells)
            chunks.append('{}'.format(cell['text']) + ",")
        chunks.append('\n')

    # Confidence scores, broken into lines of `col_indices` entries.
    chunks.append('\n\n Confidence Scores % (Table Cell) \n')
    for position, score in enumerate(scores, start=1):
        chunks.append(score + ",")
        if position % col_indices == 0:
            chunks.append('\n')

    chunks.append('\n\n\n')
    return ''.join(chunks), rows, merged_cells

def csv_creator(file_name, blocks_map=None):
    """Convert the tables of a Textract response into pandas DataFrames.

    Parameters
    ----------
    file_name : dict
        Despite the name, this is handed straight to ``get_table_csv_results``
        as the Textract response.
    blocks_map : dict, optional
        ``{block id: block}`` lookup. It is iterated below to find the TABLE
        block (and hence the page) of each table, so it must be supplied
        whenever the response contains tables.

    Returns
    -------
    str or tuple
        ``""`` when no table was found; otherwise
        ``(df_list, header_list, page_list, rows)`` with one DataFrame, one
        header and one page number per table. A header is the list of cell
        texts of row 1 when row 1 and row 2 have different lengths (row 1 is
        then treated as a title and row 2 as the column names), otherwise
        ``""``. ``rows`` is the per-table row/column map from
        ``get_table_csv_results``.
    """
    import pandas as pd

    result = get_table_csv_results(file_name, blocks_map)
    if "NO Table FOUND" in result:
        return ""

    rows, merged_cells = result[1], result[2]
    page_list = []
    table_string = ''
    table_list = []
    for items in rows:
        table_dict = {}
        for row_index, cols in items.items():
            table_dict[row_index] = []
            for col_index, text in cols.items():
                # NOTE(review): `merged_cells` is the per-table list of lists returned by
                # get_table_csv_results, so a cell-id string is never `in` it and this
                # branch does not run for string ids. Left unchanged; confirm the intended
                # merged-cell handling before relying on it.
                if text['ids'] in merged_cells:
                    table_string += text["text"]
                    if table_dict[row_index]:
                        table_dict[row_index].pop(-1)
                    table_dict[row_index].append(table_string)
                else:
                    table_string += '{}'.format(text['text']) + ","
                    table_dict[row_index].append(text["text"])
            table_string += '\n'
        table_list.append(table_dict)

        # The page of a table is taken from the TABLE block whose first relationship
        # lists the last cell visited above.
        table_blocks = [b for b in blocks_map.values() if b['BlockType'] == "TABLE"]
        cell_table = [b for b in table_blocks if text['ids'] in b['Relationships'][0]['Ids']]
        page_list.append(cell_table[0]['Page'])

    df_list = []
    header_list = []
    for table_dict in table_list:
        # Reset per table so a title row found in an earlier table is not
        # reported as the header of a later table that has none.
        header = ""
        if len(table_dict) > 1:
            if len(table_dict.get(1, [])) != len(table_dict.get(2, [])):
                # Row 1 has a different width than row 2: treat it as a title row.
                header = table_dict.pop(1)
                columns = table_dict.pop(2)
            elif any(table_dict[1]):
                columns = table_dict.pop(1)
            else:
                # Row 1 is entirely empty text; use row 2 as the column names.
                columns = table_dict.pop(2)

            try:
                df = pd.DataFrame.from_dict(table_dict, orient='index', columns=columns)
            except Exception:
                # Rows that do not fit the column names: fall back to an empty frame
                # that still carries the column names.
                df = pd.DataFrame.from_dict({}, orient='index', columns=columns)
        else:
            # Single-row table: the only row becomes the column names.
            columns = table_dict.pop(1)
            df = pd.DataFrame.from_dict({}, orient='index', columns=columns)
        df_list.append(df)
        header_list.append(header)

    return df_list, header_list, page_list, rows


In [None]:
import boto3

def extract_words_in_bounding_box(textract_response, holder):
    """Group the LINE text of a Textract response by the layout regions in ``holder``.

    Parameters
    ----------
    textract_response : dict
        Textract response; its ``'Blocks'`` list is scanned once per layout item.
    holder : iterable of dict
        Layout items, each with ``'Page'``, ``'BoundingBox'`` and ``'Type'`` keys.
        The per-page accumulators restart whenever an item's page differs from
        the previous item's page, so a page that is revisited later has its
        earlier entries replaced.

    Returns
    -------
    tuple of four dicts, each keyed by page number
        ``words_in_box``     line texts inside non-table regions, with one
                             ``"<table>"`` placeholder per LAYOUT_TABLE item;
        ``table_lines``      line texts inside table regions; the first line
                             recorded for a table item carries a ``" xxxxxx"`` suffix;
        ``entire_doc_lines`` every line text inside any region (tables included);
        ``ground_truth_doc`` like ``entire_doc_lines`` but with ``"<table>"`` /
                             ``"</table>"`` markers inserted around table content.

    A line counts as "inside" a region when its box lies within the region's box
    after widening each edge by 1.5 % of that edge's own coordinate value.
    """
    # Counts emitted "<table>" placeholders; not part of the return value.
    countt=0
    words_in_box = {}
    page=-1
    words=[]
    ground_truth_lines=[]
    ground_truth_doc={}
    lines=[]
    table_lines={}
    entire_doc_lines={}
    # Layout types seen so far, used to detect "previous item was a table".
    item_type=[]
    for item in holder:        
        target_page=item['Page']
        bounding_box=item['BoundingBox']
        layout_type=item['Type']
        item_type.append(layout_type)
        # One-shot flags, re-armed for every layout item:
        #   comptroller    -> "<table>" placeholder not yet added to `words`
        #   orchestrator   -> first table line (the " xxxxxx"-marked one) not yet recorded
        #   g_orchestrator -> "<table>" / "</table>" marker not yet added to ground truth
        comptroller=1
        orchestrator=1
        g_orchestrator=1
        # New page: start fresh accumulator lists.
        if page!= item['Page']:
            words=[]
            lines=[]
            ground_truth_lines=[]
        page=item['Page']
        for block in textract_response['Blocks']:
            # Only LINE blocks on the item's page that carry geometry are considered.
            if  (
                block['BlockType'] == 'LINE' and
                'Page' in block and
                block['Page'] == target_page and
                'Geometry' in block 
            ):
                box = block['Geometry']['BoundingBox']

                # Containment test (with tolerance) feeding the full-document and
                # ground-truth views, for every layout type.
                if (
                    bounding_box['Left']-0.015*bounding_box['Left'] <= box['Left'] and
                    bounding_box['Top']-0.015*bounding_box['Top'] <= box['Top'] and
                    (bounding_box['Left'] + bounding_box['Width'])+0.015*(bounding_box['Left'] + bounding_box['Width']) >= box['Left'] + box['Width'] and
                    (bounding_box['Top'] + bounding_box['Height'])+0.015*(bounding_box['Top'] + bounding_box['Height']) >= box['Top'] + box['Height']
                ):
                    lines.append(block['Text'])
                    entire_doc_lines[block['Page']]=lines
                    
                    # Opening marker before the first contained line of a table item.
                    if layout_type== 'LAYOUT_TABLE' and g_orchestrator==1:
                        ground_truth_lines.append("<table>")
                        g_orchestrator=0
                    # Closing marker before the first contained line of the item that
                    # directly follows a table item.
                    if len(item_type)>1 and item_type[-1]!='LAYOUT_TABLE' and item_type[-2]=='LAYOUT_TABLE' and g_orchestrator==1:
                        ground_truth_lines.append("</table>")
                        g_orchestrator=0
                    ground_truth_lines.append(block['Text'])
                    ground_truth_doc[block['Page']]=ground_truth_lines
                    
                    
                # Non-table regions: contained lines go to the per-page word list.
                if layout_type !='LAYOUT_TABLE':
                    box = block['Geometry']['BoundingBox']

                    # Same containment test as above.
                    if (
                        bounding_box['Left']-0.015*bounding_box['Left'] <= box['Left'] and
                        bounding_box['Top']-0.015*bounding_box['Top'] <= box['Top'] and
                        (bounding_box['Left'] + bounding_box['Width'])+0.015*(bounding_box['Left'] + bounding_box['Width']) >= box['Left'] + box['Width'] and
                        (bounding_box['Top'] + bounding_box['Height'])+0.015*(bounding_box['Top'] + bounding_box['Height']) >= box['Top'] + box['Height']
                    ):
                        words.append(block['Text'])
                        words_in_box[block['Page']]=words
                # Table regions: emit one placeholder, then collect the table's own lines.
                if layout_type =='LAYOUT_TABLE':
                    # The placeholder is added on the first LINE block of the page,
                    # before (and independent of) the containment test below.
                    if comptroller ==1:
                        countt+=1
                        words.append("<table>")
                        words_in_box[block['Page']]=words
                        comptroller=0
                    
                    box = block['Geometry']['BoundingBox']

                    # Same containment test as above.
                    if (
                        bounding_box['Left']-0.015*bounding_box['Left'] <= box['Left'] and
                        bounding_box['Top']-0.015*bounding_box['Top'] <= box['Top'] and
                        (bounding_box['Left'] + bounding_box['Width'])+0.015*(bounding_box['Left'] + bounding_box['Width']) >= box['Left'] + box['Width'] and
                        (bounding_box['Top'] + bounding_box['Height'])+0.015*(bounding_box['Top'] + bounding_box['Height']) >= box['Top'] + box['Height']
                    ):
                        if block['Page'] in table_lines:
                            # Page already has table lines: mark the first new line of
                            # this table item, skip text that is already recorded.
                            if orchestrator==1:
                                if not (block['Text'] in table_lines[block['Page']] or block['Text']+" xxxxxx" in table_lines[block['Page']]):
                                    table_lines[block['Page']].extend([block['Text']+" xxxxxx"])
                                    orchestrator=0
                            else:
                                if not block['Text'] in table_lines[block['Page']]:
                                    table_lines[block['Page']].extend([block['Text']])
                        else:
                            # First table line on this page: always marked.
                            table_lines[block['Page']]=[block['Text']+" xxxxxx"]
                            orchestrator=0

    return words_in_box, table_lines,entire_doc_lines, ground_truth_doc


## EXTRACT DOCUMENT INTELLIGENTLY PRESERVING THE ORDER AND LAYOUT OF THE DOCUMENT CONTENT

In [None]:
# poll for response
import time

# Keep asking for the job status while Textract reports it as still running.
# Stopping on any terminal status avoids spinning forever when the job fails.
status = ""
while True:
    responses1 = TEXTRACT.get_document_analysis(
        JobId=response['JobId']
    )
    status = responses1['JobStatus']
    if status != 'IN_PROGRESS':
        break
    time.sleep(5)

if status != 'SUCCEEDED':
    raise RuntimeError(
        f"Textract job {response['JobId']} finished with status {status}: "
        f"{responses1.get('StatusMessage', 'no status message returned')}"
    )

In [None]:

# Accumulators shared by the rest of this cell.
geom=[]                      # layout regions: BoundingBox / Page / Type
blocks_map = {}              # block id -> block, across all result pages
doc=[]
words_only=[]
table_holder={}
table_lines={}
non_layout_table_holder={}
ground_truth_doc={}
next_tokens=[]
complete_response_list=[]

# Layout block types whose regions are used to order the text
# (LAYOUT_FIGURE and LAYOUT_FOOTER are deliberately left out).
layout_block_types = (
    'LAYOUT_HEADER',
    'LAYOUT_PAGE_NUMBER',
    'LAYOUT_SECTION_HEADER',
    'LAYOUT_TEXT',
    'LAYOUT_TITLE',
    'LAYOUT_TABLE',
)

# Walk every result page of the job: the first request carries no token,
# each following request carries the NextToken of the previous response.
paging_kwargs = {}
while True:
    responses1 = TEXTRACT.get_document_analysis(JobId=response['JobId'], **paging_kwargs)
    for block in responses1['Blocks']:
        holder={}
        if block['BlockType'] in layout_block_types:
            holder = {
                'BoundingBox': block['Geometry']['BoundingBox'],
                "Page": block['Page'],
                "Type": block['BlockType'],
            }
            geom.append(holder)
        blocks_map[block['Id']] = block
    next_token = responses1.get('NextToken', "")
    if not next_token:
        break
    paging_kwargs = {'NextToken': next_token}

# Second pass over the job results: fetch the first result page again, then
# every following page, and for each one
#   * group its LINE text by the layout regions in `geom`, and
#   * convert its tables to DataFrames, filed under the table's page number.
responses1 = TEXTRACT.get_document_analysis(
    JobId=response['JobId'],
    # MaxResults=123,
    # NextToken=nt,
)
# result = (words_in_box, table_lines, entire_doc_lines, ground_truth_doc)
result = extract_words_in_bounding_box(responses1, geom)
doc.append(result[0])
# dict.update replaces a page's entry if a later result page reports the same page number.
table_lines.update(result[1])
non_layout_table_holder.update(result[2])
ground_truth_doc.update(result[-1])

# csv_creator returns "" when this result page holds no table.
table_result=csv_creator(responses1,blocks_map)
if table_result:
    table, header,pages=table_result[0], table_result[1],table_result[2]
    
    # table_holder[page] is a flat list: [header, DataFrame, header, DataFrame, ...]
    for ids,page in enumerate(pages):
        doc
        if page in table_holder:
            table_holder[page].extend([header[ids]]+[table[ids]])
        else:            
            dummy=[]
            dummy.extend([header[ids]]+[table[ids]])
            table_holder[int(page)]= dummy
# Remaining result pages, if any: same processing as above.
if "NextToken" in responses1.keys():
    next_token=responses1['NextToken']           
    while next_token:
        responses1 = TEXTRACT.get_document_analysis(
            JobId=response['JobId'],
            # MaxResults=123,
            NextToken=next_token,
        )

        table_result=csv_creator(responses1,blocks_map)
        if table_result:
            table, header,pages=table_result[0], table_result[1],table_result[2]

            for ids,page in enumerate(pages):
                if page in table_holder:
                    table_holder[page].extend([header[ids]]+[table[ids]])
                else:      
                    dummy=[]
                    dummy.extend([header[ids]]+[table[ids]])
                    table_holder[int(page)]= dummy

          
        # Record the token that produced this result page.
        next_tokens.append(next_token)
        result = extract_words_in_bounding_box(responses1, geom)    
        doc.append(result[0])
        table_lines.update(result[1])
        non_layout_table_holder.update(result[2])   
        ground_truth_doc.update(result[-1])      
        # Stop once Textract no longer returns a continuation token.
        if "NextToken" in responses1.keys():
            next_token=responses1['NextToken']
        else:
            next_token=""
            break

# Reconcile the "<table>" placeholders produced from LAYOUT_TABLE regions with the
# tables actually parsed from TABLE blocks (table_holder), page by page.
#   d: {page: list of line texts / "<table>" placeholders}
for d in doc:
    for k, v in d.items():
        layout_table_count=v.count('<table>')
        if k in table_holder.keys():
            non_layout_table_count=len(table_holder[k])/2 # Table_holder has header and tables in list
        else:
            non_layout_table_count=0
        if layout_table_count == non_layout_table_count:
            continue
        elif non_layout_table_count==0:
            # Layout saw table regions but no TABLE block was parsed on this page:
            # put the plain lines of the page (non_layout_table_holder[k]) back in
            # place of the placeholder(s).
            table_index_posts=[index for index, element in enumerate(v) if element == '<table>']
            merged_list_refactored = merge_consecutive_runs_refactored(table_index_posts)

            for ids in merged_list_refactored:
                # index of all tables in extracted text (recomputed: v is edited in this loop)
                dynamic_table_index_posts=[index for index, element in enumerate(v) if element == '<table>']
                # handle consecutive tables series
                dynamic_merged_list_refactored= merge_consecutive_runs_refactored(dynamic_table_index_posts)
                table_index_pos=v.index("<table>")
                if table_index_pos+1 in dynamic_table_index_posts: # if consecutive tables
                    if len(dynamic_merged_list_refactored)>1:
                        next_non_consecutive_item=dynamic_merged_list_refactored[dynamic_merged_list_refactored.index(table_index_pos)+1]
                        last_consecutive_series_item=dynamic_table_index_posts[dynamic_table_index_posts.index(next_non_consecutive_item)-1]
                    else:
                        last_consecutive_series_item=dynamic_table_index_posts[-1]

                    # NOTE(review): the comment on the original line says "check that table
                    # in text is not the last item", but an index into v can never equal
                    # len(v), so this condition always holds and the else branch is
                    # unreachable. Left unchanged; confirm the intended check.
                    if dynamic_merged_list_refactored[0]!=len(v):
                        # index for the word trailing the table, handle for multiple occurence of that word- in the doc.
                        # The lookup uses this page's own lines (key k); a fixed page number here
                        # would search the wrong page and fail on documents without that page.
                        index_stopper=[
                            i for i in range(len(non_layout_table_holder[k]) - 1)
                            if non_layout_table_holder[k][i] == v[last_consecutive_series_item+1]
                            and non_layout_table_holder[k][i + 1] == v[last_consecutive_series_item+2]
                        ][0]

                        replace_text=non_layout_table_holder[k][table_index_pos-1:non_layout_table_holder[k].index(v[last_consecutive_series_item+1],index_stopper)][1:]
                        v[table_index_pos:last_consecutive_series_item+1]=replace_text
                    else:
                        replace_text=non_layout_table_holder[k][table_index_pos-1:][1:]
                        v[table_index_pos:last_consecutive_series_item+1]=replace_text
                else:  # No consecutive table series
                    # Replace the single placeholder with the page lines found strictly
                    # between the line before it and the line after it.
                    replace_text=non_layout_table_holder[k][non_layout_table_holder[k].index(v[table_index_pos-1]):non_layout_table_holder[k].index(v[table_index_pos+1])][1:]
                    v[table_index_pos:table_index_pos+1]=replace_text
        elif non_layout_table_count>layout_table_count:
            # More parsed tables than placeholders: compare word counts and, if the page's
            # full text is more than 20 % longer than text + matched tables, add one more
            # placeholder right after the first one.
            non_layout_page_word_count=[]
            for i in range(layout_table_count*2):
                non_layout_page_word_count.extend(table_holder[k][i])
            length_of_non_layout_page=len(" ".join(non_layout_page_word_count).split())+len(" ".join(v).split())-layout_table_count
            length_of_layout_page=len(" ".join(non_layout_table_holder[k]).split())
            if length_of_layout_page>length_of_non_layout_page and (abs(length_of_layout_page-length_of_non_layout_page)/max(length_of_layout_page,length_of_non_layout_page))>0.2:
                v.insert(v.index("<table>")+1,"<table>")

# Number of "<table>" placeholders left on each page after reconciliation.
table_count_list={}
for d in doc:
    for k, v in d.items():
        table_count_list[k]=v.count('<table>')


# Page number repeated once per placeholder; only its emptiness is used below.
table_count_lists = []
for k, v in table_count_list.items():
    if v != 0:
        table_count_lists.extend([k]*v)

# Final result: {page number: list of lines, with tables inlined as text}.
doc_layout_extract={}
if not len(table_count_lists) or  not len([x["Page"] for x in  geom if x['Type']=="LAYOUT_TABLE"]):
    print("NO TABLES FOUND")
    for item in doc:
        for k, v in item.items():
            doc_layout_extract[k]=v
else:
    for d in doc:
        for k, v in d.items():
            placeholder_count=v.count("<table>")
            if placeholder_count>=1:
                txt=v
                # First line recorded for each table region on this page (tagged "xxxxxx"
                # when the lines were collected). Membership has to be tested per element:
                # list.count("xxxxxx") only counts elements that are exactly "xxxxxx" and
                # would therefore always be 0. A page without table lines yields [].
                marked_lines=[x for x in table_lines.get(k, []) if "xxxxxx" in x]
                ids=0
                for count in range(placeholder_count):
                    # table_holder[k] is [header, DataFrame, header, DataFrame, ...]
                    header_and_table=f"{table_holder[k][ids]}\n<tables>{table_holder[k][ids+1].to_csv(index=False, sep='|')}</tables>"
                    if count<len(marked_lines):
                        header_before_table=marked_lines[count].split("xxxxxx")[0]
                        # Prepend the table's first line unless the rendered table already contains it.
                        if not header_before_table in header_and_table:
                            header_and_table=f"{header_before_table}\n{header_and_table}"

                    # Replace the next remaining placeholder with the rendered table.
                    table_holder_index=txt.index("<table>")
                    txt[table_holder_index]=header_and_table
                    ids+=2
                doc_layout_extract[k]=txt
            else:
                doc_layout_extract[k]=v
    # Cross-check against the ground-truth view: make sure each rendered table starts
    # with the line that directly follows its "<table>" marker there.
    for k, v in ground_truth_doc.items():
        page_table_idx=[doc_layout_extract[k].index(x) for x in doc_layout_extract[k] if "<tables>" in x]
        ground_truth_table_ids=[ground_truth_doc[k].index(x) for x in ground_truth_doc[k] if "<table>" in x]
        if page_table_idx:
            for table_count,table_idx in enumerate(ground_truth_table_ids):
                page_tab=doc_layout_extract[k][page_table_idx[table_count]]
                page_tab_to_list=page_tab.split("\n")
                ground_truth_table_header=ground_truth_doc[k][table_idx+1]
                # First non-empty of the first two lines of the rendered table.
                page_tab_headers=page_tab_to_list[0] if page_tab_to_list[0] else page_tab_to_list[1]
                if ground_truth_table_header:
                    found = ground_truth_table_header in page_tab_headers
                    if found:
                        continue
                    else:
                        doc_layout_extract[k][page_table_idx[table_count]]=f"{ground_truth_table_header}\n"+page_tab
                else:
                    continue

In [None]:
## Extracted document content as a dictionary: key = page number, value = list of that page's lines (tables inlined as text)
doc_layout_extract

In [None]:
### Merge the lines of the extracted pdf dictionary into a single coherent text:
### lines are joined with newlines and every page is followed by a blank line.
document_text = "".join(
    "\n".join(page_lines) + "\n\n" for page_lines in doc_layout_extract.values()
)
print(document_text)

## Different Prompt Template

#### Summarization prompt template

In [None]:
# Summarization prompt, wrapped in "\n\nHuman:" / "\n\nAssistant:" turn markers.
# This is an f-string: document_text is interpolated when this cell runs, so re-run the
# cell whenever document_text changes. The model is asked to reason inside <thinking>
# tags first; the cell that calls the LLM strips that part from the answer.
prompt2=f"""\n\nHuman:
You are a tenured medical doctor.

Here is a patient's medical documents below:
<medical_documents>
{document_text}
</medical_documents>

Read through the medical documents and understand the patient's record. Think about all the medical information captured within the documents including any:
- medications,
- dates and places,
- injury/illness,
- comorbidities,
- findings,
- treatments,
- diagnosis.
Put your thoughts in <thinking> xml tags.

After reading and thinking through the document, generate a comprehensive medical summary of the patient that captures all medical information including:
- medications,
- dates and places,
- injury/illness: how/when and where it occured,,
- past comorbidities and impact on any current injury,
- subjective findings: patient reported symptoms, feelings pain level etc.,
- objective findings: pysician observed findings,
- treatments: current and past,
- diagnosis: diagnosis codes with their full descriptions and explanation.

Your summary should be sectioned by each topics aligned above.\n\nAssistant:"""

In [None]:
from langchain.llms.bedrock import Bedrock
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Generation settings passed to the model (top_k / top_p are left at their defaults).
inference_modifier = {'max_tokens_to_sample':1000,
                      "temperature":0.1,
                     }
llm = Bedrock(model_id='anthropic.claude-v2', client=bedrock_runtime, model_kwargs = inference_modifier,
              streaming=True,  # Toggle this to turn streaming on or off
              callbacks=[StreamingStdOutCallbackHandler() ])

# NOTE: this rebinds `response`, which the Textract cells use for the job handle
# (response['JobId']); re-running those cells after this one requires starting the
# Textract job again first.
response = llm(prompt2)

# Separate the model's <thinking> section from the summary.
# The split only happens when both tags are present, so output that was cut off before
# the closing tag is kept intact instead of raising; surrounding whitespace is stripped
# rather than assuming exactly one character follows each tag.
thought_step = ""
_open_tag, _close_tag = '<thinking>', '</thinking>'
_start = response.find(_open_tag)
_end = response.find(_close_tag, _start + len(_open_tag)) if _start != -1 else -1
if _end != -1:
    thought_step = response[_start + len(_open_tag):_end].strip()
    response = response[_end + len(_close_tag):].lstrip()


#### Q&A prompt template

In [None]:
# Question interpolated into the Q&A prompt (prompt3); re-run the prompt cell after editing it.
question="What date did the patient visit the clinic?"

In [None]:
# Q&A prompt, wrapped in "\n\nHuman:" / "\n\nAssistant:" turn markers.
# This is an f-string: document_text and question are interpolated when this cell runs,
# so re-run it whenever either of them changes.
prompt3=f"""\n\nHuman:
You are a tenured medical doctor. You will be providing factual answers about a patient based on their medical document.

Here is a patient's medical documents below:
<medical_documents>
{document_text}
</medical_documents>

After reading through the document, provide an answer to a user question below:
{question}
\n\nAssistant:"""

In [None]:
from langchain.llms.bedrock import Bedrock
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Generation settings passed to the model (top_k / top_p are left at their defaults).
inference_modifier = {'max_tokens_to_sample':500,
                      "temperature":0.1,
                     }
llm = Bedrock(model_id='anthropic.claude-v2', client=bedrock_runtime, model_kwargs = inference_modifier,
              streaming=True,  # Toggle this to turn streaming on or off
              callbacks=[StreamingStdOutCallbackHandler() ])

# NOTE: this rebinds `response`, which the Textract cells use for the job handle
# (response['JobId']); re-running those cells after this one requires starting the
# Textract job again first.
response = llm(prompt3)

# If the model wrapped reasoning in <thinking> tags, separate it from the answer.
# The split only happens when both tags are present, so output that was cut off before
# the closing tag is kept intact instead of raising; surrounding whitespace is stripped
# rather than assuming exactly one character follows each tag.
thought_step = ""
_open_tag, _close_tag = '<thinking>', '</thinking>'
_start = response.find(_open_tag)
_end = response.find(_close_tag, _start + len(_open_tag)) if _start != -1 else -1
if _end != -1:
    thought_step = response[_start + len(_open_tag):_end].strip()
    response = response[_end + len(_close_tag):].lstrip()
