# Building LLM-based applications for writing discharge summaries

In this notebook, we will build an application for writing discharge summaries.

For required packages, please run:

```bash
poetry install
```

There are a few methods explored in this notebook:
1. Stuff with human and system prompt
2. Stuff with human prompt
3. Decompose method
4. MapReduce method
5. Refine method

In [None]:
import os
from typing import Literal
import sys
from pathlib import Path
from dataclasses import dataclass

import nest_asyncio
from dotenv import load_dotenv
from IPython.display import Markdown
from langchain_openai import AzureChatOpenAI
import tiktoken

nest_asyncio.apply()

load_dotenv()

@dataclass
class ModelDefinition:
    """Describe one Azure OpenAI chat model that this notebook can use.

    ``init_model`` reads ``deployment`` and ``name`` when it builds the
    ``AzureChatOpenAI`` client.
    """

    # Passed to AzureChatOpenAI as ``deployment_name``.
    deployment: str
    # Passed to AzureChatOpenAI as ``model_name``.
    name: str
    # Model version string; not read by any code visible in this notebook.
    version: str


# Short aliases that callers use to pick a model; every alias needs a matching
# entry in ``available_models``.
AvailableModels = Literal["gpt-35", "gpt-4o-mini", "gpt-4-turbo"]

# Alias -> deployment details used to construct the chat client.
available_models: dict[AvailableModels, ModelDefinition] = {
    "gpt-35": ModelDefinition(
        deployment="gpt_35_16k",
        name="gpt-35-turbo-16k",
        version="0613",
    ),
    "gpt-4o-mini": ModelDefinition(
        deployment="gpt-4o-mini",
        name="gpt-4o-mini",
        version="2024-07-18",
    ),
    "gpt-4-turbo": ModelDefinition(
        deployment="gpt-4-turbo",
        name="gpt-4",
        version="turbo-2024-04-09",
    ),
}

def init_model(model_name: AvailableModels, temperature: float):
    """Create an ``AzureChatOpenAI`` client for one of the configured models.

    Args:
        model_name: Alias of the model; must be a key of ``available_models``.
        temperature: Temperature value forwarded to the client.

    Returns:
        An ``AzureChatOpenAI`` instance built from the model's deployment
        and model name.

    Raises:
        ValueError: If ``model_name`` has no entry in ``available_models``.
    """
    definition = available_models.get(model_name)
    if definition:
        return AzureChatOpenAI(
            deployment_name=definition.deployment,
            model_name=definition.name,
            temperature=temperature,
        )
    raise ValueError(f"Model {model_name} not found")

def count_tokens(text: str, model_name: AvailableModels) -> int:
    """Count the number of tokens in the text for a specific model.

    Args:
        text: The text to tokenize.
        model_name: Either an alias from ``available_models`` (for example
            ``"gpt-35"``) or the underlying model name stored in
            ``ModelDefinition.name`` (for example ``"gpt-35-turbo-16k"``),
            which is the value ``init_model`` hands to the chat client.

    Returns:
        The number of tokens ``tiktoken`` produces for ``text``.

    Raises:
        ValueError: If no tokenizer encoding is known for ``model_name``.
    """
    # Map underlying model names to known tokenizer encodings
    model_to_encoding = {
        "gpt-35-turbo-16k": "cl100k_base",
        "gpt-4o-mini": "o200k_base",
        "gpt-4": "cl100k_base",
    }

    # The signature advertises aliases, but the table above is keyed by the
    # underlying model name. Translate an alias first; anything that is not
    # an alias is looked up as-is so existing callers keep working.
    definition = available_models.get(model_name)
    resolved_name = definition.name if definition is not None else model_name

    encoding_name = model_to_encoding.get(resolved_name)
    if not encoding_name:
        raise ValueError(f"Encoding not found for model {model_name}")

    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(text))

In [None]:
# Chat model shared by every technique below.
llm = init_model("gpt-35", temperature=0.0)

# The project root is the parent of the current working directory; append it
# to sys.path (presumably so project-local modules can be imported later).
project_folder = Path.cwd().parent
sys.path.append(os.fspath(project_folder))

output_folder = project_folder / "data" / "output"
case_dir = project_folder / "data" / "processed"

# Input cases must already exist; the output folder is created on demand.
if not case_dir.exists():
    raise FileNotFoundError(f"Cases directory not found at {case_dir}")
if not output_folder.exists():
    output_folder.mkdir(parents=True)

In [None]:
# Local files
from helpers import read_json_file, read_markdown_file

# Read cases

In [None]:
# Filter those that are _original.md - ignore other languages for now.
# glob() yields paths in arbitrary order, so sort to make the selected case
# reproducible and to keep the markdown and JSON picks aligned (assumes both
# folders use matching file stems - TODO confirm).
file_list = sorted(
    f for f in (case_dir / 'merged').glob('*.md')
    if f.stem.endswith('_original') and 'Orthopaedic' in f.stem
)
if not file_list:
    raise FileNotFoundError(
        f"No Orthopaedic '*_original.md' case found in {case_dir / 'merged'}"
    )
work_with_file = file_list[0]

case = read_markdown_file(path=work_with_file)

json_file_list = sorted(
    f for f in (case_dir / 'markdown').glob('*.json')
    if f.stem.endswith('_original') and 'Orthopaedic' in f.stem
)
if not json_file_list:
    raise FileNotFoundError(
        f"No Orthopaedic '*_original.json' case found in {case_dir / 'markdown'}"
    )

# Read and parse the json file as a dictionary
json_file = json_file_list[0]

case_object = read_json_file(json_file)

In [None]:
# Token count of the case text for the model configured on ``llm``
count_tokens(case, model_name=llm.model_name)

# Basic method (Naive)

In [None]:
from basic.basic import basic_chain

# Send the whole case text through the ``dual`` variant of the basic chain
# (presumably the system + human prompt "stuff" method - confirm in basic.basic).
out = basic_chain(llm=llm).dual.invoke({"notes": case})
# Last expression of the cell: display the output as Markdown
Markdown(out)

# Decompose Technique

In [None]:
from decompose import single_decompose

# Uses the parsed JSON case (``case_object``) rather than the merged markdown text
out = single_decompose(case=case_object, llm=llm)
# Last expression of the cell: display the output as Markdown
Markdown(out)

# Map Reduce Prompt Techniques

In [None]:
from map_reduce import create_mapreduce_df

# NOTE(review): meaning of ``n=2`` is defined in map_reduce (possibly the
# number of runs) - confirm there.
mapreduce_sum = create_mapreduce_df(case=case_object, llm=llm, n=2)
# Persist the result under data/output
mapreduce_sum.to_csv(output_folder / "mapreduce_final.csv")

# Refine Prompt Techniques

In [None]:
from refine import create_refine_df

# NOTE(review): meaning of ``n=2`` is defined in refine (possibly the
# number of runs) - confirm there.
refine_sum = create_refine_df(case=case_object, llm=llm, n=2)
# Persist the result under data/output
refine_sum.to_csv(output_folder / "refine_final.csv")