In [1]:
!pip install pymupdf google-generativeai




In [32]:
# Read the Gemini API key from the environment so the secret is never stored in
# the notebook. Set the GOOGLE_API_KEY environment variable before running.
# (A `global` statement at module level has no effect, so it was dropped.)
import os

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")  # "" when unset, like the old placeholder


# Final Approach

## Direct prompt

In [22]:
import fitz  # PyMuPDF for PDF text extraction
import google.generativeai as genai  # Google Gemini API
import json

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Extracts the plain text of every page of a PDF and echoes it to stdout.

    Parameters:
    - pdf_path (str): Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
    - str: The text of all pages, in page order, joined with newlines.
    """
    # The context manager closes the document (and its file handle) even when a
    # page fails to extract; previously the document was left open.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    print(text)  # kept so the extracted text still shows up in the cell output
    return text

# Step 2: Configure Google Gemini API
def setup_gemini(api_key):
    """Configures the Google Gemini API.

    Parameters:
    - api_key: Value forwarded as the ``api_key`` keyword of ``genai.configure``.
    """
    genai.configure(api_key=api_key)

# Step 3: Send Text to Google Gemini for FHIR Conversion
def generate_fhir_bundle(prompt, model_name="gemini-1.5-pro"):
    """Sends a prompt to a Gemini model and returns the text of its reply.

    Despite the name, the function is prompt-agnostic: this notebook also calls
    it with the JSON-comparison prompts.

    Parameters:
    - prompt (str): Full prompt text passed to ``generate_content``.
    - model_name (str): Gemini model identifier. Defaults to the model this
      notebook was written against.

    Returns:
    - str: ``response.text`` exactly as returned by the API; callers that need
      JSON run it through ``extract_fhir_json`` afterwards.
    """
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)
    reply_text = response.text
    print(reply_text)  # echo the raw reply into the cell output
    return reply_text

# Step 4: Save JSON to a File
def save_json(output_json, filename="fhir_bundle.json"):
    """Writes an already-serialised JSON string to ``filename`` as UTF-8.

    Parameters:
    - output_json (str): Text written verbatim; it is not validated here.
    - filename (str): Destination path; the file is opened in write mode.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        out_file.write(output_json)
    print(f"FHIR Bundle saved as {filename}")

import re
import json

def extract_fhir_json(response_text):
    """
    Extracts the first JSON object embedded in a model response.

    The reply may wrap the JSON in Markdown fences or add commentary around it,
    so parsing starts at the first ``{`` and stops where that object ends; any
    text after the closing brace is ignored.

    Parameters:
    - response_text (str): The full response containing JSON and extra text.

    Returns:
    - dict: The parsed JSON object.

    Raises:
    - ValueError: If the response contains no ``{ ... }`` span, or if the text
      starting at the first ``{`` is not valid JSON.
    """
    start = response_text.find("{")
    # Same "nothing that even looks like an object" condition as before:
    # an opening brace followed somewhere by a closing brace.
    if start == -1 or "}" not in response_text[start:]:
        raise ValueError("No valid JSON found in the response.")

    try:
        # raw_decode parses exactly one JSON value and tolerates trailing text.
        # The previous greedy regex ran up to the LAST "}" in the response, so a
        # brace in trailing commentary made an otherwise valid reply fail.
        fhir_json, _end = json.JSONDecoder().raw_decode(response_text, start)
    except json.JSONDecodeError as e:
        raise ValueError(f"Error decoding JSON: {e}") from e
    return fhir_json



In [23]:

# Main Execution
if __name__ == "__main__":
    # GOOGLE_API_KEY comes from the configuration cell near the top of the notebook.
    # Every input and output of this run lives in one Drive folder.
    bundle_dir = "/content/drive/MyDrive/canada company assignment/bundle"
    pdf_path = f"{bundle_dir}/bundle-lipids.pdf"
    setup_gemini(GOOGLE_API_KEY)

    extracted_text = extract_text_from_pdf(pdf_path)

    # The doubled braces on the last prompt line are f-string escapes for literal { and }.
    prompt = f"""
Convert the following unstructured medical discharge summary into a structured FHIR Bundle JSON.

### **Instructions:**
1. Ensure the response is in **valid JSON format only**. Do not include explanations, comments, or any text outside the JSON structure.
2. Use correct **FHIR resources**, including:
   - **Patient** (Demographics, MRN, Gender, Birthdate)
   - **Condition** (Primary & Secondary Diagnoses)
   - **Procedure** (All procedures performed)
   - **Observation** (Vital signs & Lab results)
   - **MedicationStatement** (Prescribed medications at discharge)
3. Follow the **FHIR R4 standard** and include proper SNOMED CT/LOINC codes where applicable.
4. Ensure all references (e.g., `subject.reference` fields) correctly link to the **Patient resource**.

Discharge Summary:
{extracted_text}

### **Output:**
Return **only** a valid JSON structure. The response must start with `{{` and end with `}}`. Do not include any additional text.
    """

    # generate_fhir_bundle already echoes the reply, so it is not printed a second time here.
    response = generate_fhir_bundle(prompt)
    fhir_bundle = extract_fhir_json(response)
    json_res = json.dumps(fhir_bundle, indent=4)  # Pretty-print the extracted JSON

    save_json(json_res, f"{bundle_dir}/bundle-lipids_output.json")


DISCHARGE SUMMARY
Patient Details:
Name: John Doe
Date of Birth: 15 March 1985
Gender: Male
Medical Record Number: 123456789
Admission Date: 01 February 2025
Discharge Date: 08 February 2025
Attending Physician: Dr. Emily Carter, MD
Consultant: Dr. Robert Sinclair, MD (Cardiology)
--------------------------------------------------
Primary Diagnosis:
- Acute Myocardial Infarction (ST-Elevation)
Secondary Diagnoses:
- Hypertension
- Type 2 Diabetes Mellitus
- Hyperlipidemia
- Obesity (BMI: 32)
Procedures Performed:
- Coronary Angiography (02 February 2025)
- Percutaneous Coronary Intervention (PCI) with Drug-Eluting Stent 
Placement (03 February 2025)
--------------------------------------------------
Clinical Course:
Mr. John Doe was admitted to the hospital on 01 February 2025 via 
the emergency department after experiencing severe chest pain 
radiating to his left arm. Initial ECG showed ST-elevation in leads 
II, III, and aVF, suggestive of an acute inferior wall myocardial 
infarcti

### Evaluation

In [24]:
# Function to load JSON files
def load_json(file_path):
    """Reads ``file_path`` as UTF-8 text and returns the parsed JSON value."""
    with open(file_path, mode="r", encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed

# Main Execution
if __name__ == "__main__":
    # GOOGLE_API_KEY comes from the configuration cell near the top of the notebook.
    setup_gemini(GOOGLE_API_KEY)

    # NOTE(review): these are the bundle-example files, while the direct-prompt cell
    # above writes bundle-lipids_output.json - confirm this is the intended pair.
    json_1 = load_json("/content/drive/MyDrive/canada company assignment/bundle/bundle-example.json")
    json_2 = load_json("/content/drive/MyDrive/canada company assignment/bundle/bundle-example_output.json")

    # Interpolating the parsed objects directly would embed Python reprs (single
    # quotes, True/None) rather than JSON, so serialise them back to JSON text.
    json_1_text = json.dumps(json_1, indent=2, ensure_ascii=False)
    json_2_text = json.dumps(json_2, indent=2, ensure_ascii=False)

    prompt = f"""
    You are an expert in JSON structure analysis. Your task is to compare two JSON documents and determine how similar they are.

### **Instructions:**
1. **Compare both JSONs field by field**, considering:
   - Missing or extra fields.
   - Differences in values (including numbers, strings, and lists).
   - Structural differences (nested objects, ordering).

2. **Generate a similarity score (0% to 100%)** based on:
   - **Matching fields and values**: Higher score.
   - **Minor format differences (case sensitivity, spacing, ordering)**: Ignore if the meaning is unchanged.
   - **Significant differences in keys or values**: Lower score.

3. **Provide an explanation** of what is different:
   - List fields that are **identical**.
   - Highlight fields that have **small differences** (e.g., `"value": "120"` vs `"value": "125"`).
   - Show **missing fields** in one JSON but present in the other.
   - Highlight incorrect or extra information.

### **Example Output:**
✅ **Similarity Score: 92%**
🔹 **Identical Fields:** 45/50 match
⚠️ **Minor Differences:** `patient.name = "John Doe" vs "Doe, John"`
❌ **Missing Fields:** `"followUpDate"` is missing in the generated JSON.

Now, compare the following two JSONs and provide a similarity score along with a summary of differences.

### **JSON 1:**
{json_1_text}

### **JSON 2:**
{json_2_text}

    """

    # generate_fhir_bundle already echoes the reply, so it is not printed a second time here.
    response = generate_fhir_bundle(prompt)

✅ **Similarity Score: 25%**

The two JSON documents share some basic resource types like `MedicationRequest` and the patient ID `f001`, but their overall structure and purpose are very different.  JSON 1 appears to be a search result bundle for `MedicationRequests` related to a patient, while JSON 2 seems to be a collection of resources related to a patient's condition, including the patient themselves, a condition, a medication statement, and a medication request. Therefore, despite a few overlapping elements, they are dissimilar.

🔹 **Identical Fields (with caveats):**

* `resourceType: Bundle` (Both are bundles, but of different types)
*  `entry[0].resource.id: 3123` (The MedicationRequest ID is the same, but other fields within the resource are different.)
* `entry[...].resource.resourceType: MedicationRequest` (Present in both, but with different details and context.)

⚠️ **Minor Differences:**

* `entry[...].resource.status: unknown` (Present in both MedicationRequests, but the o

## Custom-data RAG-based approach


In [26]:

# Main Execution
if __name__ == "__main__":
    # GOOGLE_API_KEY is not set in this cell; it is expected to exist already
    # from the configuration cell near the top of the notebook.

    pdf_path = "/content/drive/MyDrive/canada company assignment/bundle/bundle-lipids.pdf"
    setup_gemini(GOOGLE_API_KEY)

    # Read FHIR JSON Representation file
    # Its whole content is pasted into the prompt below as reference material.
    with open("/content/drive/MyDrive/canada company assignment/FHIR_JSON_Representation.txt", "r", encoding="utf-8") as f:
        fhir_representation = f.read()

    extracted_text = extract_text_from_pdf(pdf_path)


    # Despite the name, this is not a reusable template: the f-string is
    # evaluated right here, so the value is the final prompt text.
    prompt_template = f"""
    Convert the following unstructured medical discharge summary into a structured FHIR Bundle JSON.
    Ensure the FHIR resources include:
    - Patient
    - Condition (Primary and Secondary Diagnoses)
    - Procedure (Procedures performed)
    - Observation (Vital signs and Lab results)
    - MedicationStatement (Medications prescribed)

    The description of representation of FHIR JSON is as follows:
    {fhir_representation}

    Discharge Summary:
    {extracted_text}

    Output only the valid FHIR Bundle JSON with no additional explanations:
    """

    # Prints the complete prompt, including the full reference file and PDF text.
    print(prompt_template)


    # Only ValueError is handled here (extract_fhir_json raises it when no
    # parseable JSON is found); any other exception propagates.
    try:
        response = generate_fhir_bundle(prompt_template)
        json_res = extract_fhir_json(response)
        json_res = json.dumps(json_res, indent=4) # Pretty-print the extracted JSON
        save_json(json_res,"/content/drive/MyDrive/canada company assignment/bundle/bundle-lipids_output_RAG.json")
    except ValueError as e:
        print(f"Error: {e}")

DISCHARGE SUMMARY
Patient Details:
Name: John Doe
Date of Birth: 15 March 1985
Gender: Male
Medical Record Number: 123456789
Admission Date: 01 February 2025
Discharge Date: 08 February 2025
Attending Physician: Dr. Emily Carter, MD
Consultant: Dr. Robert Sinclair, MD (Cardiology)
--------------------------------------------------
Primary Diagnosis:
- Acute Myocardial Infarction (ST-Elevation)
Secondary Diagnoses:
- Hypertension
- Type 2 Diabetes Mellitus
- Hyperlipidemia
- Obesity (BMI: 32)
Procedures Performed:
- Coronary Angiography (02 February 2025)
- Percutaneous Coronary Intervention (PCI) with Drug-Eluting Stent 
Placement (03 February 2025)
--------------------------------------------------
Clinical Course:
Mr. John Doe was admitted to the hospital on 01 February 2025 via 
the emergency department after experiencing severe chest pain 
radiating to his left arm. Initial ECG showed ST-elevation in leads 
II, III, and aVF, suggestive of an acute inferior wall myocardial 
infarcti

### Evaluation

In [27]:
# Function to load JSON files
def load_json(file_path):
    """Opens ``file_path`` (UTF-8) and returns the JSON value it contains."""
    with open(file_path, mode="r", encoding="utf-8") as json_file:
        content = json.load(json_file)
    return content

# Main Execution
if __name__ == "__main__":
    # GOOGLE_API_KEY comes from the configuration cell near the top of the notebook.
    setup_gemini(GOOGLE_API_KEY)

    # NOTE(review): this cell sits under the RAG section but loads
    # bundle-lipids_output.json (written by the direct-prompt cell) rather than
    # bundle-lipids_output_RAG.json - confirm which output is meant to be scored.
    json_1 = load_json("/content/drive/MyDrive/canada company assignment/bundle/bundle-lipids.json")
    json_2 = load_json("/content/drive/MyDrive/canada company assignment/bundle/bundle-lipids_output.json")

    # Interpolating the parsed objects directly would embed Python reprs (single
    # quotes, True/None) rather than JSON, so serialise them back to JSON text.
    json_1_text = json.dumps(json_1, indent=2, ensure_ascii=False)
    json_2_text = json.dumps(json_2, indent=2, ensure_ascii=False)

    prompt = f"""
    You are an expert in JSON structure analysis. Your task is to compare two JSON documents and determine how similar they are.

### **Instructions:**
1. **Compare both JSONs field by field**, considering:
   - Missing or extra fields.
   - Differences in values (including numbers, strings, and lists).
   - Structural differences (nested objects, ordering).

2. **Generate a similarity score (0% to 100%)** based on:
   - **Matching fields and values**: Higher score.
   - **Minor format differences (case sensitivity, spacing, ordering)**: Ignore if the meaning is unchanged.
   - **Significant differences in keys or values**: Lower score.

3. **Provide an explanation** of what is different:
   - List fields that are **identical**.
   - Highlight fields that have **small differences** (e.g., `"value": "120"` vs `"value": "125"`).
   - Show **missing fields** in one JSON but present in the other.
   - Highlight incorrect or extra information.

### **Example Output:**
✅ **Similarity Score: 92%**
🔹 **Identical Fields:** 45/50 match
⚠️ **Minor Differences:** `patient.name = "John Doe" vs "Doe, John"`
❌ **Missing Fields:** `"followUpDate"` is missing in the generated JSON.

Now, compare the following two JSONs and provide a similarity score along with a summary of differences.

### **JSON 1:**
{json_1_text}

### **JSON 2:**
{json_2_text}

    """

    # generate_fhir_bundle already echoes the reply, so it is not printed a second time here.
    response = generate_fhir_bundle(prompt)

✅ **Similarity Score: 7%**

🔹 **Identical Fields:** `resourceType`

⚠️ **Minor Differences:** None

❌ **Missing/Extra Fields & Structural Differences:**  Significant structural differences between the JSONs render a detailed field-by-field comparison impractical. Here's a summary of the key disparities:

* **Different `type`:** JSON 1 is a `collection` of observations (lab results), while JSON 2 is a `document` containing patient demographics, conditions, and procedures.
* **Different `entry` contents:**
    * **JSON 1's `entry` array** contains resources related to lab reports (`DiagnosticReport` and `Observation`), including detailed result values, references to patients, and performers.
    * **JSON 2's `entry` array** contains resources related to a patient's medical history: `Patient`, `Condition`, and `Procedure`.  There are no lab values present.
* **Different focus:** JSON 1 centers on lab results for a specific patient, while JSON 2 focuses on the patient's overall medical his

In [31]:
import json
from deepdiff import DeepDiff

def load_json(file_path):
    """Parses the UTF-8 JSON document stored at ``file_path`` and returns it."""
    with open(file_path, mode="r", encoding="utf-8") as reader:
        document = json.load(reader)
    return document

def evaluate_json_similarity(generated_json_path, ground_truth_path):
    """Compares generated FHIR JSON with ground truth and calculates accuracy.

    Prints the DeepDiff report (list order ignored) followed by an accuracy
    score: the share of ground-truth leaf values that also occur, under the same
    key path, in the generated document. List positions are dropped from the
    path so the score is order-insensitive, in line with ``ignore_order=True``.

    Parameters:
    - generated_json_path (str): Path of the JSON produced by the model.
    - ground_truth_path (str): Path of the reference JSON.

    Returns:
    - float: Accuracy between 0.0 and 1.0 (earlier versions returned nothing).
    """
    from collections import Counter  # local import: not part of this cell's import list

    def _leaf_counts(node, path=""):
        """Counts (path, value) leaves; every list index collapses to ``[]``."""
        counts = Counter()
        if isinstance(node, dict) and node:
            for key, child in node.items():
                counts.update(_leaf_counts(child, f"{path}/{key}"))
        elif isinstance(node, list) and node:
            for child in node:
                counts.update(_leaf_counts(child, f"{path}[]"))
        else:
            # Scalars and empty containers are leaves. Serialising the value
            # keeps JSON true/1 and false/0 distinct, unlike Python equality.
            counts[(path, json.dumps(node, sort_keys=True))] += 1
        return counts

    generated_json = load_json(generated_json_path)
    ground_truth_json = load_json(ground_truth_path)

    # Compare JSONs
    differences = DeepDiff(ground_truth_json, generated_json, ignore_order=True, report_repetition=True)

    # Convert DeepDiff object to a normal dictionary
    differences_dict = json.loads(differences.to_json()) if differences else {}

    # Count fields. The earlier score divided the character length of the
    # serialised diff by the character length of the ground truth, which is not a
    # field count and dropped below 0% whenever the diff text was the longer one.
    expected_leaves = _leaf_counts(ground_truth_json)
    produced_leaves = _leaf_counts(generated_json)
    total_fields = sum(expected_leaves.values())
    matched_fields = sum((expected_leaves & produced_leaves).values())
    accuracy = matched_fields / max(total_fields, 1)

    print("\n🔹 Differences Found:")
    if differences_dict:
        print(json.dumps(differences_dict, indent=4))
    else:
        print("✅ No differences found. JSONs match exactly!")

    print(f"\n✅ JSON Similarity Accuracy: {accuracy * 100:.2f}%")
    return accuracy

# Run evaluation
# Argument order is (generated, ground truth): the RAG output file is scored
# against the reference bundle-lipids.json.
evaluate_json_similarity("/content/drive/MyDrive/canada company assignment/bundle/bundle-lipids_output_RAG.json", "/content/drive/MyDrive/canada company assignment/bundle/bundle-lipids.json")



🔹 Differences Found:
{
    "dictionary_item_removed": [
        "root['id']",
        "root['meta']"
    ],
    "values_changed": {
        "root['type']": {
            "new_value": "document",
            "old_value": "collection"
        }
    },
    "iterable_item_added": {
        "root['entry'][0]": {
            "resource": {
                "resourceType": "Patient",
                "identifier": [
                    {
                        "system": "http://example.org/mrn",
                        "value": "123456789"
                    }
                ],
                "name": [
                    {
                        "use": "official",
                        "family": "Doe",
                        "given": [
                            "John"
                        ]
                    }
                ],
                "gender": "male",
                "birthDate": "1985-03-15"
            }
        },
        "root['entry'][1]": {
            "resource

# Experiment - Don't look at this code

In [3]:
import fitz  # PyMuPDF for PDF text extraction
import google.generativeai as genai  # Google Gemini API
import json

# Step 1: Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Extracts the plain text of every page of a PDF.

    Parameters:
    - pdf_path (str): Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
    - str: The text of all pages, in page order, joined with newlines.
    """
    # The context manager closes the document (and its file handle) even when a
    # page fails to extract; previously the document was left open.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

# Step 2: Configure Google Gemini API
def setup_gemini(api_key):
    """Configures the Google Gemini API.

    Parameters:
    - api_key: Value forwarded as the ``api_key`` keyword of ``genai.configure``.
    """
    genai.configure(api_key=api_key)

# Step 3: Send Text to Google Gemini for FHIR Conversion
def generate_fhir_bundle(prompt, model_name="gemini-1.5-pro"):
    """Sends a prompt to a Gemini model and returns the text of its reply.

    Parameters:
    - prompt (str): Full prompt text passed to ``generate_content``.
    - model_name (str): Gemini model identifier. Defaults to the model this
      notebook was written against.

    Returns:
    - str: ``response.text`` exactly as returned by the API; nothing is parsed
      or validated here.
    """
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)
    reply_text = response.text
    print(reply_text)  # echo the raw reply into the cell output
    return reply_text

# Step 4: Save JSON to a File
def save_json(output_json, filename="fhir_bundle.json"):
    """Stores the given JSON text in ``filename`` using UTF-8.

    Parameters:
    - output_json (str): Text written as-is; no JSON validation happens here.
    - filename (str): Destination path; the file is opened in write mode.
    """
    with open(filename, mode="w", encoding="utf-8") as destination:
        destination.write(output_json)
    print(f"FHIR Bundle saved as {filename}")

import json
import re

def extract_fhir_json(response_text):
    """
    Extracts the first JSON object embedded in a model response.

    The reply may wrap the JSON in Markdown fences or add commentary around it,
    so parsing starts at the first ``{`` and stops where that object ends; any
    text after the closing brace is ignored.

    Parameters:
    - response_text (str): The full response containing JSON and extra text.

    Returns:
    - dict: The parsed JSON object.

    Raises:
    - ValueError: If the response contains no ``{ ... }`` span, or if the text
      starting at the first ``{`` is not valid JSON.
    """
    start = response_text.find("{")
    # Same "nothing that even looks like an object" condition as before:
    # an opening brace followed somewhere by a closing brace.
    if start == -1 or "}" not in response_text[start:]:
        raise ValueError("No valid JSON found in the response.")

    try:
        # raw_decode parses exactly one JSON value and tolerates trailing text.
        # The previous greedy regex ran up to the LAST "}" in the response, so a
        # brace in trailing commentary made an otherwise valid reply fail.
        fhir_json, _end = json.JSONDecoder().raw_decode(response_text, start)
    except json.JSONDecodeError as e:
        raise ValueError(f"Error decoding JSON: {e}") from e
    return fhir_json


In [28]:

# Main Execution
if __name__ == "__main__":
    # GOOGLE_API_KEY is not set in this cell; it is expected to exist already
    # from an earlier configuration cell.

    pdf_path = "Discharge_Summary_John_Doe_new.pdf"
    setup_gemini(GOOGLE_API_KEY)


    extracted_text = extract_text_from_pdf(pdf_path)

    # The f-string is evaluated immediately, embedding the extracted PDF text.
    prompt = f"""
    Convert the following unstructured medical discharge summary into a structured FHIR Bundle JSON.
    Ensure the FHIR resources include:
    - Patient
    - Condition (Primary and Secondary Diagnoses)
    - Procedure (Procedures performed)
    - Observation (Vital signs and Lab results)
    - MedicationStatement (Medications prescribed)

    Discharge Summary:
    {extracted_text}

    Output the FHIR Bundle JSON:
    """

    # NOTE(review): the raw reply text is saved to the default "fhir_bundle.json"
    # without passing through extract_fhir_json, so the file may contain
    # non-JSON text around the bundle - confirm before relying on it.
    fhir_bundle_json = generate_fhir_bundle(prompt)
    save_json(fhir_bundle_json)


DISCHARGE SUMMARY
Patient Details:
Name: John Doe
Date of Birth: 15 March 1985
Gender: Male
Medical Record Number: 123456789
Admission Date: 01 February 2025
Discharge Date: 08 February 2025
Attending Physician: Dr. Emily Carter, MD
Consultant: Dr. Robert Sinclair, MD (Cardiology)
--------------------------------------------------
Primary Diagnosis:
- Acute Myocardial Infarction (ST-Elevation)
Secondary Diagnoses:
- Hypertension
- Type 2 Diabetes Mellitus
- Hyperlipidemia
- Obesity (BMI: 32)
Procedures Performed:
- Coronary Angiography (02 February 2025)
- Percutaneous Coronary Intervention (PCI) with Drug-Eluting Stent Placement (03 February 2025)
--------------------------------------------------
Clinical Course:

Mr. John Doe was admitted to the hospital on 01 February 2025 via the emergency department after
experiencing severe chest pain 
radiating to his left arm. Initial ECG showed ST-elevation in leads II, III, and aVF, suggestive of an
acute inferior wall myocardial infarction.

In [103]:



# Main Execution
if __name__ == "__main__":
    # GOOGLE_API_KEY is not set in this cell; it is expected to exist already
    # from an earlier configuration cell.

    pdf_path = "/content/drive/MyDrive/canada company assignment/bundle/bundle-example.pdf"
    setup_gemini(GOOGLE_API_KEY)

    # Read FHIR JSON Representation file
    with open("/content/drive/MyDrive/canada company assignment/FHIR_JSON_Representation.txt", "r", encoding="utf-8") as f:
        fhir_representation = f.read()

    extracted_text = extract_text_from_pdf(pdf_path)

    # The f-string is evaluated immediately, so this holds the final prompt text.
    prompt_template = f"""
    Convert the following unstructured medical discharge summary into a structured FHIR Bundle JSON.
    Ensure the FHIR resources include:
    - Patient
    - Condition (Primary and Secondary Diagnoses)
    - Procedure (Procedures performed)
    - Observation (Vital signs and Lab results)
    - MedicationStatement (Medications prescribed)

    The description of representation of FHIR JSON is as follows:
    {fhir_representation}

    Discharge Summary:
    {extracted_text}

    Output only the valid FHIR Bundle JSON with no additional explanations:
    """

    print(prompt_template)

    try:
        response = generate_fhir_bundle(prompt_template)
        json_res = extract_fhir_json(response)
        json_res = json.dumps(json_res, indent=4) # Pretty-print the extracted JSON
        # Save the JSON produced by THIS run. The previous code passed
        # `fhir_bundle_json`, a variable left over from a different cell, so it
        # wrote stale text (or raised NameError on a fresh kernel).
        save_json(json_res,"/content/drive/MyDrive/canada company assignment/bundle/bundle-example_output_improved.json")
    except ValueError as e:
        print(f"Error: {e}")





    Convert the following unstructured medical discharge summary into a structured FHIR Bundle JSON.
    Ensure the FHIR resources include:
    - Patient
    - Condition (Primary and Secondary Diagnoses)
    - Procedure (Procedures performed)
    - Observation (Vital signs and Lab results)
    - MedicationStatement (Medications prescribed)

    The description of representation of FHIR JSON is as follows:
    FHIR JSON Representation Documentation

1. Introduction
The JSON format for representing healthcare resources follows the RFC 8259 (STD 90) standard and is used in FHIR (Fast Healthcare Interoperability Resources). This ensures structured and standardized healthcare data exchange.

2. Basic Structure of a FHIR JSON Resource
Each FHIR resource follows this structure:
{
  "resourceType": "[Resource Type]",
  "property1": "<[primitive]>",
  "property2": { [Datatype] },
  "property3": { 
    "propertyA": { CodeableConcept }
  },
  "property4": [{ 
    "propertyB": { Reference(Resourc

KeyboardInterrupt: 

In [None]:
#convert using python
import fitz  # PyMuPDF for PDF text extraction
import json
import re
from datetime import datetime

def extract_text_from_pdf(pdf_path):
    """Extracts text from the PDF file.

    Parameters:
    - pdf_path (str): Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
    - str: The text of all pages, in page order, joined with newlines.
    """
    # The context manager closes the document (and its file handle) even when a
    # page fails to extract; previously the document was left open.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

def parse_discharge_summary(text):
    """Extracts structured data from the discharge summary text.

    Parameters:
    - text (str): Plain text of the discharge summary.

    Returns:
    - dict: Keys ``name``, ``dob``, ``gender``, ``mrn``, ``admission_date``,
      ``discharge_date``, ``attending_physician``, ``consultant`` (strings),
      ``primary_diagnosis``, ``secondary_diagnoses``, ``procedures``,
      ``medications``, ``discharge_instructions``, ``follow_up_appointments``
      (lists of strings), and ``vitals_admission`` / ``lab_results`` (dicts
      whose values are strings, or None when a value is absent from the text).

    Raises:
    - ValueError: If a patient, date or attending-physician field is missing.
      (Previously this surfaced as AttributeError on ``None.group``.)
    """
    bullet = r"-(?!-)\s*"           # "- item", but not a "-----" separator line
    numbered = r"\d+\.(?!\d)\s*"    # "1. item", but not a decimal such as "2.5 mg"

    def _first(pattern, required=False):
        """Returns the stripped first capture of ``pattern``, or None if absent."""
        found = re.search(pattern, text)
        if found is None:
            if required:
                raise ValueError(f"Discharge summary is missing a match for: {pattern}")
            return None
        # Extracted PDF lines can carry trailing spaces, which would later break
        # date parsing and leak into the output.
        return found.group(1).strip()

    def _items(block, marker):
        """Splits ``block`` into list items that start with ``marker``.

        A line without a marker is treated as the wrapped tail of the previous
        item. Scanning stops at the next header (a line ending in ":") or at a
        dashed separator line.
        """
        items = []
        for raw_line in block.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            start = re.match(marker, line)
            if start:
                items.append(line[start.end():].strip())
            elif line.endswith(":") or set(line) == {"-"}:
                break
            elif items:
                items[-1] = f"{items[-1]} {line}"
        return items

    def _after(header):
        """Text following the first occurrence of ``header`` ('' if absent)."""
        return text.partition(header)[2]

    def _between(start_marker, end_marker):
        """Text between the two markers ('' if the start marker is absent)."""
        return _after(start_marker).partition(end_marker)[0]

    data = {}

    # Extract patient details
    data["name"] = _first(r"Name:\s*(.+)", required=True)
    data["dob"] = _first(r"Date of Birth:\s*(.+)", required=True)
    data["gender"] = _first(r"Gender:\s*(.+)", required=True)
    data["mrn"] = _first(r"Medical Record Number:\s*(\d+)", required=True)
    data["admission_date"] = _first(r"Admission Date:\s*(.+)", required=True)
    data["discharge_date"] = _first(r"Discharge Date:\s*(.+)", required=True)

    # Extract physician details
    data["attending_physician"] = _first(r"Attending Physician:\s*(.+)", required=True)
    data["consultant"] = _first(r"Consultant:\s*(.+)")

    # Extract diagnoses. The earlier header-plus-one-bullet regex captured only
    # the FIRST bullet of each list; every bullet under the header is kept now.
    data["primary_diagnosis"] = _items(_after("Primary Diagnosis:"), bullet)
    data["secondary_diagnoses"] = _items(_after("Secondary Diagnoses:"), bullet)

    # Extract procedures
    data["procedures"] = _items(_after("Procedures Performed:"), bullet)

    # Extract vital signs: the first occurrence of each label in the text.
    data["vitals_admission"] = {
        "bp": _first(r"Blood Pressure:\s*([\d/]+)"),
        "hr": _first(r"Heart Rate:\s*(\d+)"),
        "rr": _first(r"Respiratory Rate:\s*(\d+)"),
        "temp": _first(r"Temperature:\s*([\d.]+)"),
        "o2sat": _first(r"Oxygen Saturation:\s*(\d+)%")
    }

    # Extract laboratory results
    data["lab_results"] = {
        "troponin_initial": _first(r"Troponin I \(Initial\):\s*([\d.]+)"),
        "troponin_discharge": _first(r"Troponin I \(Discharge\):\s*([\d.]+)"),
        "cholesterol": _first(r"Total Cholesterol:\s*([\d.]+)"),
        "ldl": _first(r"LDL:\s*([\d.]+)"),
        "hdl": _first(r"HDL:\s*([\d.]+)"),
        "hba1c": _first(r"HbA1c:\s*([\d.]+)"),
        "creatinine": _first(r"Serum Creatinine:\s*([\d.]+)")
    }

    # Extract medications (a missing section yields [] instead of IndexError)
    data["medications"] = _items(_between("Medications on Discharge:", "Discharge Instructions:"), numbered)

    # Extract discharge instructions
    data["discharge_instructions"] = _items(_between("Discharge Instructions:", "Follow-up Appointments:"), bullet)

    # Extract follow-up appointments
    data["follow_up_appointments"] = _items(_between("Follow-up Appointments:", "Discharging Physician:"), numbered)

    return data

def generate_fhir_json(data):
    """Creates a FHIR JSON Bundle from extracted data.

    Parameters:
    - data (dict): Output of ``parse_discharge_summary``. The keys read here are
      ``name``, ``dob``, ``gender``, ``mrn``, ``attending_physician``,
      ``admission_date``, ``discharge_date``, ``primary_diagnosis``,
      ``secondary_diagnoses``, ``procedures``, ``medications`` and
      ``follow_up_appointments``.

    Returns:
    - dict: A Bundle of type "document" holding one Patient, one Practitioner
      and one entry per diagnosis, procedure, medication and follow-up.

    Raises:
    - ValueError: If ``data["dob"]`` is not in "15 March 1985" form.

    NOTE(review): the bundle is a minimal sketch. Elements that FHIR R4 may
    require (for example a Composition in a document bundle, or status fields on
    Procedure / MedicationStatement) are not emitted, and an Appointment start
    normally carries a full timestamp - verify against the R4 definitions
    before using the output.
    """
    source_format = "%d %B %Y"  # e.g. "01 February 2025"

    def _iso_date(value):
        """Converts '01 February 2025' to '2025-02-01'; returns other input unchanged."""
        try:
            return datetime.strptime(value.strip(), source_format).strftime("%Y-%m-%d")
        except (AttributeError, ValueError):
            return value

    def _procedure_date(description):
        """Uses the date written in parentheses after a procedure, if any.

        Falls back to the admission date, which earlier versions used for every
        procedure.
        """
        stated = re.search(r"\((\d{1,2} [A-Za-z]+ \d{4})\)", description)
        return _iso_date(stated.group(1) if stated else data["admission_date"])

    name_parts = data["name"].split()

    fhir_bundle = {
        "resourceType": "Bundle",
        "type": "document",
        "entry": [
            {
                "fullUrl": "urn:uuid:patient-1",
                "resource": {
                    "resourceType": "Patient",
                    "id": "patient-1",
                    "identifier": [
                        {
                            "system": "http://hospital.example.org/mrn",
                            "value": data["mrn"]
                        }
                    ],
                    "name": [
                        {
                            "family": name_parts[-1],
                            "given": name_parts[:-1]
                        }
                    ],
                    "gender": data["gender"].strip().lower(),
                    # Strict on purpose: an unparseable birth date should fail loudly.
                    "birthDate": datetime.strptime(data["dob"].strip(), source_format).strftime("%Y-%m-%d")
                }
            },
            {
                "fullUrl": "urn:uuid:practitioner-1",
                "resource": {
                    "resourceType": "Practitioner",
                    "id": "practitioner-1",
                    "name": [
                        {
                            "text": data["attending_physician"]
                        }
                    ]
                }
            }
        ]
    }

    # Add Conditions
    for i, diag in enumerate(data["primary_diagnosis"] + data["secondary_diagnoses"], start=1):
        fhir_bundle["entry"].append({
            "fullUrl": f"urn:uuid:condition-{i}",
            "resource": {
                "resourceType": "Condition",
                "id": f"condition-{i}",
                "subject": {"reference": "Patient/patient-1"},
                "code": {"text": diag}
            }
        })

    # Add Procedures
    for i, proc in enumerate(data["procedures"], start=1):
        fhir_bundle["entry"].append({
            "fullUrl": f"urn:uuid:procedure-{i}",
            "resource": {
                "resourceType": "Procedure",
                "id": f"procedure-{i}",
                "subject": {"reference": "Patient/patient-1"},
                "code": {"text": proc},
                # ISO date instead of the raw "01 February 2025" text.
                "performedDateTime": _procedure_date(proc)
            }
        })

    # Add Medications
    for i, med in enumerate(data["medications"], start=1):
        fhir_bundle["entry"].append({
            "fullUrl": f"urn:uuid:medication-{i}",
            "resource": {
                "resourceType": "MedicationStatement",
                "id": f"medication-{i}",
                "subject": {"reference": "Patient/patient-1"},
                "medicationCodeableConcept": {"text": med}
            }
        })

    # Add Follow-up Appointments
    for i, appt in enumerate(data["follow_up_appointments"], start=1):
        fhir_bundle["entry"].append({
            "fullUrl": f"urn:uuid:appointment-{i}",
            "resource": {
                "resourceType": "Appointment",
                "id": f"appointment-{i}",
                "status": "booked",
                "description": appt,
                # The discharge date is used as a placeholder start, as before.
                "start": _iso_date(data["discharge_date"])
            }
        })

    return fhir_bundle

def save_fhir_json(fhir_data, filename="fhir_bundle_PYTHON.json"):
    """Serialize ``fhir_data`` to ``filename`` as JSON and print a confirmation.

    Parameters:
    - fhir_data: JSON-serializable object (the FHIR bundle built earlier).
    - filename (str): Destination path; the file is overwritten if it exists.

    The output uses a 4-space indent and is written with UTF-8 encoding.
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        # Render the whole document first, then write it in one call.
        serialized = json.dumps(fhir_data, indent=4)
        out_file.write(serialized)
    print(f"✅ FHIR JSON saved as {filename}")

# Main Execution: run the pipeline end to end on a single discharge summary
# (PDF -> extracted text -> structured fields -> FHIR bundle -> JSON file).
pdf_path = "Discharge_Summary_John_Doe_new.pdf"  # relative path to the input PDF
extracted_text = extract_text_from_pdf(pdf_path)
# parse_discharge_summary is defined outside this excerpt; presumably it returns the
# dict of fields (e.g. "procedures", "medications") that generate_fhir_json reads
# -- verify against its definition.
structured_data = parse_discharge_summary(extracted_text)
fhir_json = generate_fhir_json(structured_data)
# No filename is passed, so the default "fhir_bundle_PYTHON.json" is used.
save_fhir_json(fhir_json)


✅ FHIR JSON saved as fhir_bundle_PYTHON.json


In [None]:
!pip install deepdiff jsonschema


Collecting deepdiff
  Downloading deepdiff-8.2.0-py3-none-any.whl.metadata (6.3 kB)
Collecting orderly-set<6,>=5.3.0 (from deepdiff)
  Downloading orderly_set-5.3.0-py3-none-any.whl.metadata (6.2 kB)
Downloading deepdiff-8.2.0-py3-none-any.whl (83 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/83.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.7/83.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading orderly_set-5.3.0-py3-none-any.whl (12 kB)
Installing collected packages: orderly-set, deepdiff
Successfully installed deepdiff-8.2.0 orderly-set-5.3.0


## Convert JSON to text

In [None]:
# import json

# def extract_patient(resource):
#     """Extracts Patient details."""
#     name = " ".join(resource.get("name", [{}])[0].get("given", [""])) + " " + resource.get("name", [{}])[0].get("family", "")
#     dob = resource.get("birthDate", "Unknown")
#     gender = resource.get("gender", "Unknown")
#     mrn = resource.get("identifier", [{}])[0].get("value", "N/A")
#     return name, dob, gender, mrn

# def extract_encounter(resource):
#     """Extracts Admission and Discharge details."""
#     admission_date = resource.get("period", {}).get("start", "Unknown")
#     discharge_date = resource.get("period", {}).get("end", "Unknown")
#     physician = resource.get("participant", [{}])[0].get("individual", {}).get("display", "Unknown")
#     return admission_date, discharge_date, physician

# def extract_conditions(conditions):
#     """Extracts primary and secondary diagnoses."""
#     primary = []
#     secondary = []
#     for condition in conditions:
#         code = condition.get("code", {}).get("text", "Unknown Condition")
#         if condition.get("category", [{}])[0].get("coding", [{}])[0].get("code", "") == "primary":
#             primary.append(code)
#         else:
#             secondary.append(code)
#     return primary, secondary

# def extract_procedures(procedures):
#     """Extracts performed procedures."""
#     return [f"{proc.get('code', {}).get('text', 'Unknown Procedure')} ({proc.get('performedDateTime', 'Unknown Date')})" for proc in procedures]

# def extract_vitals(observations):
#     """Extracts vital signs from Observation resources."""
#     vitals = {}
#     for obs in observations:
#         obs_type = obs.get("code", {}).get("text", "Unknown Observation")
#         value = obs.get("valueQuantity", {}).get("value", "N/A")
#         unit = obs.get("valueQuantity", {}).get("unit", "")
#         vitals[obs_type] = f"{value} {unit}"
#     return vitals

# def extract_medications(med_requests):
#     """Extracts discharge medications."""
#     return [med.get("medicationCodeableConcept", {}).get("text", "Unknown Medication") for med in med_requests]

# def extract_followups(appointments):
#     """Extracts follow-up appointments."""
#     return [f"{appt.get('type', {}).get('text', 'Follow-up')} on {appt.get('date', 'Unknown')}" for appt in appointments]

# def generate_discharge_summary(fhir_json):
#     """Generates a discharge summary from FHIR data."""
#     resources = [entry["resource"] for entry in fhir_json.get("entry", [])]

#     patient = next((r for r in resources if r["resourceType"] == "Patient"), {})
#     encounter = next((r for r in resources if r["resourceType"] == "Encounter"), {})
#     conditions = [r for r in resources if r["resourceType"] == "Condition"]
#     procedures = [r for r in resources if r["resourceType"] == "Procedure"]
#     observations = [r for r in resources if r["resourceType"] == "Observation"]
#     medications = [r for r in resources if r["resourceType"] == "MedicationRequest"]
#     appointments = [r for r in resources if r["resourceType"] == "Appointment"]

#     name, dob, gender, mrn = extract_patient(patient)
#     admission_date, discharge_date, physician = extract_encounter(encounter)
#     primary_dx, secondary_dx = extract_conditions(conditions)
#     performed_procedures = extract_procedures(procedures)
#     vitals = extract_vitals(observations)
#     discharge_meds = extract_medications(medications)
#     follow_ups = extract_followups(appointments)

#     primary_dx_text = "- " + "\n- ".join(primary_dx) if primary_dx else "Unknown"
#     secondary_dx_text = "- " + "\n- ".join(secondary_dx) if secondary_dx else "None"
#     performed_procedures_text = "- " + "\n- ".join(performed_procedures) if performed_procedures else "None"
#     follow_ups_text = "- " + "\n- ".join(follow_ups) if follow_ups else "None"

#     summary = f"""
# DISCHARGE SUMMARY
# Patient Details:
# Name: {name}
# Date of Birth: {dob}
# Gender: {gender}
# Medical Record Number: {mrn}
# Admission Date: {admission_date}
# Discharge Date: {discharge_date}
# Attending Physician: {physician}
# --------------------------------------------------
# Primary Diagnosis:
# {primary_dx_text}
# Secondary Diagnoses:
# {secondary_dx_text}
# Procedures Performed:
# {performed_procedures_text}
# --------------------------------------------------
# Vital Signs on Admission:
# {json.dumps(vitals, indent=4)}
# --------------------------------------------------
# Medications on Discharge:
# {json.dumps(discharge_meds, indent=4)}
# --------------------------------------------------
# Follow-up Appointments:
# {follow_ups_text}
# --------------------------------------------------
# Discharging Physician:
# {physician}
# --------------------------------------------------
# End of Discharge Summary
#     """
#     return summary.strip()


# if __name__ == "__main__":
#     with open("/content/fhir_bundle.json", "r", encoding="utf-8") as file:
#         fhir_data = json.load(file)

#     discharge_summary = generate_discharge_summary(fhir_data)
#     print(discharge_summary)


DISCHARGE SUMMARY
Patient Details:
Name: John Doe
Date of Birth: 1985-03-15
Gender: male
Medical Record Number: 123456789
Admission Date: Unknown
Discharge Date: Unknown
Attending Physician: Unknown
--------------------------------------------------
Primary Diagnosis:
Unknown
Secondary Diagnoses:
None
Procedures Performed:
None
--------------------------------------------------
Vital Signs on Admission:
{
    "Unknown Observation": "160 mg/dL"
}
--------------------------------------------------
Medications on Discharge:
[]
--------------------------------------------------
Follow-up Appointments:
None
--------------------------------------------------
Discharging Physician:
Unknown
--------------------------------------------------
End of Discharge Summary


In [None]:
# !pip install reportlab

Collecting reportlab
  Downloading reportlab-4.3.1-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.3.1-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.9 MB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m33.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.3.1


In [None]:
# from reportlab.lib.pagesizes import A4
# from reportlab.pdfgen import canvas
# import json

# def draw_text(c, text, x, y, font="Helvetica", size=12, bold=False):
#     if bold:
#         c.setFont("Helvetica-Bold", size)
#     else:
#         c.setFont(font, size)
#     c.drawString(x, y, text)

# def generate_pdf_from_fhir(json_file, output_pdf):
#     with open(json_file, "r") as f:
#         data = json.load(f)

#     patient_info = {}
#     conditions = []
#     procedures = []
#     observations = []
#     medications = []
#     appointments = []

#     for entry in data.get("entry", []):
#         resource = entry.get("resource", {})

#         if resource.get("resourceType") == "Patient":
#             patient_info = {
#                 "name": f"{resource['name'][0]['given'][0]} {resource['name'][0]['family']}",
#                 "dob": resource.get("birthDate", "Unknown"),
#                 "gender": resource.get("gender", "Unknown"),
#                 "mrn": resource["identifier"][0]["value"]
#             }
#         elif resource.get("resourceType") == "Condition":
#             conditions.append(resource["code"]["coding"][0]["display"])
#         elif resource.get("resourceType") == "Procedure":
#             procedures.append(f"{resource['code']['coding'][0]['display']} ({resource.get('performedDateTime', 'Unknown')})")
#         elif resource.get("resourceType") == "Observation":
#             for component in resource.get("component", []):
#                 observations.append(f"{component['code']['coding'][0]['display']}: {component['valueQuantity']['value']} {component['valueQuantity']['unit']}")
#         elif resource.get("resourceType") == "MedicationStatement":
#             medications.append(resource["medicationCodeableConcept"]["coding"][0]["display"])
#         elif resource.get("resourceType") == "Appointment":
#             appointments.append(f"{resource['description']} - {resource['start']}")

#     c = canvas.Canvas(output_pdf, pagesize=A4)
#     width, height = A4
#     y = height - 50

#     draw_text(c, "DISCHARGE SUMMARY", 200, y, bold=True, size=16)
#     y -= 30

#     draw_text(c, "Patient Details:", 50, y, bold=True)
#     y -= 20
#     draw_text(c, f"Name: {patient_info.get('name', 'Unknown')}", 50, y)
#     y -= 20
#     draw_text(c, f"Date of Birth: {patient_info.get('dob', 'Unknown')}", 50, y)
#     y -= 20
#     draw_text(c, f"Gender: {patient_info.get('gender', 'Unknown')}", 50, y)
#     y -= 20
#     draw_text(c, f"Medical Record Number: {patient_info.get('mrn', 'Unknown')}", 50, y)
#     y -= 30

#     draw_text(c, "Primary Diagnosis:", 50, y, bold=True)
#     y -= 20
#     for cond in conditions:
#         draw_text(c, f"- {cond}", 70, y)
#         y -= 20
#     y -= 10

#     draw_text(c, "Procedures Performed:", 50, y, bold=True)
#     y -= 20
#     for proc in procedures:
#         draw_text(c, f"- {proc}", 70, y)
#         y -= 20
#     y -= 10

#     draw_text(c, "Vital Signs on Admission:", 50, y, bold=True)
#     y -= 20
#     for obs in observations:
#         draw_text(c, f"- {obs}", 70, y)
#         y -= 20
#     y -= 10

#     draw_text(c, "Medications on Discharge:", 50, y, bold=True)
#     y -= 20
#     for med in medications:
#         draw_text(c, f"- {med}", 70, y)
#         y -= 20
#     y -= 10

#     draw_text(c, "Follow-up Appointments:", 50, y, bold=True)
#     y -= 20
#     for app in appointments:
#         draw_text(c, f"- {app}", 70, y)
#         y -= 20
#     y -= 30

#     draw_text(c, "Discharging Physician:", 50, y, bold=True)
#     y -= 20
#     draw_text(c, "Dr. Emily Carter, MD", 70, y)

#     c.save()
#     print(f"PDF saved as {output_pdf}")

# # Example usage
# generate_pdf_from_fhir("Discarge_summary.json", "output_discharge_summary.pdf")


PDF saved as output_discharge_summary.pdf
