In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
import sentencepiece
import hf_xet


In [None]:
# ---------------------------------- Generate notes: load transcripts ----------------------------------
# Directory containing the encounter_transcript_<i>.txt files. The original assignment had no value
# (a syntax error), so the location is read from the ENCOUNTER_TRANSCRIPT_DIR environment variable;
# the "transcripts" fallback is only a placeholder - point it at the real folder.
save_dir = os.environ.get("ENCOUNTER_TRANSCRIPT_DIR", "transcripts")
n = 10  # number of transcripts, numbered 1..n


def load_transcripts(directory, count):
    """Read encounter_transcript_1 .. encounter_transcript_<count> from `directory`.

    Args:
        directory: Folder that holds the transcript text files.
        count: Highest transcript number to look for (numbering starts at 1).

    Returns:
        dict mapping 'encounter_transcript_<i>' to the text of the matching file.
        Only files that exist are included; a missing file is reported with a
        message and skipped.
    """
    loaded = {}
    for i in range(1, count + 1):
        name = f'encounter_transcript_{i}'
        filepath = os.path.join(directory, f'{name}.txt')
        if not os.path.exists(filepath):
            print(f"File {filepath} not found.")
            continue
        with open(filepath, 'r', encoding='utf-8') as f:
            loaded[name] = f.read()
        print(f"Loaded {filepath} into variable {name}")
    return loaded


# The generation step looks transcripts up as globals()['encounter_transcript_<i>'],
# so publish every loaded transcript under that name.
globals().update(load_transcripts(save_dir, n))

In [None]:
# Trailer appended after the transcript so the model continues with the note itself.
footer = "Medical note:\n\n"

# Prompt header describing the required note layout (HPI, diagnoses, assessment/plan, orders).
# Plain string literal: the text has no placeholders, so no f-string formatting is involved.
instructions = """You are a medical documentation assistant. Based on the following doctor-patient conversation, create a structured medical note with the following sections:

1. History of Present Illness (HPI): should be in format of a list of problems with details for each problem in a short paragraph. Do not add other subheadings to organize the information.
Problem 1: HPI Details
Problem 2: HPI Details
Problem n: HPI details
2. Visit Diagnoses
-Problem 1
-Problem 2
-Problem n  
3. Assessment and Plan
Problem 1
-Assessment:
-Plan:
Problem 2
-Assessment:
-Plan:
Problem n
-Assessment:
-Plan:
4. Orders #This section should only include orders that would need to be placed in the EMR system. Include medications, labs, imaging, referrals, etc. Do not include any other information such as counseling.
Problem 1
-Order 1
-Order 2
-Order n
Problem 2
-Order 1
-Order 2
-Order n
Problem n
-Order 1
-Order 2
-Order n
Please format the output with clear section headers as detailed above without addition of other subheadings/sections and professional medical documentation style.\n"""


In [None]:
# Load the Qwen2.5-1.5B checkpoint and its tokenizer.
model_name = "Qwen/Qwen2.5-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Half precision is the right choice on a GPU, but the notebook also falls back to CPU when
# CUDA is missing; float16 on CPU can be very slow or unsupported for some ops depending on
# the PyTorch version, so load full precision in that case.
load_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=load_dtype)

In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU, and place the model there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model = model.to(device)

In [None]:
# Generate one medical note per loaded transcript with the model currently in memory.
# Each note is stored in a global named encounter_summary_<i>_<model tag>.

# Sampling is enabled below, so seed the generator to make a re-run reproduce the same notes.
GENERATION_SEED = 42
torch.manual_seed(GENERATION_SEED)

# Drops the 5-character "Qwen/" organisation prefix, e.g. "Qwen/Qwen2.5-1.5B" -> "Qwen2.5-1.5B".
model_tag = model_name[5:]

for i in range(1, n + 1):
    transcript_key = f'encounter_transcript_{i}'
    if transcript_key not in globals():
        # A transcript whose file was missing never became a global; skip it instead of
        # stopping the whole loop with a KeyError.
        print(f"Skipping {transcript_key}: transcript was not loaded.")
        continue

    transcript_content = globals()[transcript_key]
    prompt = f"{instructions}\n\nTranscript:\n{transcript_content}\n\n{footer}"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=False)

    # Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1200,  # upper bound on the number of generated tokens
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the new tokens (skip the input prompt at the start of the output sequence)
    response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    globals()[f'encounter_summary_{i}_{model_tag}'] = response_text
    print(f"Generated encounter_summary_{i}_{model_tag}")


In [None]:
# Write each generated note to <save_dir_summ>/encounter_summary_<i>_<model tag>.txt.
# The original assignment had no value (a syntax error), so the output folder is read from the
# ENCOUNTER_SUMMARY_DIR environment variable; the "summaries" fallback is only a placeholder.
save_dir_summ = os.environ.get("ENCOUNTER_SUMMARY_DIR", "summaries")
os.makedirs(save_dir_summ, exist_ok=True)

# Drops the 5-character "Qwen/" organisation prefix, e.g. "Qwen/Qwen2.5-1.5B" -> "Qwen2.5-1.5B".
model_tag = model_name[5:]

for i in range(1, n + 1):
    summary_key = f'encounter_summary_{i}_{model_tag}'
    if summary_key not in globals():
        # No note exists under this name; report it instead of failing with a KeyError.
        print(f"No summary named {summary_key}; nothing saved.")
        continue

    filename = f'{summary_key}.txt'
    filepath = os.path.join(save_dir_summ, filename)

    # Mode 'w' truncates an existing file, so a separate delete beforehand is unnecessary.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"{filename}:\n\n")
        f.write(globals()[summary_key])
    print(f"Saved {filepath}")

In [None]:
# Free the current model before loading another one - run this after the generation loop
import gc
import torch

# Remove the model, the tokenizer and any leftover generation tensors from the namespace
for _name in ('model', 'tokenizer', 'inputs', 'outputs'):
    if _name in globals():
        del globals()[_name]

# Run a garbage-collection pass right away
gc.collect()

# With CUDA present, empty the GPU cache and report current usage
if not torch.cuda.is_available():
    print("CPU memory cleared")
else:
    torch.cuda.empty_cache()
    print("GPU memory cleared")
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")

In [None]:
# Load Qwen2.5-3B with 8-bit quantized weights (bitsandbytes).

# `load_in_8bit=True` is the only switch needed for 8-bit loading. The previous
# `bnb_8bit_compute_dtype` and `bnb_8bit_use_double_quant` arguments are not
# BitsAndBytesConfig parameters (compute dtype and double quantization exist only as the
# `bnb_4bit_*` options for 4-bit loading), so they had no effect on how the model was loaded.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# load Qwen-3B in int8
model_name = "Qwen/Qwen2.5-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    # device_map="auto"  # Automatically handle device placement
)

print(f"{model_name} loaded in 8-bit quantization")
