In [1]:
import os
import re

import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import set_seed

# Fix the random seed up front so any stochastic step in this notebook is repeatable.
SEED = 42
set_seed(SEED)

In [2]:
# Never hard-code the access token in the notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset, token=None makes
# `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [3]:
# Precision used for the model weights when the checkpoint is loaded.
dtype = torch.float16

# Hugging Face Hub repository id of the checkpoint used throughout the notebook.
model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

In [None]:
# Load the tokenizer for `model_id`; token=True asks the library to use the
# credentials stored by the earlier `login` call.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=True)
# Reuse the end-of-sequence token for padding so `padding=True` always has a
# token to pad with.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Only loading options belong here. Sampling options (temperature, top_p,
# do_sample) are generation settings, not model-loading arguments, so they are
# configured on `generation_config` below instead of being passed to the loader.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype=dtype,
    token=True,
)
model.config.pad_token_id = tokenizer.pad_token_id

# Default decoding settings for `model.generate`. Any keyword passed directly
# to `generate` (e.g. `do_sample=False` in `generate_report`) overrides these.
model.generation_config.do_sample = True
model.generation_config.temperature = 0.6
model.generation_config.top_p = 0.6
model.generation_config.pad_token_id = tokenizer.pad_token_id

In [17]:
# generate impression snippets given a positive pneumonia prediction and using Grad-CAM-derived localisations as input

def generate_report(locs, max_new_tokens=4096):
    """Generate a one-sentence chest X-ray impression snippet for the given zones.

    Builds a single-turn chat prompt (role, task, three worked examples and
    grouping rules) around `locs`, runs the module-level `model` on it and
    returns the model's answer.

    Args:
        locs: Text describing the affected lung zone(s); it is inserted verbatim
            into the prompt's ``LOCALISATION`` field (e.g. ``'Upper and mid right zone'``).
        max_new_tokens: Upper bound on the number of tokens generated. Defaults
            to 4096, the value previously hard-coded.

    Returns:
        str: The generated text that follows the last ``</think>`` marker (or the
        whole completion when no marker is present), with the end-of-sentence
        marker removed. The prompt itself is never part of the returned text.
        The report is expected inside ``<REPORT>...</REPORT>`` tags, but this
        function does not check for them.
    """
    chat = [
          {"role": "user",
           "content": f'''
           ### ROLE ###
           YOU ARE A HIGHLY ACCLAIMED RADIOLOGIST BASED IN THE UNITED KINGDOM (UK) WITH DECADES OF EXPERIENCE IN GENERATING CONCISE, OBJECTIVE SUMMARY SNIPPETS IN THE STYLE OF UK CHEST X-RAY REPORTS.
           
           ### TASK OVERVIEW ###
           GIVEN THE FOLLOWING INPUT DATA:
           - LOCALISATION: {locs}
           - RADIOGRAPHIC FINDING: OPACITY
           - DIAGNOSIS: PNEUMONIA

           GENERATE A ONE-SENTENCE RADIOLOGY REPORT SNIPPET THAT SUMMARISES THESE FINDINGS. THE OUTPUT SHOULD USE CLEAR, STANDARD UK TERMINOLOGY AND FOLLOW A FORMAT SIMILAR TO THE IMPRESSION SECTION OF A RADIOLOGY REPORT. 

           ### OUTPUT FORMAT ###
           GENERATE THE OUTPUT ACCORDING TO THE FOLLOWING XML FORMAT: <REPORT>REPORT TEXT</REPORT>

           INPUT EXAMPLE 1:
           - LOCALISATION: MIDDLE & LOWER RIGHT ZONES
           - RADIOGRAPHIC FINDING: OPACITY
           - DIAGNOSIS: PNEUMONIA
            
           EXAMPLE OUTPUT 1:
           <REPORT>"Middle to lower right zone opacity suggestive of pneumonia."</REPORT>

           INPUT EXAMPLE 2:
           - LOCALISATION: UPPER LEFT ZONE
           - RADIOGRAPHIC FINDING: OPACITY
           - DIAGNOSIS: PNEUMONIA

           EXAMPLE OUTPUT 2:
           <REPORT>"Upper left zone opacity consistent with pneumonia."</REPORT>

           INPUT EXAMPLE 3:
           - LOCALISATION: UPPER RIGHT & MIDDLE LEFT ZONES
           - RADIOGRAPHIC FINDING: OPACITY
           - DIAGNOSIS: PNEUMONIA

           EXAMPLE OUTPUT 3:
           <REPORT>"Upper right and middle left zone opacities in keeping with pneumonia."</REPORT>

           ### FINAL NOTES ###
           - IF THE AFFECTED ZONES ARE SYMMETRICAL IN BOTH LUNGS, GROUP THEM TOGETHER WITH THE TERM 'BILATERAL'. FOR EXAMPLE, MIDDLE LEFT & RIGHT ZONE OPACITIES -> BILATERAL MID ZONE OPACITIES.
           - OTHERWISE, IF THE AFFECTED ZONES ARE ANATOMICALLY ADJACENT, GROUP THEM TOGETHER (E.G., MIDDLE & LOWER RIGHT ZONE OPACITY -> MIDDLE TO LOWER RIGHT ZONE OPACITY, UPPER & MIDDLE RIGHT ZONE OPACITY -> UPPER TO MID RIGHT ZONE OPACITY).
           - ENSURE THE OUTPUT FORMAT IS CONSISTENT AND EXACTLY MATCHES THE EXAMPLES ABOVE. ENSURE YOUR RESPONSE IS SUCCINCT, OBJECTIVE, AND CLINICALLY ACTIONABLE.'''
          }
    ]

    # Render the chat into the model's prompt format as plain text; the template
    # output is tokenised separately below, hence add_special_tokens=False there.
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    prompt_len = inputs["input_ids"].shape[-1]

    # do_sample=False forces greedy decoding regardless of the sampling defaults
    # configured on the model.
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # Decode only the newly generated tokens. The prompt contains example
    # <REPORT> tags, so decoding the full sequence could leak those examples
    # into the returned text whenever the completion lacks a `</think>` marker.
    completion = tokenizer.decode(outputs[0][prompt_len:])

    # Keep what follows the last `</think>` (presumably the model's reasoning
    # precedes it) and drop the end-of-sentence marker.
    response = completion.split('</think>')[-1].replace('<｜end▁of▁sentence｜>', '')

    return response

In [None]:
# Example: generate a snippet for one localisation and extract the report text.

response = generate_report('Upper and mid right zone')
# DOTALL lets the match span line breaks inside the <REPORT> tags.
reports = re.findall(r'<REPORT>(.*?)</REPORT>', response, flags=re.DOTALL)
# Show the last tagged report; if the model produced no tags, show the raw
# response instead of failing with an IndexError.
reports[-1] if reports else response