In [1]:
import pandas as pd, numpy as np
import json, re, os, torch
from transformers import set_seed, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

# Fix the random seed once, up front, so repeated runs of the notebook start from the same RNG state.
set_seed(42)

In [9]:
# Authenticate with the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable instead of being
# hardcoded, so it is never stored in the saved notebook. When the variable is unset,
# token=None makes login() ask for the token interactively.
login(token=os.environ.get('HF_TOKEN'))

In [11]:
# Model configuration: numeric precision for the weights and the Hub checkpoint to load.
dtype = torch.float16
model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

In [None]:
# Load the tokenizer; token=True reuses the credentials stored by login().
tokenizer = AutoTokenizer.from_pretrained(model_id, token=True)
# Reuse EOS as the padding token. pad_token_id is derived from pad_token,
# so it does not need to be assigned separately.
tokenizer.pad_token = tokenizer.eos_token

# Load the model weights onto the GPU in the configured precision.
# Sampling parameters (temperature, top_p) are generation settings rather than loading
# options, and generation in this notebook is greedy (do_sample=False), so they are
# intentionally not passed here.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype=dtype,
    token=True,
)
# Keep the padding id consistent between tokenizer, model config and generation config.
model.config.pad_token_id = tokenizer.pad_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id

# Radiology Reports

In [48]:
# Parse the radiology report text out of the JSON files. Findings-type sections are
# used whenever a report has any; otherwise contextual sections serve as a fallback.

path = '/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/files/json/files_JSON/'
# os.listdir gives no ordering guarantee; sorting keeps `results` reproducible across runs.
files = sorted(os.listdir(path))

preferred = ['FINDING', 'IMPRESSION', 'CONCLUSION']
fallback = ['HISTORY', 'INDICATION', 'COMPARISON', 'EXAMINATION-UNMAPPED', 'REASON FOR EXAM', 'REFERENCE EXAM']


def _section_title(passage):
    """Return the passage's upper-cased 'section_title_1' ('' when it is missing)."""
    return passage.get('infons', {}).get('section_title_1', '').upper()


def _passage_text(passage):
    """Return the passage text with runs of spaces collapsed ('N/A' when it is missing)."""
    return re.sub(r" {2,}", " ", passage.get("text", "N/A"))


def _build_report_text(passages, preferred, fallback):
    """Concatenate the relevant passages of one report into a single string.

    Args:
        passages: list of passage dicts belonging to one document.
        preferred: section-title substrings to use when any passage matches one.
        fallback: section-title substrings to use when no passage matches `preferred`.

    Returns:
        The selected passages joined by single spaces. Passages whose title matches
        are rendered as '<TITLE>: <text>'; passages titled exactly
        'UNMAPPED-CATEGORY' are appended as bare text. Empty string if nothing matches.
    """
    titles = [_section_title(passage) for passage in passages]
    has_preferred = any(pref in title for title in titles for pref in preferred)
    wanted = preferred if has_preferred else fallback

    parts = []
    for passage, title in zip(passages, titles):
        if any(keyword in title for keyword in wanted):
            parts.append(f'{title}: {_passage_text(passage)}')
        elif title == 'UNMAPPED-CATEGORY':
            parts.append(_passage_text(passage))
    return ' '.join(parts)


results = []
for file in files:
    with open(os.path.join(path, file), 'r') as f:
        data = json.load(f)

    for document in data.get('documents', []):
        image_ids = document.get('infons', {}).get('image_identifier', [])
        results.append({
            # NOTE(review): presumably image_identifier is one ' ; '-separated string — confirm.
            'image_ids': ''.join(image_ids).split(' ; '),
            'text': _build_report_text(document.get('passages', []), preferred, fallback)})

In [None]:
# Preview only the first few parsed reports; echoing the whole list would flood the cell output.
results[:5]

In [None]:
# Load the pneumonia label table.
mimic = pd.read_csv('/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pneumonia-full.csv')

# The DICOM id is the image file name with its last four characters
# (presumably the '.jpg' extension) removed.
mimic['dicom_id'] = [os.path.basename(image_path)[:-4] for image_path in mimic['Path']]

mimic

In [None]:
# Attach each parsed report to the image rows it belongs to.
# A dicom_id -> report lookup is built once and applied with a single map, instead of
# scanning the whole frame once per image id. As with repeated assignment, a later
# entry overwrites an earlier one when an image id occurs more than once.
report_by_image = {
    image_id: entry['text']
    for entry in results
    for image_id in entry['image_ids']
}

# Images without a parsed report get the 'N/A' placeholder.
mimic['Report'] = mimic['dicom_id'].map(report_by_image).fillna('N/A')

mimic
#mimic.to_csv('/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pneumonia-full-wreports.csv', index=False)

In [None]:
# Reload the table that already carries the report text.
mimic = pd.read_csv('/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pneumonia-full-wreports.csv')

# Replace missing report values with an explicit placeholder string.
mimic = mimic.fillna({'Report': 'No report available'})
mimic

In [None]:
# Flag reports that mention pneumonia or a related term.

# (term, case_sensitive) pairs; every term is matched as a plain substring.
pneumonia_terms = [
    ('pneumonia', False),
    ('PNA', False),
    ('HAP', True),
    ('CAP', True),
    ('UIP', True),
    ('AIP', True),
    ('DIP', True),
    ('BOOP', True),
    ('VAP', True),
    ('infectio', False),
]

# One boolean mask per term, OR-ed together from left to right.
term_masks = [
    mimic.Report.str.contains(term, case=case_sensitive, regex=False)
    for term, case_sensitive in pneumonia_terms
]
conditions = term_masks[0]
for term_mask in term_masks[1:]:
    conditions = conditions | term_mask

# Start from a placeholder column, then mark matching rows 1 and all other rows 0.
mimic['Pneumonia_Relevant'] = ['N/A'] * len(mimic)
mimic.loc[conditions, 'Pneumonia_Relevant'] = 1
mimic.loc[~conditions, 'Pneumonia_Relevant'] = 0

mimic

In [None]:
# Show only the rows flagged as pneumonia-relevant.
mimic.loc[mimic['Pneumonia_Relevant'] == 1]

# LLM Prompt

In [16]:
# prompt to classify pneumonia status and extract positional info from report text

def analyse_mimic_report(report):
    """Ask the LLM to classify one radiology report for pneumonia and extract its location.

    Args:
        report: the report text; it is inserted verbatim at the end of the prompt.

    Returns:
        The text the model produced after its final '</think>' marker, with the
        end-of-sentence token removed. Only newly generated tokens are considered, so
        the few-shot example tags inside the prompt can never be returned as the answer.
        If the model never emits '</think>', the whole generated continuation is returned.

    Relies on the module-level `tokenizer` and `model`.
    """
    chat = [
          {"role": "user",
           "content": f'''
           ### ROLE ###
           YOU ARE A HIGHLY ACCLAIMED RADIOLOGIST BASED IN THE UNITED KINGDOM WITH DECADES OF EXPERIENCE IN ANALYSING ELECTRONIC HEALTH RECORDS (EHRs) AND RADIOLOGICAL REPORTS. YOUR TASK IS TO ANALYSE AN EHR REPORT AND PERFORM A SERIES OF INSTRUCTIONS BASED ON THE DIAGNOSIS OF PNEUMONIA.
           
           ### TASK OVERVIEW ###
            YOUR TASK IS TO ANALYSE THE ENTIRE RADIOLOGICAL TEXT PROVIDED AND PERFORM THESE INSTRUCTIONS:
            1. DETERMINE WHETHER THE PATIENT HAS ACTIVE PNEUMONIA AT THE TIME OF THE STUDY:
               - CLASSIFY THE REPORT AS:
                 - "1" FOR PNEUMONIA-POSITIVE (PATIENT CURRENTLY HAS PNEUMONIA, OR NO OTHER CONDITIONS ARE POSSIBLE).
                 - "0" FOR PNEUMONIA-NEGATIVE (CLEARLY STATES NO PNEUMONIA OR NO INFECTION).
                 - "-1" FOR PNEUMONIA-UNCERTAIN (PNEUMONIA MENTIONED WITH UNCERTAINTY, AMBIGUITY, OR POSSIBILITY).
               - IMPORTANTLY, EVEN IF THE REPORT EXPLICITLY STATES PNEUMONIA, IT DOES NOT AUTOMATICALLY MEAN THE CLASSIFICATION IS PNEUMONIA-POSITIVE; YOU MUST CONSIDER THE CONTEXTUAL INFORMATION TO UNDERSTAND WHETHER THE MENTION IS POSITIVE, NEGATIVE, OR UNCERTAIN.
            
            2. EXTRACT THE PNEUMONIA LOCATION STRING(S):
                - IF CLASSIFIED AS "1" (PNEUMONIA-POSITIVE) OR "-1" (PNEUMONIA-UNCERTAIN), EXTRACT THE POSITIONAL INFORMATION OF THE ACTIVE PNEUMONIA AS A LIST IF PROVIDED (E.G., ["UPPER LOBES BILATERALLY", "RIGHT MIDDLE LOBE"]) EXACTLY AS WRITTEN IN THE TEXT.
                 - OUTPUT "None" IF NO LOCATION IS MENTIONED.
                - IF CLASSIFIED AS "0", OUTPUT "None" FOR THE LOCATION, REGARDLESS OF WHETHER A LOCATION IS MENTIONED.
            
            ### PNEUMONIA-POSITIVE ("1") EXAMPLES ###
            - Patient shows signs of pneumonia.
            - There is evidence of pneumonia.
            - Pneumonia has worsened since the prior radiograph.
            - Pneumonia has almost completely resolved, but is still present.
            - Pneumonia is healing, but still present.
            - Pre-existing pneumonia has improved.
            - Improved, but not yet resolved, pneumonia.
            - There has been partial clearing of the pneumonia.

            ### PNEUMONIA-NEGATIVE ("0") EXAMPLES ###
            - No pneumonia.
            - No definite pneumonia.
            - No evidence of pneumonia.
            - No evidence to suggest infection.
            - Interval resolution of patient's pneumonia.
            - Pneumonia has resolved.
            - Pneumonia is mentioned as having resolved.
            - Resolution of pneumonia.
            - There has been complete clearing of the pneumonia.

            ### PNEUMONIA-UNCERTAIN ("-1") EXAMPLES ###
            - <symptom> may reflect <other condition>, less likely pneumonia.
            - <symptom> may reflect <other condition>, though infection is not excluded in the correct clinical setting.
            - <symptom> more typical of <other condition> than pneumonia.
            - <symptom> could reflect <other condition> or secondary process such as pneumonia.
            - <other condition> more likely than pneumonia.
            - Differential includes pneumonia.
            - Could be <other condition>, but should be followed closely to exclude pneumonia.
            - Pneumonia is difficult to exclude.
            - Pneumonia could also be considered.

            ### OUTPUT FORMAT ###
            GENERATE THE OUTPUT ACCORDING TO THE FOLLOWING XML FORMAT: OPENING TAG, RESULT, CLOSING TAG.

            INPUT EXAMPLE 1:
            "Patient shows signs of pneumonia localized to the right lower lobe."
            
            EXPECTED OUTPUT EXAMPLE 1:
            <EVALUATION>1</EVALUATION>
            <LOCATION>["right lower lobe"]</LOCATION>

            INPUT EXAMPLE 2:
            "The known pre-existing left upper lobe pneumonia has substantially improved."

            EXPECTED OUTPUT EXAMPLE 2:
            <EVALUATION>1</EVALUATION>
            <LOCATION>["left upper lobe"]</LOCATION>

            INPUT EXAMPLE 3:
            "No vascular congestion or acute focal pneumonia."

            EXPECTED OUTPUT EXAMPLE 3:
            <EVALUATION>0</EVALUATION>
            <LOCATION>None</LOCATION>

            INPUT EXAMPLE 4:
            "Resolution of left lower lobe pneumonia."

            EXPECTED OUTPUT EXAMPLE 4:
            <EVALUATION>0</EVALUATION>
            <LOCATION>None</LOCATION>

            INPUT EXAMPLE 5:
            "Subtle increase in lower lung opacities may reflect atelectasis, less likely early pneumonia."

            EXPECTED OUTPUT EXAMPLE 5:
            <EVALUATION>-1</EVALUATION>
            <LOCATION>["lower lung"]</LOCATION>

            INPUT EXAMPLE 6:
            "Retrocardiac patchy opacity may reflect atelectasis though infection is not excluded in the correct clinical setting."

            EXPECTED OUTPUT EXAMPLE 6:
            <EVALUATION>-1</EVALUATION>
            <LOCATION>["Retrocardiac"]</LOCATION>

            ### WHAT NOT TO DO ###
            - DO NOT ASSUME OR INTERPRET ANY INFORMATION NOT EXPLICITLY PRESENT IN THE TEXT.
            - DO NOT MODIFY THE STRING OF THE LOCATION; IT HAS TO BE THE SAME CHARACTERS.
            - DO NOT INCLUDE FURTHER INFORMATION OTHER THAN THE POSITIONAL INFORMATION FOR THE LOCATION.
            - DO NOT OUTPUT LOCATIONS THAT ARE NOT EXPLICITLY MENTIONED IN THE TEXT IN REFERENCE TO PNEUMONIA.
            - DO NOT INCLUDE INFORMATION ABOUT THE PNEUMONIA OUTSIDE OF ITS POSITIONAL INFORMATION.
            - DO NOT PRODUCE OUTPUT WITHOUT STRICTLY FOLLOWING THE SPECIFIED XML FORMAT: OPENING TAG, RESULT, CLOSING TAG.
            
            ### FINAL NOTE ###
            ENSURE THE OUTPUT FORMAT IS CONSISTENT AND EXACTLY MATCHES THE EXAMPLES ABOVE. DO NOT OUTPUT ANY TEXT OTHER THAN THE PROVIDED FORMATTED OUTPUT.

            ### INPUT TEXT ###
            Here is the radiological text to classify:
            {report}'''
          }
    ]

    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    # The chat template already inserted the special tokens, hence add_special_tokens=False.
    inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    outputs = model.generate(**inputs, max_new_tokens=4096, do_sample=False)

    # The generated sequence starts with the prompt tokens. Decode only what follows
    # them: otherwise, when the model never closes its reasoning with '</think>', the
    # "response" would be the prompt itself, including its <EVALUATION> examples.
    prompt_length = inputs['input_ids'].shape[-1]
    completion = tokenizer.decode(outputs[0][prompt_length:])

    # Drop the reasoning part (everything up to the last '</think>') and the EOS marker.
    response = completion.split('</think>')[-1].replace('<｜end▁of▁sentence｜>', '')

    return response

def extract_pneu_info(mimic_row):
    """Run the LLM analysis for one row, but only if it was flagged as pneumonia-relevant.

    Args:
        mimic_row: a row with 'Pneumonia_Relevant' and 'Report' entries; its `.name`
            (the row label) is used for the progress message.

    Returns:
        The result of analyse_mimic_report for flagged rows, otherwise 'N/A'.
    """
    # Progress indicator, numbered from 1.
    print(f'Report #{mimic_row.name + 1}')

    # Guard clause: rows not flagged as relevant are never sent to the model.
    if mimic_row['Pneumonia_Relevant'] != 1:
        return 'N/A'
    return analyse_mimic_report(mimic_row['Report'])

In [None]:
# Two GPUs are used for processing, so the table is written out as two halves.

mid = len(mimic)//2
for part_idx, part in enumerate((mimic.iloc[:mid], mimic.iloc[mid:])):
    part.to_csv(f'/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pneumonia-full-wreports_{part_idx}.csv', index=False)

In [None]:
# extract sentences mentioning pneumonia-related terms for simpler human annotation

def extract_pneumonia_sentences(report):
    """Return the sentences of a report that mention a pneumonia-related term.

    Args:
        report: the report text. Non-string values (e.g. NaN for a missing report)
            are treated as having no matching sentence.

    Returns:
        The matching sentences joined by single spaces, in their original order,
        or 'N/A' when no sentence matches.
    """
    if not isinstance(report, str):
        return 'N/A'

    # Split after sentence-ending punctuation, with or without following whitespace.
    # The (?!\d) guard keeps decimals such as "1.5 cm" in one piece; without it the
    # sentence would be cut in the middle of the number and lose its first half.
    sentences = re.split(r'(?<=[.!?])(?!\d)\s*', report)

    # (keyword, case_sensitive)
    keywords = [
        ('pneumonia', False),
        ('PNA', False),
        ('HAP', True),
        ('CAP', True),
        ('UIP', True),
        ('AIP', True),
        ('DIP', True),
        ('BOOP', True),
        ('VAP', True),
        ('infectio', False)
    ]

    def mentions_keyword(sentence):
        """True if the sentence contains at least one keyword (substring match)."""
        lowered = sentence.lower()
        return any(
            (kw in sentence) if case_sensitive else (kw.lower() in lowered)
            for kw, case_sensitive in keywords
        )

    matching_sentences = [sentence for sentence in sentences if mentions_keyword(sentence)]

    return ' '.join(matching_sentences) if matching_sentences else 'N/A'

In [None]:
#mimic['Pneumonia_Sentences'] = mimic['Report'].apply(extract_pneumonia_sentences)
#mimic['Pneumonia_Sentences']

# Post Processing

In [None]:
# Read back the two halves annotated by the LLM and stitch them into one table.
df_0, df_1 = (
    pd.read_csv(f'/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pneumonia-full-wreports_{part_idx}_llm.csv')
    for part_idx in (0, 1)
)

# ignore_index gives the combined frame a fresh 0..n-1 index.
mimic = pd.concat([df_0, df_1], ignore_index=True, sort=False)
mimic

In [None]:
# parsing pneumonia class from LLM responses

def _parse_llm_class(response):
    """Extract the integer class from the last <EVALUATION> tag of an LLM response.

    Args:
        response: raw LLM output; non-string values (e.g. NaN for rows that were
            never sent to the model) are accepted.

    Returns:
        The integer inside the last EVALUATION tag, or np.nan when the response is not
        a string, contains no such tag, or the tag content is not an integer. Returning
        NaN keeps one malformed response from aborting the whole column.
    """
    if not isinstance(response, str):
        return np.nan

    # Upper-casing and stripping spaces tolerates variations such as "<evaluation> 1 </evaluation>".
    tags = re.findall(r'EVALUATION>(.*?)</EVALUATION>', response.upper().replace(' ', ''))
    if not tags:
        return np.nan
    try:
        return int(tags[-1])
    except ValueError:
        return np.nan


mimic['LLM_class'] = mimic.LLM_Pneumonia.apply(_parse_llm_class)
mimic

In [None]:
# Persist the combined table, including the parsed LLM class, without the index column.
mimic.to_csv(
    '/Data/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-pneumonia-full-wreports_llm_complete.csv',
    index=False,
)

In [None]:
# Sample of false-positive rule-based NLP labels for pneumonia in MIMIC-CXR.
# Presumably, positive weight is given to 'suggest pneumonia' in phrases like
# 'no findings to suggest pneumonia'.

nlp_positive = mimic[mimic.Pneumonia == 1]

# Negating phrases, matched case-insensitively as plain substrings.
negated_phrases = [
    'no findings to suggest pneumonia',
    'no findings to suggest infection',
    'no pneumonia',
    'no evidence of pneumonia',
]
negated = nlp_positive.Report.str.contains(negated_phrases[0], case=False, regex=False)
for phrase in negated_phrases[1:]:
    negated = negated | nlp_positive.Report.str.contains(phrase, case=False, regex=False)

nlp_positive[negated]