# Thesis Code Part 1: Preprocess

## Import Packages

In [1]:
import re
import os
import pandas as pd
import numpy as np
import tqdm
import pydicom as dcm
from PIL import Image
import matplotlib.pyplot as plt

## UCSF CXR Dataset

In [2]:
# Read report data
data = pd.read_csv('/mnt/sohn2022/UCSF_secure_cxr_data/Secure_UCSF_CXR_01012022_to_02132023.csv',index_col=0)
data = data.iloc[:,:13]
data.head()

Unnamed: 0,Organization,Point of Care,Source System,Accession Number,Modality,Exam Code,Exam Description,CPT Code,Report Text,Is Stat,Patient Status,Patient Sex,Patient Age
0,MBXR1,RAD X-RAY MB,EPIC,10022207761,XR,DCHE2,XR CHEST 2 VIEWS PA AND LATERAL,71020,XR CHEST 2 VIEWS PA AND LATERAL 6/30/2022 11...,False,Emergency,Male,39.0
1,PDR11,RAD X-RAY PARN,EPIC,10022207760,XR,DCHE1,XR CHEST 1 VIEW AP,71010,XR CHEST 1 VIEW AP 6/30/2022 11:53 PM\nHISTO...,False,Emergency,Male,41.0
2,PDR9,RAD X-RAY PARN,EPIC,10022207745,XR,DCHE1,XR CHEST 1 VIEW AP,71010,XR CHEST 1 VIEW AP 6/30/2022 11:36 PM\nHISTO...,False,Inpatient,Female,25.0
3,PDR9,RAD X-RAY PARN,EPIC,10022207747,XR,DCHE1,XR CHEST 1 VIEW AP,71010,XR CHEST 1 VIEW AP 6/30/2022 11:26 PM\nHISTO...,False,Inpatient,Female,60.0
4,MBP9,RAD X-RAY MB,EPIC,10022207748,XR,DCHE1,XR CHEST 1 VIEW AP,71010,XR CHEST 1 VIEW AP 6/30/2022 11:21 PM\nINDIC...,False,Inpatient,Female,0.27


In [3]:
# Reports that need further communication
comm = data[data['Report Text'].str.contains('communicate|discuss')]
comm['comm'] = 1

# combined
sample = data.drop(list(comm.index)).sample(2000)
sample['comm'] = 0
data = pd.concat([sample,comm])
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comm['comm'] = 1


Unnamed: 0,Organization,Point of Care,Source System,Accession Number,Modality,Exam Code,Exam Description,CPT Code,Report Text,Is Stat,Patient Status,Patient Sex,Patient Age,comm
14359,MBP4,RAD X-RAY MB,EPIC,10022051976,XR,DCHE1,XR CHEST 1 VIEW AP,71010,XR CHEST 1 VIEW AP 5/3/2022 6:07 AM\nINDICAT...,False,Inpatient,Male,0.08,0
2876,M02,RAD X-RAY PARN,EPIC,10022729845,XR,DCHE2,XR CHEST 2 VIEWS PA AND LATERAL,71020,XR CHEST 2 VIEWS PA AND LATERAL 12/19/2022 1...,False,Inpatient,Female,64.0,0
8885,MBXR1,RAD X-RAY MB,EPIC,10022781214,XR,DCHE2,XR CHEST 2 VIEWS PA AND LATERAL,71020,XR CHEST 2 VIEWS PA AND LATERAL 1/10/2023 8:...,False,Outpatient,Male,28.0,0
15515,E01,RAD X-RAY PARN,EPIC,10021786863,XR,DCHE2,XR CHEST 2 VIEWS PA AND LATERAL,71020,INDICATION:\nXR CHEST 2 VIEWS PA AND LATERAL ...,False,Emergency,Female,42.0,0
7275,PDR9,RAD X-RAY PARN,EPIC,10022677396,XR,DCHE1,XR CHEST 1 VIEW AP,71010,XR CHEST 1 VIEW AP 12/3/2022 10:00 PM\nHISTO...,False,Inpatient,Male,86.0,0


## Preprocess

In [4]:
# code from Adrian
def preprocess_findings(report_text):     
    findings_impression = re.search(r"((?<=\nFINDINGS\/IMPRESSION:|FINDINGS\/IMPRESSION:\n)(.|\n)*)", report_text)
    if findings_impression:
        findings_impression = findings_impression.group(0).split('"')[0]
        findings_impression = re.sub(r"(?=(Impression discussed|Further impression|Final impression|Attestation|Radiologist|Electronically|This change was|Report dictated))(.|\n)*", "", findings_impression)
        return findings_impression.strip()
    return 'None'

In [5]:
# code from Adrian
def preprocess_dcm_path(i):   
    row = data.iloc[i]
    report_text = row['Report Text']

    accession_number = str(row['Accession Number'])
    patient_folder = f"{data_folder}/raw/{accession_number}" 
    
    patient_folder_files = os.listdir(patient_folder)

    for j in range(len(patient_folder_files)):
        study = patient_folder_files[j]
        if not os.path.isdir(f"{patient_folder}/{study}"):
            return 'None'
        image_files = os.listdir(f"{patient_folder}/{study}")
        ds = dcm.dcmread(f"{patient_folder}/{study}/{image_files[0]}")

        if 'Image Storage' in ds.SOPClassUID.name and 'ViewPosition' in ds and ds.ViewPosition != 'LL': 
            return f"{patient_folder}/{study}/{image_files[0]}"
    return 'None'

In [6]:
data_folder = '/mnt/sohn2022/UCSF_secure_cxr_data'

data['Downloaded'] = data['Accession Number'].apply(str).isin(os.listdir(data_folder + '/raw'))
data = data[data['Downloaded']]
#data = data.drop(['Unnamed: 0'], axis=1)

print('Caption Preprocessing\n===============')
captions = []
for i in tqdm.tqdm(range(len(data))):
    captions.append(preprocess_findings(data['Report Text'].iloc[i]))
data['Caption'] = captions

data = data[data['Caption'].apply(len) < 400].reset_index(drop=True)

print('DCM Path Preprocessing\n===============')
dcm_paths = []
for i in tqdm.tqdm(range(len(data))):
    dcm_paths.append(preprocess_dcm_path(i))
data['DCM Path'] = dcm_paths
data = data[data['Caption'] != 'None']
data = data[data['DCM Path'] != 'None']

data = data.reset_index(drop=True)

Caption Preprocessing


100%|████████████████████████████████████| 3461/3461 [00:00<00:00, 26712.95it/s]


DCM Path Preprocessing


100%|███████████████████████████████████████| 2883/2883 [04:42<00:00, 10.21it/s]


In [7]:
data.to_csv('data_processed.csv', index=False)