# Thesis Code Part 1: Preprocess

## Import Packages

In [9]:
import re
import os
import pandas as pd
import numpy as np
import tqdm
import pydicom as dcm
from PIL import Image
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## UCSF CXR Dataset

In [2]:
# Read report data
data = pd.read_csv('/mnt/sohn2022/UCSF_secure_cxr_data/Secure_UCSF_CXR_01012022_to_02132023.csv',index_col=0)
data = data.iloc[:,:13]
data.head()

Unnamed: 0,Organization,Point of Care,Source System,Accession Number,Modality,Exam Code,Exam Description,CPT Code,Report Text,Is Stat,Patient Status,Patient Sex,Patient Age
0,MBXR1,RAD X-RAY MB,EPIC,10022207761,XR,DCHE2,XR CHEST 2 VIEWS PA AND LATERAL,71020,XR CHEST 2 VIEWS PA AND LATERAL 6/30/2022 11...,False,Emergency,Male,39.0
1,PDR11,RAD X-RAY PARN,EPIC,10022207760,XR,DCHE1,XR CHEST 1 VIEW AP,71010,XR CHEST 1 VIEW AP 6/30/2022 11:53 PM\nHISTO...,False,Emergency,Male,41.0
2,PDR9,RAD X-RAY PARN,EPIC,10022207745,XR,DCHE1,XR CHEST 1 VIEW AP,71010,XR CHEST 1 VIEW AP 6/30/2022 11:36 PM\nHISTO...,False,Inpatient,Female,25.0
3,PDR9,RAD X-RAY PARN,EPIC,10022207747,XR,DCHE1,XR CHEST 1 VIEW AP,71010,XR CHEST 1 VIEW AP 6/30/2022 11:26 PM\nHISTO...,False,Inpatient,Female,60.0
4,MBP9,RAD X-RAY MB,EPIC,10022207748,XR,DCHE1,XR CHEST 1 VIEW AP,71010,XR CHEST 1 VIEW AP 6/30/2022 11:21 PM\nINDIC...,False,Inpatient,Female,0.27


In [3]:
# Reports that need further communication
comm = data[data['Report Text'].str.contains('communicate|discuss')]
comm['comm'] = 1

# combined
sample = data.drop(list(comm.index)).sample(2000)
sample['comm'] = 0
data = pd.concat([sample,comm])
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comm['comm'] = 1


Unnamed: 0,Organization,Point of Care,Source System,Accession Number,Modality,Exam Code,Exam Description,CPT Code,Report Text,Is Stat,Patient Status,Patient Sex,Patient Age,comm
13427,PDR9,RAD X-RAY PARN,EPIC,10022303835,XR,DCHE1,XR CHEST 1 VIEW AP,71010,XR CHEST 1 VIEW AP 8/5/2022 9:28 PM\nHISTORY...,False,Inpatient,Female,47.0,0
20996,BOPCXR01,RAD X-RAY BOPC,EPIC,10021981603,XR,DCHE2,XR CHEST 2 VIEWS PA AND LATERAL,71020,XR CHEST 2 VIEWS PA AND LATERAL 4/6/2022 4:2...,False,Outpatient,Male,31.0,0
9245,BCH OAK X-RAY ROOM 2 X-RAY,BCH OAK DIAGNOSTIC IMAGING,EPIC,10022357851,XR,DCHE2,XR CHEST 2 VIEWS PA AND LATERAL,71020,XR CHEST 2 VIEWS PA AND LATERAL 8/24/2022 5:...,False,Emergency,Male,4.0,0
1086,MBP6,RAD X-RAY MB,EPIC,10022463203,XR,DCHE1,XR CHEST 1 VIEW AP,71010,XR CHEST 1 VIEW AP 9/27/2022 5:55 AM\nINDICA...,False,Inpatient,Male,0.48,0
18817,BCH OAK X-RAY ROOM 1 X-RAY,BCH OAK DIAGNOSTIC IMAGING,EPIC,10022241755,XR,DCHE2,XR CHEST 2 VIEWS PA AND LATERAL,71020,PROCEDURE: XR Chest 2 Views\nTECHNIQUE: Chest ...,False,Outpatient,Male,15.0,0


## Preprocess

In [4]:
# code from Adrian
def preprocess_findings(report_text):     
    findings_impression = re.search(r"((?<=\nFINDINGS\/IMPRESSION:|FINDINGS\/IMPRESSION:\n)(.|\n)*)", report_text)
    if findings_impression:
        findings_impression = findings_impression.group(0).split('"')[0]
        findings_impression = re.sub(r"(?=(Impression discussed|Further impression|Final impression|Attestation|Radiologist|Electronically|This change was|Report dictated))(.|\n)*", "", findings_impression)
        return findings_impression.strip()
    return 'None'

In [5]:
# code from Adrian
def preprocess_dcm_path(i):   
    row = data.iloc[i]
    report_text = row['Report Text']

    accession_number = str(row['Accession Number'])
    patient_folder = f"{data_folder}/raw/{accession_number}" 
    
    patient_folder_files = os.listdir(patient_folder)

    for j in range(len(patient_folder_files)):
        study = patient_folder_files[j]
        if not os.path.isdir(f"{patient_folder}/{study}"):
            return 'None'
        image_files = os.listdir(f"{patient_folder}/{study}")
        ds = dcm.dcmread(f"{patient_folder}/{study}/{image_files[0]}")

        if 'Image Storage' in ds.SOPClassUID.name and 'ViewPosition' in ds and ds.ViewPosition != 'LL': 
            return f"{patient_folder}/{study}/{image_files[0]}"
    return 'None'

In [6]:
data_folder = '/mnt/sohn2022/UCSF_secure_cxr_data'

data['Downloaded'] = data['Accession Number'].apply(str).isin(os.listdir(data_folder + '/raw'))
data = data[data['Downloaded']]
#data = data.drop(['Unnamed: 0'], axis=1)

print('Caption Preprocessing\n===============')
captions = []
for i in tqdm.tqdm(range(len(data))):
    captions.append(preprocess_findings(data['Report Text'].iloc[i]))
data['Caption'] = captions

data = data[data['Caption'].apply(len) < 400].reset_index(drop=True)

print('DCM Path Preprocessing\n===============')
dcm_paths = []
for i in tqdm.tqdm(range(len(data))):
    dcm_paths.append(preprocess_dcm_path(i))
data['DCM Path'] = dcm_paths
data = data[data['Caption'] != 'None']
data = data[data['DCM Path'] != 'None']

data = data.reset_index(drop=True)

Caption Preprocessing


100%|████████████████████████████████████| 3458/3458 [00:00<00:00, 25429.47it/s]


DCM Path Preprocessing


100%|███████████████████████████████████████| 2883/2883 [04:43<00:00, 10.17it/s]


In [7]:
data.to_csv('data/data_processed.csv', index=False)

## Train Test Split

In [10]:
train_val_dataset, test_dataset = train_test_split(data,test_size=0.2,train_size=0.8)
train_dataset, val_dataset = train_test_split(train_val_dataset,test_size=0.25,train_size=0.75)

In [11]:
train_val_dataset = train_val_dataset.reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
val_dataset = val_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)

In [13]:
train_val_dataset.to_csv('data/train_val_dataset.csv',index=False)
train_dataset.to_csv('data/train_dataset.csv',index=False)
val_dataset.to_csv('data/val_dataset.csv',index=False)
test_dataset.to_csv('data/test_dataset.csv',index=False)