# Positive samples for the dataset

## Extracting relevant information from the dataset

In [1]:
import pandas as pd
import numpy as np
import pydicom
import png
import math
from PIL import Image
from pathlib import Path
import boto3
import matplotlib.pyplot as plt
from collections import Counter

# Root locations: the EMBED open-data checkout and the drive holding the images
data_path = '/home/szelesteya/projects/EMBED_Open_Data/'
image_root_path = '/media/szelesteya/F824D4D024D492CC/EMBED-images/'

# Locations derived from the two roots (note: the PNG directory has no trailing slash)
tables_path = f'{data_path}tables/'
image_dcm_path = f'{image_root_path}dicom-positive/'
image_png_path = f'{image_root_path}positive'

In [2]:
# Load the legend table that explains the codes used in the clinical export
df_legends = pd.read_csv(tables_path + 'AWS_Open_Data_Clinical_Legend.csv')

# Restrict the legend to the rows that describe the 'calcfind' column
is_calcfind_row = df_legends['Header in export'].isin(['calcfind'])
df_calcs = df_legends.loc[is_calcfind_row, ['Code', 'Meaning']]

# Discard the codes V, U, S and M, which are treated as irrelevant here
excluded_codes = ['V', 'U', 'S', 'M']
df_calcs_rel = df_calcs[~df_calcs['Code'].isin(excluded_codes)]

# df_calcs_rel

In [3]:
# Load the clinical findings table
cli_df = pd.read_csv(tables_path + 'EMBED_OpenData_clinical.csv', low_memory=False)

# Columns of the clinical table that are carried over to the positive set
clinical_columns = [
    'Unnamed: 0',
    'empi_anon',
    'acc_anon',
    'side',
    'calcfind',
    'calcdistri',
    'otherfind',
    'numfind',
    'path_severity',
    'age_at_study',
    'ETHNICITY_DESC',
    'study_date_anon',
    'asses',
]

# Keep only the findings whose calcification code is one of the relevant legend codes
has_relevant_calc = cli_df['calcfind'].notna() & cli_df['calcfind'].isin(df_calcs_rel['Code'])
pos_cli_df = cli_df.loc[has_relevant_calc, clinical_columns]

# The image metadata has its own 'study_date_anon'; rename this one before the merge
pos_cli_df = pos_cli_df.rename(columns={'study_date_anon': 'diag_study_date'})

In [4]:
# Load the per-image metadata table
meta_df = pd.read_csv(tables_path + 'EMBED_OpenData_metadata_reduced.csv', low_memory=False)

# Metadata columns needed for the positive set
meta_columns = [
    'empi_anon',
    'acc_anon',
    'ImageLateralityFinal',
    'anon_dicom_path',
    'study_date_anon',
    'ViewPosition',
    'num_roi',
    'ROI_coords',
    'spot_mag',
]

# Keep 2D images whose ROI string is not "()" and whose spot_mag value is not 1
keep_image = (
    (meta_df['ROI_coords'] != "()")
    & (meta_df['FinalImageType'] == '2D')
    & (meta_df['spot_mag'] != 1)
)
meta_red_df = meta_df.loc[keep_image, meta_columns]

# Name the laterality column 'side', like in the clinical table, so both can be merged on it
meta_red_ren_df = meta_red_df.rename(columns={'ImageLateralityFinal': 'side'})

In [5]:
# Join every clinical finding with the image rows that share its 'empi_anon', 'acc_anon' and 'side'
pos_full_df = pos_cli_df.merge(meta_red_ren_df, on=['empi_anon', 'acc_anon', 'side'])

# DICOM path used for pulling the files: the original path without its first five '/'-separated parts
pos_full_df['relative_dcm_path'] = pos_full_df['anon_dicom_path'].map(
    lambda dicom_path: '/'.join(dicom_path.split('/')[5:])
)

# Columns that make up the final table, in output order
kept_columns = [
    'empi_anon',
    'acc_anon',
    'side',
    'asses',
    'age_at_study',
    'calcfind',
    'calcdistri',
    'otherfind',
    'numfind',
    'ViewPosition',
    'num_roi',
    'ROI_coords',
    'ETHNICITY_DESC',
    'study_date_anon',
    'diag_study_date',
    'relative_dcm_path',
    'spot_mag',
]

# More consistent snake_case names for some of the columns
consistent_names = {
    'ETHNICITY_DESC': 'eth_desc',
    'calcfind': 'calc_find',
    'calcdistri': 'calc_distrib',
    'otherfind': 'other_find',
    'ViewPosition': 'view_pos',
    'numfind': 'num_find',
}
pos_empi_df = pos_full_df[kept_columns].rename(columns=consistent_names)

# Parse both dates; values that cannot be parsed become NaT
pos_empi_df['diag_study_date'] = pd.to_datetime(pos_empi_df['diag_study_date'],
                                                errors='coerce',
                                                format='%Y-%m-%d')
pos_empi_df['study_date_anon'] = pd.to_datetime(pos_empi_df['study_date_anon'],
                                                errors='coerce')

# Keep only rows where the clinical date is 0 to 180 days (inclusive) after the
# image date; with a larger gap the diagnosis might no longer describe the image
pos_empi_df['diag_date_diff'] = pos_empi_df['diag_study_date'] - pos_empi_df['study_date_anon']
days_between = pos_empi_df['diag_date_diff'].dt.days
pos_empi_rel_df = pos_empi_df.loc[(days_between >= 0) & (days_between <= 180)]

In [6]:
# Number of selected rows per calcification code
counts = Counter(pos_empi_rel_df['calc_find'])

# Histogram of the calcification codes, saved under the name 'calc_types' and shown inline
bins = plt.hist(pos_empi_rel_df['calc_find'])
plt.savefig('calc_types')
plt.show()

# Last expression of the cell: display the counts
counts

In [7]:
# Store the selected rows (DataFrame index included) for the PNG conversion step
positive_csv = data_path + 'positive_empirical.csv'
with open(positive_csv, 'w') as csv_file:
    pos_empi_rel_df.to_csv(csv_file)

## Pulling the DICOM images with a bash script

In [8]:
# Write each distinct relative DICOM path once, so the bash cell can read the list
paths_csv = data_path + 'positive_path.csv'
unique_dcm_paths = pos_empi_rel_df['relative_dcm_path'].drop_duplicates()
with open(paths_csv, 'w') as csv_file:
    unique_dcm_paths.to_csv(csv_file, index=False)

In [9]:
%%bash -s "$image_dcm_path" "{data_path}positive_path.csv"

# Pulling dicom files with AWS CLI (Python API didn't work)
#   $1 - destination directory prefix for the DICOM files (paths are appended
#        to it directly, so it is expected to end with '/')
#   $2 - CSV file with a header line followed by one relative DICOM path per line
dcm_dest_path="$1"
dcm_paths="$2"
ind=1

# Number of paths to process: every line of the CSV except the header
total=$(( $(wc -l < "$dcm_paths") - 1 ))

tail -n +2 "$dcm_paths" | while IFS= read -r line; do
    # Re-join whitespace-separated fields with '/' (this also trims surrounding blanks)
    relative_path=$(echo "$line" | awk -v OFS='/' '{$1=$1; print}')
    # The same path without its first two directory levels
    dcm_name=$(echo "$relative_path" | cut -d '/' -f 3-)

    file="${dcm_dest_path}${relative_path}"
    dir=$(dirname "$file")
    mkdir -p "$dir"
    echo "$ind / $total"

    if [ -f "$file" ]; then
        echo "File already present"
    elif [ -f "${dcm_dest_path}${dcm_name}" ]; then
        # The file already exists under the shortened path: move it into place
        dcm="${dcm_dest_path}${dcm_name}"
        dir_dcm=$(dirname "$dcm")
        echo "Moving file from ${dir_dcm}"
        mv "$dcm" "$file"
    else
        echo "Pulling file $file"
        # Report a failed download instead of silently continuing
        aws s3 cp "s3://embed-dataset-open/images/$relative_path" "$file" --profile my-dev-profile \
            || echo "Failed to pull $file" >&2
    fi

    ind=$((ind+1))
done

### Converting the DICOM images to PNG

In [8]:
# Rescale the pixel intensities of an image to the 8-bit range
def rescale_to_8bit(image_array):
    """Stretch the 2nd-98th percentile intensity window of an image to 0-255.

    Values at or below the 2nd percentile become 0, values at or above the
    98th percentile become 255, and everything in between is scaled linearly
    and rounded.

    Args:
        image_array: Numeric NumPy array (or array-like) of pixel intensities.

    Returns:
        ``np.uint8`` array with the same shape as ``image_array``. If the 2nd
        and 98th percentiles coincide (e.g. a constant image) there is no
        window to stretch and an all-zero array is returned.
    """
    pixels = np.asarray(image_array, dtype=np.float64)
    lower_percentile, upper_percentile = np.percentile(pixels, [2, 98])
    intensity_range = upper_percentile - lower_percentile

    # Without this guard a flat image would divide 0 by 0 and push NaN into
    # the uint8 cast, whose result is undefined
    if intensity_range == 0:
        return np.zeros(pixels.shape, dtype=np.uint8)

    rescaled_array = np.clip((pixels - lower_percentile) / intensity_range, 0, 1)
    return np.round(rescaled_array * 255).astype(np.uint8)

# NOTE(review): this definition is replaced by the two-argument
# generate_png_path defined further down in this cell, so after the cell has
# run this version is no longer reachable under this name.
def generate_png_path(dcm_path):
    """Build a PNG path from the last component of a DICOM path.

    NOTE(review): `image_path` is not defined anywhere in the visible code;
    unless it is defined elsewhere, calling this version raises NameError.
    """
    # Get new file name: drop the last four characters (presumably the '.dcm'
    # extension - confirm) and keep the last '/'-separated component
    split_fn = dcm_path[:-4].split('/')
    new_fn = f"{split_fn[-1]}_conv.png"
    return image_path + new_fn

# Save DICOM pixel array as PNG
def save_dcm_image_as_png(image, png_filename, bitdepth=8):
    """Rescale a pixel array to 8 bits and write it as a greyscale PNG.

    Args:
        image: 2D pixel array (rows x columns); it is passed through
            ``rescale_to_8bit`` before being written.
        png_filename: Path of the PNG file to create. Missing parent
            directories are created.
        bitdepth: Bit depth handed to the PNG writer. The pixel data is always
            rescaled to the 0-255 range, whatever value is given here.
    """
    # Do all the work that can fail before the file is created: an empty PNG
    # left behind by a failed conversion would later be treated as an already
    # converted image by process_dcm_list and never be retried
    rescaled = rescale_to_8bit(image)
    writer = png.Writer(height=rescaled.shape[0],
                        width=rescaled.shape[1],
                        bitdepth=bitdepth,
                        greyscale=True)

    # open() does not create directories, so make sure the target one exists
    Path(png_filename).parent.mkdir(parents=True, exist_ok=True)

    with open(png_filename, 'wb') as f:
        writer.write(f, rescaled.tolist())

def generate_png_path(acc_anon, png_dir):
    """Return the PNG path '<png_dir>/<acc_anon>_neg_conv.png'.

    The result depends only on the two arguments, so equal ``acc_anon`` values
    always give the same path.

    NOTE(review): the '_neg' infix is kept unchanged although this notebook
    handles positive samples - confirm that it is intended.
    """
    return f'{png_dir}/{acc_anon}_neg_conv.png'

# Convert list of DICOMs to PNGs
def process_dcm_list(dcm_list, png_list):
    """Convert every DICOM in ``dcm_list`` to the PNG at the same index of ``png_list``.

    A DICOM is skipped when its PNG path already exists, so an interrupted
    run can be repeated without converting finished images again.

    Args:
        dcm_list: Paths of the DICOM files to read.
        png_list: Target PNG paths; entry ``i`` belongs to ``dcm_list[i]``.
    """
    for position, dicom_file in enumerate(dcm_list):
        target_png = png_list[position]

        # Already converted in an earlier run
        if Path(target_png).exists():
            continue

        print(f"Processing DICOM #{position}...")

        # Read the pixel data and store it as PNG
        pixels = pydicom.dcmread(dicom_file).pixel_array
        save_dcm_image_as_png(pixels, target_png)

def extract_images(data_file_name, dcm_dir, png_dir):
    """Convert the DICOMs listed in a CSV file to PNGs.

    Args:
        data_file_name: CSV file with at least the columns
            'relative_dcm_path' and 'acc_anon'.
        dcm_dir: Prefix that is string-concatenated with each relative DICOM
            path (so it should end with a path separator).
        png_dir: Directory under which the PNG paths are generated.

    Returns:
        The DataFrame read from the CSV with an added 'png_path' column. The
        column is filled for every row, whether or not its DICOM exists.

    NOTE(review): the PNG name is derived from 'acc_anon' only, so rows that
    share an 'acc_anon' share one PNG path and only the first existing DICOM
    among them is converted - confirm that this is intended.
    """
    df = pd.read_csv(data_file_name)

    # Insert png path
    df['png_path'] = df['acc_anon'].apply(lambda x: generate_png_path(x, png_dir))

    # Collect the DICOM and PNG path of the same row together. Filtering only
    # the DICOM list by file existence would shift the pairing as soon as one
    # DICOM is missing and write images to the PNG paths of other rows.
    dcm_list = []
    png_list = []
    for relative_dcm_path, png_path in zip(df['relative_dcm_path'], df['png_path']):
        dcm_path = dcm_dir + relative_dcm_path
        if Path(dcm_path).exists():
            dcm_list.append(dcm_path)
            png_list.append(png_path)

    # Convert DICOMs
    process_dcm_list(dcm_list, png_list)

    return df

In [9]:
# Convert the positive DICOMs to PNGs; the returned table has a PNG path for every row
positive_csv = data_path + 'positive_empirical.csv'
pos_img_emp = extract_images(positive_csv, image_dcm_path, image_png_path)

# Store the extended table without the DataFrame index
positive_png_csv = data_path + 'positive_empirical_png.csv'
with open(positive_png_csv, 'w') as csv_file:
    pos_img_emp.to_csv(csv_file, index=False)