In [3]:
!pip install pydicom
!pip install beautifulsoup4
!pip install lxml

Collecting pydicom
  Downloading pydicom-2.3.1-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.3.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Collecting lxml
  Downloading lxml-4.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import pydicom
import cv2
import matplotlib.pyplot as plt
from PIL import Image
from IPython.display import display
import pandas as pd
import os
import numpy as np
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import json

In [14]:
def normalize_image(img_array):
    norm_img_array = (img_array - img_array.min()) / \
        (img_array.max() - img_array.min()) * 255
    return norm_img_array

# CBIS-DDSM

Import CBIS-DDSM dataset.
There is two directories:
- Folders with \<Mass\|Calc\>_\<Training\|Test\>_P_\<Patient_id\>_\<Left\|Right\>_\<CC\|MLO\>\(_\<N_Tumor\>\)
    - Folders named with mammography_id:
        - Folder type mammography 1.000000-\<ROI\|cropped\|full\>:
            - File

## Generate images png

In [39]:
ROOT_DIR_DDSM="/tf/data/CBIS-DDSM"
OUTPUT_DIR_DDSM="/tf/data/CBIS-DDSM/png_images"

os.makedirs(OUTPUT_DIR_DDSM, exist_ok=True)

count=0
for (path, _, files) in os.walk(ROOT_DIR_DDSM, topdown=False):
    for file in files:
        if file.endswith(".dcm"):
            file_path = os.path.join(path, file)
            image = pydicom.dcmread(file_path).pixel_array
            norm_img=normalize_image(image)
            patient_id = file_path.split("/")[4]

            # If the word "full" is in the path, it means that it's the complete mammography
            if "full" in file_path:
                new_name = patient_id + "_FULL" + ".png"
            else:
                num_colors = len(np.unique(image).tolist())
                if num_colors == 2:
                    new_name = patient_id + "_MASK" + ".png"
                else:
                    continue
            cv2.imwrite(os.path.join(OUTPUT_DIR_DDSM, new_name),norm_img) 
            count+=1
            print(count,flush=True,end="\r")       

6671

## Generate ROI data

In [86]:
def get_roi_area(img_path):
    img = cv2.imread(img_path)
    
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    contours, _ = cv2.findContours(
        gray, cv2.RETR_EXTERNAL,  cv2.CHAIN_APPROX_SIMPLE)
    
    max_contour = max(contours, key=cv2.contourArea)
    
    # x, y, w, h
    rect = cv2.boundingRect(max_contour)
    
    return rect

def find_nth_last(full_string, sub_string, n):
    start = full_string.rfind(sub_string)
    while start >= 0 and n > 0:
        start = full_string.rfind(sub_string, 0, start)
        n -= 1
    return start

def generate_key_and_type(filename):
    return file[:find_nth_last(file,"_",1)], file.split("-")[0]

def generate_info_dict(type_abnormality, roi):
    return {"type": type_abnormality, "x":roi[0], "y":roi[1], "w":roi[2], "h":roi[3]}


In [87]:
from collections import defaultdict
OUTPUT_DDSM_JSON="/tf/data/CBIS-DDSM/"

rois_coordinates=defaultdict(lambda: [])

for (path, _, files) in os.walk(OUTPUT_DIR_DDSM, topdown=False):
    for i, file in enumerate(files):
        if "MASK" in file:
            file_path = os.path.join(path, file)
            key, type_abnormality = generate_key_and_type(file)
            roi=get_roi_area(file_path)
            
            info = generate_info_dict(type_abnormality, roi)
            
            rois_coordinates[key].append(info)
        print(i,flush=True,end="\r")

with open(os.path.join(OUTPUT_DDSM_JSON, "roi_images.json"), 'w') as fp:
    json.dump(rois_coordinates, fp)

6670

# MIAS DATASET
Import MIAS dataset.

It contains:
- <b>Info.txt</b>: contains info about each photo:
    - photo reference
    - type of bacground tissue
        - F: Fatty
        - G: Fatty-glandular
        - D: Dense-glandular)
    - Type of abnormality present 
        - CALC: Calcification
        - CIRC: Well-defined/circumscribed masses
        - SPIC: Spiculated masses
        - MISC: Other, ill-defined masses
        - ARCH: Architectural distortion
        - ASYM: Asymmetry
        - NORM: Normal
    - Severity of abnormality
        - B: Benign
        - M: Malignant
    - X Coordinate of center of abnomality
    - Y Coordinate of center of abnomality
    - Radius size (in pixels) of abnormality
    
    
- <b>all-mias</b>: contains all images in .pgm format

## Generate images png

In [17]:
ROOT_DIR_MIAS="/tf/data/MIAS/all-mias"
OUTPUT_DIR_MIAS="/tf/data/MIAS/all-mias-png"

os.makedirs(OUTPUT_DIR_MIAS, exist_ok=True)

for file in os.listdir(ROOT_DIR_MIAS):
    if file.endswith(".pgm"):
        img = cv2.imread(os.path.join(ROOT_DIR_MIAS,file),-1)
        norm_img=normalize_image(img)
        cv2.imwrite(os.path.join(OUTPUT_DIR_MIAS, file)[:-4]+".png",norm_img)

## Generate ROI data

In [96]:
def generate_roi_from_circle(x_center, y_center, diam):
    return [x_center-diam/2, y_center-diam/2, diam, diam]

In [102]:
import csv

INPUT_MIAS_INFO="/tf/data/MIAS/Info.txt"
OUTPUT_MIAS_JSON="/tf/data/MIAS"

rois_coordinates=defaultdict(lambda: [])

with open(INPUT_MIAS_INFO, 'r') as csvfile:
    csv_container = csv.reader(csvfile, delimiter=' ')
    header = next(csv_container)
    for i, row in enumerate(csv_container):         
        type_abnormality=row[2]
        if type_abnormality != "NORM" and len(row)>5:
            x_center=float(row[-3])
            y_center=float(row[-2])
            diam=float(row[-1])
            
            roi=generate_roi_from_circle(x_center,y_center,diam)
            info = generate_info_dict(type_abnormality, roi)
            key=row[0]
            rois_coordinates[key].append(info)
            
with open(os.path.join(OUTPUT_MIAS_JSON, "roi_images.json"), 'w') as fp:
    json.dump(rois_coordinates, fp)

# INBreast

INBreast contains multiple folders and files:
- AllDICOMs: dicom files (the name of the file is \<mammography_id\>_\<patient_id\>_MG_\<laterality\>_\<type_of_view\>_ANON where laterality could be "R" or "L" and type_of_view "CC" or "ML"
- AllROI: ROIs of different dicoms in .roi format (format form Osirix, not useful for code)
- AllXML: the same info as in the previous field of the ROI but in xml format
- MedicalReport: anotations of the patients done by the expert (the name of the file is the patient id)
- PectoralMuscle: contains two folders with a similar xml structure for the boudaries of the pectoral muscle
- inbreast.pdf: pdf of the INbreast article
- INbreast.csv and INbreast.xls: summary of the scans done.


## Generate images png

In [4]:
ROOT_DIR_INBREAST="/tf/data/INbreast/AllDICOMs"
OUTPUT_DIR_INBREAST="/tf/data/INbreast/AllPNGs"

os.makedirs(OUTPUT_DIR_INBREAST, exist_ok=True)

inbreast_images = []
for file in os.listdir(ROOT_DIR_INBREAST):
    if file.endswith(".dcm"):
        img = pydicom.dcmread(os.path.join(ROOT_DIR_INBREAST,file)).pixel_array
        norm_img=normalize_image(img)
        cv2.imwrite(os.path.join(OUTPUT_DIR_INBREAST, file)[:-4]+".png",norm_img)

## Generate ROI data

In [282]:
def gen_dict(soup):
    if soup.name is None:
        return
    #check if iterable
    if soup.name == "integer":
        return int(soup.text)
    if soup.name == "real":
        return float(soup.text)
    if soup.name == "string":
        if soup.text.startswith("("):
            points={}
            coords = ["x","y","z"]
            for i, point in enumerate(soup.text[1:-1].split(",")):
                points[coords[i]]=point
            return points
        return soup.text                
    if soup.name == "array":
        return list(filter(lambda x: x is not None, [gen_dict(el) for el in soup]))
    if soup.name == "dict":
        main_dict = {}
        for el in soup: 
            if el.name == "key":
                next_sibling = None
                while next_sibling is None:
                    next_sibling = el.find_next_sibling()
                main_dict[el.text]=gen_dict(next_sibling)
        return main_dict

def xml_to_dict(soup):
    return gen_dict(soup.plist.dict.array.dict)

In [284]:
ROOT_INBREAST_XML="/tf/data/INbreast/AllXML"
OUTPUT_INBREAST_JSON="/tf/data/INbreast"

info_images=[]
for file in os.listdir(ROOT_INBREAST_XML):
    if file.endswith(".xml"):
        with open(os.path.join(ROOT_INBREAST_XML,file), 'r') as f:
            data = f.read()
        soup = BeautifulSoup(data, 'xml') 
        info_dict = xml_to_dict(soup)
        info_dict["id"]=file[0:-4]
        info_images.append(info_dict)
        
with open(os.path.join(OUTPUT_INBREAST_JSON, "all_images_info.json"), 'w') as fp:
    json.dump(info_images, fp)


Some of the ROIs are to small (Area is 0.0, which means it's so small, and only has one point to mark the abnormality). A default zone is created, centered on the provided coordinates, with a size of 5x5 pixels in the final json file.

In [114]:
with open(os.path.join(OUTPUT_INBREAST_JSON, "all_images_info.json"), 'r') as fp:
    data=json.load(fp)
data[0]

{'ImageIndex': 0,
 'NumberOfROIs': 6,
 'ROIs': [{'Area': 0.0,
   'Center': {'x': '0.000000', 'y': ' 0.000000', 'z': ' 0.000000'},
   'Dev': 0.0,
   'IndexInImage': 0,
   'Max': 1619.0,
   'Mean': 1619.0,
   'Min': 1619.0,
   'Name': 'Calcification',
   'NumberOfPoints': 1,
   'Point_mm': [{'x': '0.000000', 'y': ' 0.000000', 'z': ' 0.000000'}],
   'Point_px': [{'x': '132.378006', 'y': ' 2200.719971'}],
   'Total': 1619.0,
   'Type': 19},
  {'Area': 0.003521109465509653,
   'Center': {'x': '0.000000', 'y': ' 0.000000', 'z': ' 0.000000'},
   'Dev': 53.59735870361328,
   'IndexInImage': 1,
   'Max': 1425.0,
   'Mean': 1311.7840576171875,
   'Min': 1172.0,
   'Name': 'Calcification',
   'NumberOfPoints': 5,
   'Point_mm': [{'x': '0.000000', 'y': ' 0.000000', 'z': ' 0.000000'},
    {'x': '0.000000', 'y': ' 0.000000', 'z': ' 0.000000'},
    {'x': '0.000000', 'y': ' 0.000000', 'z': ' 0.000000'},
    {'x': '0.000000', 'y': ' 0.000000', 'z': ' 0.000000'},
    {'x': '0.000000', 'y': ' 0.000000', 

In [122]:
def generate_roi_from_coordinates_list(coords):
    min_x = np.min([float(i["x"]) for i in coords])
    max_x = np.max([float(i["x"]) for i in coords])
    min_y = np.min([float(i["y"]) for i in coords])
    max_y = np.max([float(i["y"]) for i in coords])
    w = int(max_x - min_x)
    h = int(max_y - min_y)
    w = 5 if w==0 else w
    h = 5 if h==0 else h
    return int(min_x), int(min_y), w, h

In [123]:
rois_coordinates=defaultdict(lambda: [])

with open(os.path.join(OUTPUT_INBREAST_JSON, "all_images_info.json"), 'r') as fp:
    data=json.load(fp)
    for mammography_info in data:
        for roi in mammography_info["ROIs"]:
            roi_rectangle = generate_roi_from_coordinates_list(roi["Point_px"])
            info = generate_info_dict(roi["Name"], roi_rectangle)
            rois_coordinates[mammography_info["id"]].append(info)
        
with open(os.path.join(OUTPUT_INBREAST_JSON, "roi_images.json"), 'w') as fp:
    json.dump(rois_coordinates, fp)         

Check the types of abnormalities

In [135]:
types_ab = defaultdict(int)
for k, rois in rois_coordinates.items():
    for roi in rois:
        types_ab[roi["type"]] += 1
types_ab

defaultdict(int,
            {'Calcification': 7142,
             'Spiculated Region': 14,
             'Mass': 116,
             'Cluster': 27,
             'Assymetry': 1,
             'Asymmetry': 5,
             'Distortion': 3,
             'Calcifications': 1,
             'Unnamed': 2,
             'Point 3': 1,
             'Point 1': 1,
             'Spiculated region': 1,
             '': 1,
             'Espiculated Region': 1})

We can get some examples of each abnormality in the photos with its coordinates

In [152]:
for type_ab in types_ab.keys():
    for k, rois in rois_coordinates.items():
        for roi in rois:
            if roi["type"] == type_ab:
                print(f"{type_ab} found in image {k} in coords {roi['x']}, {roi['y']}, and size {roi['w']}x{roi['h']}")
                break
        else:
            continue
        break
                

Calcification found in image 50998413 in coords 132, 2200, and size 5x5
Spiculated Region found in image 50998981 in coords 1975, 872, and size 378x550
Mass found in image 50998981 in coords 2070, 1112, and size 190x193
Cluster found in image 22579916 in coords 745, 1568, and size 164x166
Assymetry found in image 24065461 in coords 2175, 1825, and size 1045x994
Asymmetry found in image 22580576 in coords 308, 890, and size 712x696
Distortion found in image 50998634 in coords 204, 1316, and size 508x452
Calcifications found in image 51070197 in coords 2325, 1959, and size 71x49
Unnamed found in image 20587174 in coords 277, 1794, and size 12x13
Point 3 found in image 22613822 in coords 2792, 368, and size 5x5
Point 1 found in image 22580341 in coords 2301, 2063, and size 5x5
Spiculated region found in image 22670147 in coords 2340, 1093, and size 600x474
 found in image 22670511 in coords 569, 727, and size 5x5
Espiculated Region found in image 24055355 in coords 447, 1257, and size 767