<a href="https://colab.research.google.com/github/deepakri201/NLSTNatureSciData/blob/main/TechnicalValidation/consistencyChecks/NLSTSegvsNLSTSybil.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Compare NLSTSeg lesion segmentations to the NLST Sybil bounding boxes

In this notebook, we compare the lesion segmentations from NLSTSEg to the Sybil bounding boxes.


Deepa Krishnaswamy

Brigham and Women's Hospital

December 2025


# Parameterization

In [1]:
#@title Enter your Project ID here
# initialize this variable with your Google Cloud Project ID!
project_name = "idc-external-018" #@param {type:"string"}

import os
os.environ["GCP_PROJECT_ID"] = project_name

!gcloud config set project $project_name

from google.colab import auth
auth.authenticate_user()

Updated property [core/project].


# Environment setup

In [10]:
!pip install highdicom

Collecting highdicom
  Downloading highdicom-0.27.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pyjpegls>=1.0.0 (from highdicom)
  Downloading pyjpegls-1.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Downloading highdicom-0.27.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyjpegls-1.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyjpegls, highdicom
Successfully installed highdicom-0.27.0 pyjpegls-1.5.1


In [2]:
!pip install idc-index

Collecting idc-index
  Downloading idc_index-0.11.1-py3-none-any.whl.metadata (7.8 kB)
Collecting duckdb<=1.2.1,>=0.10.0 (from idc-index)
  Downloading duckdb-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (966 bytes)
Collecting idc-index-data==23.0.1 (from idc-index)
  Downloading idc_index_data-23.0.1-py3-none-any.whl.metadata (5.4 kB)
Collecting s5cmd>=0.3.2 (from idc-index)
  Downloading s5cmd-0.3.3-py3-none-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (7.0 kB)
Collecting sphinx-click (from idc-index)
  Downloading sphinx_click-6.2.0-py3-none-any.whl.metadata (3.5 kB)
Downloading idc_index-0.11.1-py3-none-any.whl (28 kB)
Downloading idc_index_data-23.0.1-py3-none-any.whl (85.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading duckdb-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━

In [3]:
!pip install pydicom

Collecting pydicom
  Downloading pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pydicom-3.0.1-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-3.0.1


In [4]:
import os
import sys
import time
import shutil

import numpy as np
import pandas as pd
import nibabel as nib
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

import json
from pathlib import Path
import datetime

from google.cloud import bigquery
from google.cloud import storage

from idc_index import IDCClient
idc_client = IDCClient.client()

import pydicom

In [11]:
import highdicom as hd

In [12]:
from pydicom.sr.codedict import codes

# Functions

In [28]:
# Single mask/bbox
def tumor_fraction_in_box(mask, bbox):
  """
  mask: 2D numpy array (0/1 or bool) for tumor
  bbox: (xmin, ymin, xmax, ymax) in pixel coordinates
        (x = cols, y = rows)
  Returns: fraction of tumor voxels inside the box
  """
  xmin, ymin, xmax, ymax = bbox

  # clip to image bounds
  ymin = max(0, ymin); ymax = min(mask.shape[0], ymax)
  xmin = max(0, xmin); xmax = min(mask.shape[1], xmax)

  # box mask
  box_mask = np.zeros_like(mask, dtype=bool)
  box_mask[ymin:ymax, xmin:xmax] = True

  # intersection
  tumor_inside = np.logical_and(mask.astype(bool), box_mask)
  fraction = tumor_inside.sum() / mask.sum() if mask.sum() > 0 else np.nan

  return fraction

# List of masks/bboxes
def tumor_fraction_in_boxes(masks, bboxes):
    """
    Compute overall fraction of tumor voxels inside all boxes.

    Parameters
    ----------
    masks : list of 2D numpy arrays (bool or 0/1)
        Each mask corresponds to one slice.
    bboxes : list of tuples (xmin, ymin, xmax, ymax)
        Each bbox corresponds to the same slice index in masks.

    Returns
    -------
    fraction : float
        Overall fraction of tumor voxels inside all boxes across slices.
        Returns np.nan if no tumor voxels are present at all.
    """
    total_tumor_voxels = 0
    total_tumor_inside = 0

    for mask, bbox in zip(masks, bboxes):
        if mask is None or bbox is None:
            continue

        xmin, ymin, xmax, ymax = bbox

        # Clip to image bounds
        ymin = max(0, ymin); ymax = min(mask.shape[0], ymax)
        xmin = max(0, xmin); xmax = min(mask.shape[1], xmax)

        if ymin >= ymax or xmin >= xmax:
            continue  # skip degenerate boxes

        # Create box mask and compute intersection
        box_mask = np.zeros_like(mask, dtype=bool)
        box_mask[ymin:ymax, xmin:xmax] = True

        tumor_inside = np.logical_and(mask.astype(bool), box_mask)
        total_tumor_inside += tumor_inside.sum()
        total_tumor_voxels += mask.sum()

    if total_tumor_voxels == 0:
        return np.nan

    return total_tumor_inside / total_tumor_voxels



In [29]:
def read_seg(seg_filename):

  seg = pydicom.dcmread(seg_filename)
  SeriesInstanceUID_seg = seg.SeriesInstanceUID
  # Get the list of SegmentNumber and SegmentLabel
  segment_number_original_list = [f.SegmentNumber for f in seg.SegmentSequence]
  segment_label_original_list = [f.SegmentLabel for f in seg.SegmentSequence]
  # Get the pixel array data
  mask_data = seg.pixel_array
  # Need to get the per frame function groups - to know the referencedSOPInstanceUIDs
  pffg = seg.PerFrameFunctionalGroupsSequence
  num_slices_in_seg = len(pffg)
  print('num_slices_in_seg: ' + str(num_slices_in_seg))
  pffg_sop_list = []
  segment_number_list = []
  segment_label_list = []
  error_der_image_sequence = 0
  for n in range(0,num_slices_in_seg):
    try:
      sop = pffg[n].DerivationImageSequence[0].SourceImageSequence[0].ReferencedSOPInstanceUID
      segment_number = pffg[n].SegmentIdentificationSequence[0].ReferencedSegmentNumber
      # get the index of where the segment_number is in the segment_number_original_list
      segment_number_index = segment_number_original_list.index(segment_number)
      segment_label = segment_label_original_list[segment_number_index]
    except:
      print('ERROR: cannot access derivation image sequence')
      error_der_image_sequence = -1
      continue
    pffg_sop_list.append(sop)
    segment_number_list.append(segment_number)
    segment_label_list.append(segment_label)
  # Now create a dataframe with the SOPInstanceUID and the slice index into mask
  df_seg = pd.DataFrame()
  df_seg['SOPInstanceUID'] = pffg_sop_list
  df_seg['mask_slice'] = np.arange(0,len(pffg_sop_list))
  df_seg['SegmentNumber'] = segment_number_list
  df_seg['SegmentLabel'] = segment_label_list
  df_seg['num_of_slices_with_seg'] = [num_slices_in_seg] * len(pffg_sop_list)

  return df_seg, mask_data

In [30]:
def read_sr_using_highdicom(NLSTSybil_filename):

  sr = hd.sr.srread(NLSTSybil_filename)

  # Dataframe to hold the SR info
  df_sr = pd.DataFrame()

  # Get various UIDs
  PatientID = sr.PatientID
  StudyInstanceUID = sr.StudyInstanceUID
  SeriesInstanceUID = sr.SeriesInstanceUID
  referenced_series_instance_uid = sr.CurrentRequestedProcedureEvidenceSequence[0].ReferencedSeriesSequence[0].SeriesInstanceUID

  # Get the image_region_code
  image_region_code = codes.DCM.ImageRegion

  # First get the planar roi measurement gorups
  groups = sr.content.get_planar_roi_measurement_groups()

  # To store the lists for the dataframe
  tracking_identifier_list = []
  tracking_uid_list = []
  referenced_sop_instance_uid_list = []
  finding_type_list = []
  finding_site_list = []
  bbox_list = []
  x0_list = []; y0_list = []
  x1_list = []; y1_list = []
  x2_list = []; y2_list = []
  x3_list = []; y3_list = []

  for group in groups:

    # Get the tracking ids
    tracking_identifier = group.tracking_identifier
    tracking_uid = group.tracking_uid

    # Get the findings and finding_sites
    finding_type = [group.finding_type.CodeValue, group.finding_type.CodingSchemeDesignator, group.finding_type.CodeMeaning]
    finding_sites = []
    for finding_site in group.finding_sites:
      finding_sites.append([finding_site.value.CodeValue,
                            finding_site.value.CodingSchemeDesignator,
                            finding_site.value.CodeMeaning])
    if (len(finding_sites)==1):
      finding_sites = finding_sites[0]

    # Get the Image Region
    referenced_sop_instance_uid = group.roi.ContentSequence[0].referenced_sop_instance_uid
    bbox = group.roi.value

    # Get the individual points
    x0 = bbox[0,0]
    y0 = bbox[0,1]
    x1 = bbox[1,0]
    y1 = bbox[1,1]
    x2 = bbox[2,0]
    y2 = bbox[2,1]
    x3 = bbox[3,0]
    y3 = bbox[3,1]

    # Calculate the width, height and center, as these are needed for display
    min_x = np.min([bbox[0,0], bbox[1,0], bbox[2,0], bbox[3,0]]) # using roi.GraphicData: min_x = np.min([bbox[0], bbox[2], bbox[4], bbox[6]])
    max_x = np.max([bbox[0,0], bbox[1,0], bbox[2,0], bbox[3,0]]) # using roi.GraphicData: max_x = np.max([bbox[0], bbox[2], bbox[4], bbox[6]])
    min_y = np.min([bbox[0,1], bbox[1,1], bbox[2,1], bbox[3,1]]) # using roi.GraphicData: min_y = np.min([bbox[1], bbox[3], bbox[5], bbox[7]])
    max_y = np.max([bbox[0,1], bbox[1,1], bbox[2,1], bbox[3,1]]) # using roi.GraphicData: max_y = np.max([bbox[1], bbox[3], bbox[5], bbox[7]])
    width = max_x - min_x
    height = max_y - min_y
    # in pixel coordinates
    center_x = min_x + width/2
    center_y = min_y + height/2

    # Append to list
    tracking_identifier_list.append(tracking_identifier)
    tracking_uid_list.append(tracking_uid)
    referenced_sop_instance_uid_list.append(referenced_sop_instance_uid)
    finding_type_list.append(finding_type)
    finding_site_list.append(finding_sites)
    bbox_list.append(bbox)
    x0_list.append(x0); y0_list.append(y0)
    x1_list.append(x1); y1_list.append(y1)
    x2_list.append(x2); y2_list.append(y2)
    x3_list.append(x3); y3_list.append(y3)


  # Create dataframe
  df_sr['PatientID'] = [PatientID] * len(tracking_identifier_list)
  df_sr['StudyInstanceUID'] = [StudyInstanceUID] * len(tracking_identifier_list)
  df_sr['SeriesInstanceUID'] = [SeriesInstanceUID] * len(tracking_identifier_list)
  df_sr['ReferencedSeriesInstanceUID'] = [referenced_series_instance_uid] * len(tracking_identifier_list)
  df_sr['TrackingIdentifier'] = tracking_identifier_list
  df_sr['TrackingUID'] = tracking_uid_list
  df_sr['ReferencedSOPInstanceUID'] = referenced_sop_instance_uid_list
  df_sr['FindingType'] = finding_type_list
  df_sr['FindingSite'] = finding_site_list
  df_sr['x0'] = x0_list
  df_sr['y0'] = y0_list
  df_sr['x1'] = x1_list
  df_sr['y1'] = y1_list
  df_sr['x2'] = x2_list
  df_sr['y2'] = y2_list
  df_sr['x3'] = x3_list
  df_sr['y3'] = y3_list

  return df_sr

# Find the series that have both a bounding box and a segmentation

In [6]:
client_bq = bigquery.Client(project=project_name)

query = f"""
      WITH nlstseg AS (
        SELECT DISTINCT
          PatientID,
          StudyInstanceUID,
          SeriesInstanceUID,
          ReferencedSeriesSequence[SAFE_OFFSET(0)].SeriesInstanceUID AS ReferencedSeriesInstanceUID,
          series_gcs_url
        FROM
          `bigquery-public-data.idc_v23.dicom_all`
        WHERE
          analysis_result_id = 'NLSTSeg' AND
          Modality = 'SEG'
      )
      SELECT DISTINCT
        dicom_all.PatientID,
        dicom_all.StudyInstanceUID,
        dicom_all.SeriesInstanceUID AS NLSTSybil_SeriesInstanceUID,
        nlstseg.SeriesInstanceUID AS NLSTSeg_SeriesInstanceUID,
        nlstseg.ReferencedSeriesInstanceUID AS CT_SeriesInstanceUID,
        dicom_all.series_gcs_url AS NLSTSybil_series_gcs_url,
        nlstseg.series_gcs_url AS NLSTSeg_series_gcs_url
      FROM
        `bigquery-public-data.idc_v23.dicom_all` as dicom_all
      JOIN
        nlstseg
      ON
        nlstseg.ReferencedSeriesInstanceUID = dicom_all.CurrentRequestedProcedureEvidenceSequence[OFFSET(0)].ReferencedSeriesSequence[OFFSET(0)].SeriesInstanceUID
      WHERE
        analysis_result_id = 'NLST-Sybil' AND
        Modality = 'SR'
      ORDER BY
        dicom_all.PatientID,
        dicom_all.StudyInstanceUID
      """
df_overlap = client_bq.query(query).to_dataframe()

NLSTSeg_SeriesInstanceUID_list = list(df_overlap['NLSTSeg_SeriesInstanceUID'].values)
NLSTSybil_SeriesInstanceUID_list = list(df_overlap['NLSTSybil_SeriesInstanceUID'].values)
SeriesInstanceUIDs_list = list(df_overlap['CT_SeriesInstanceUID'].values)
print('Number of intersecting series: ' + str(len(SeriesInstanceUIDs_list)))

Number of intersecting series: 402


In [7]:
df_overlap.head()

Unnamed: 0,PatientID,StudyInstanceUID,NLSTSybil_SeriesInstanceUID,NLSTSeg_SeriesInstanceUID,CT_SeriesInstanceUID,NLSTSybil_series_gcs_url,NLSTSeg_series_gcs_url
0,100147,1.2.840.113654.2.55.31958452963320032523273261...,1.2.826.0.1.3680043.8.498.80137012981695130585...,1.2.276.0.7230010.3.1.3.481037312.9241.1761239...,1.2.840.113654.2.55.15708941008648745210499888...,gs://idc-open-data/c98be92a-3a5d-42f6-b633-83d...,gs://idc-open-data/1b99cce0-9836-460d-9a9a-4a4...
1,100158,1.2.840.113654.2.55.81185422866512279860334872...,1.2.826.0.1.3680043.8.498.39381921446583412683...,1.2.276.0.7230010.3.1.3.481037312.10206.176123...,1.2.840.113654.2.55.31060976780967844152296392...,gs://idc-open-data/445ea7e7-fc77-40e2-a293-413...,gs://idc-open-data/ac86b50d-429e-4b80-b63d-9f9...
2,100242,1.2.840.113654.2.55.22835224307907880875083018...,1.2.826.0.1.3680043.8.498.53651851111844704842...,1.2.276.0.7230010.3.1.3.481037312.11172.176123...,1.2.840.113654.2.55.38995485391900019876570761...,gs://idc-open-data/e51784f9-3b95-4cd3-8bed-f5a...,gs://idc-open-data/0e95deee-ab56-4ec3-bb62-b58...
3,100518,1.2.840.113654.2.55.36142141360017014809389003...,1.2.826.0.1.3680043.8.498.11954916967745944752...,1.2.276.0.7230010.3.1.3.481037312.13127.176123...,1.2.840.113654.2.55.81136962133262551156371928...,gs://idc-open-data/1411b24e-10fb-46cc-9951-b2f...,gs://idc-open-data/f06e3e38-e254-4405-9bad-9ef...
4,100570,1.2.840.113654.2.55.88862626250387223376057356...,1.2.826.0.1.3680043.8.498.60085925332680300418...,1.2.276.0.7230010.3.1.3.481037312.14098.176123...,1.2.840.113654.2.55.32380467633296345717423514...,gs://idc-open-data/723229c3-330d-40cf-8c33-b7b...,gs://idc-open-data/a65b4661-d206-4fc2-bc19-a75...


# Download the files

In [8]:
# First download the NLSTSeg SEG files
idc_client.download_from_selection(
    seriesInstanceUID=NLSTSeg_SeriesInstanceUID_list,
    downloadDir="/content/nlstseg/",
    dirTemplate="%SeriesInstanceUID",
    quiet=True
)

# Then download the NLSTSybil SR files
idc_client.download_from_selection(
    seriesInstanceUID=NLSTSybil_SeriesInstanceUID_list,
    downloadDir="/content/nlstsybil/",
    dirTemplate="%SeriesInstanceUID",
    quiet=True
)

Downloading data: 100%|██████████| 171M/171M [00:02<00:00, 78.3MB/s]
Downloading data: 100%|██████████| 8.29M/8.29M [00:02<00:00, 3.91MB/s]


In [9]:
# Rename the files according to the CT_SeriesInstanceUID
# Using the gcs_urls from df_overlap above

if not os.path.isdir("/content/nlstseg_renamed"):
  os.makedirs("/content/nlstseg_renamed", exist_ok=True)
if not os.path.isdir("/content/nlstsybil_renamed"):
  os.makedirs("/content/nlstsybil_renamed", exist_ok=True)

num_files = len(df_overlap)
for n in range(0,num_files):
  # NLSTSeg
  temp_dir = os.path.join("/content/nlstseg", df_overlap['NLSTSeg_SeriesInstanceUID'].values[n])
  temp_file = os.listdir(temp_dir)[0]
  src_filename = os.path.join("/content/nlstseg", df_overlap['NLSTSeg_SeriesInstanceUID'].values[n], temp_file)
  dst_filename = os.path.join("/content/nlstseg_renamed", df_overlap['CT_SeriesInstanceUID'].values[n] + ".dcm")
  shutil.copy2(src_filename, dst_filename)
  # NLSTSybil
  temp_dir = os.path.join("/content/nlstsybil", df_overlap['NLSTSybil_SeriesInstanceUID'].values[n])
  temp_file = os.listdir(temp_dir)[0]
  src_filename = os.path.join("/content/nlstsybil", df_overlap['NLSTSybil_SeriesInstanceUID'].values[n], temp_file)
  dst_filename = os.path.join("/content/nlstsybil_renamed", df_overlap['CT_SeriesInstanceUID'].values[n] + ".dcm")
  shutil.copy2(src_filename, dst_filename)

# Compute the overlap and save to a csv file

In [33]:
### Calculate the overlap on a per series basis ###

df_overlap_all = pd.DataFrame()
print('Num series: ' + str(len(SeriesInstanceUIDs_list)))

### Over each series ###
for index, series in enumerate(SeriesInstanceUIDs_list,1):

  print('*** On index: ' + str(index) + ' ***')

  # Get the SeriesInstanceUID of the seg and bbox
  SeriesInstanceUID_seg = df_overlap[df_overlap['CT_SeriesInstanceUID']==series]['NLSTSeg_SeriesInstanceUID']
  SeriesInstanceUID_bbox = df_overlap[df_overlap['CT_SeriesInstanceUID']==series]['NLSTSybil_SeriesInstanceUID']

  # Get the SEG file
  seg_filename = os.path.join("/content/nlstseg_renamed", series + ".dcm")
  # Get the SR file
  sr_filename = os.path.join("/content/nlstsybil_renamed", series + ".dcm")

  # Read the SEG file
  df_seg, mask_data = read_seg(seg_filename)
  # Read the SR file
  df_sr = read_sr_using_highdicom(sr_filename)

  # Keep only the ones that overlap
  df_seg_and_sr = pd.merge(df_seg, df_sr, left_on='SOPInstanceUID', right_on='ReferencedSOPInstanceUID', how='inner')

  # Calculate the overlap
  SOPInstanceUID_list = df_seg_and_sr['SOPInstanceUID'].values
  mask_slice_list = df_seg_and_sr['mask_slice'].values
  mask_list = []
  bbox_list = []
  num_of_slices_with_bbox_list = []

  for SOPInstanceUID,mask_slice in zip(SOPInstanceUID_list,mask_slice_list):
    df_temp = df_seg_and_sr[df_seg_and_sr['ReferencedSOPInstanceUID']==SOPInstanceUID]
    x0 = df_temp['x0'].values[0]; y0 = df_temp['y0'].values[0]
    x1 = df_temp['x1'].values[0]; y1 = df_temp['y1'].values[0]
    x2 = df_temp['x2'].values[0]; y2 = df_temp['y2'].values[0]
    x3 = df_temp['x3'].values[0]; y3 = df_temp['y3'].values[0]
    width = x1 - x0 # can change later
    height = y2 - y1 # can change later
    xmin = x0 # can change later
    ymin = y0 # can change later
    xmax = xmin + width
    ymax = ymin + height
    bbox = [np.floor(xmin).astype(np.int16),
            np.floor(ymin).astype(np.int16),
            np.ceil(xmax).astype(np.int16),
            np.ceil(ymax).astype(np.int16)]
    mask_list.append(mask_data[mask_slice,:,:])
    bbox_list.append(bbox)
    # num_of_slices_with_bbox_list.append(len(set(SOPInstanceUID_list)))
  # Calculate over the mask_list and bbox_list
  metric = tumor_fraction_in_boxes(mask_list, bbox_list)
  # We only keep unique series - as we want a metric per series instead of per SOPInstanceUID.
  # We need to group over SeriesInstanceUID as we want a list of the SegmentDescriptions
  df_seg_and_sr = (
    df_seg_and_sr.groupby(["PatientID", "StudyInstanceUID", "ReferencedSeriesInstanceUID"], as_index=False)
      .agg({"SegmentLabel": lambda x: sorted(x.unique().tolist())})
  )

  # Add to dataframe
  df_seg_and_sr['overlap_metric'] = metric
  # Add the number of slices that have seg
  # df_overlap['num_of_slices_with_seg'] = num_slices_in_seg
  df_seg_and_sr['num_of_slices_with_seg'] = len(df_seg)
  df_seg_and_sr['num_of_slices_with_seg']
  # Add the number of slices that have a bbox
  # df_overlap['num_of_slices_with_bbox'] = num_of_slices_with_bbox_list
  df_seg_and_sr['num_of_slices_with_bbox'] = len(set(SOPInstanceUID_list))
  # Add the number of slices that have a bounding box and tumor segmentation
  df_seg_and_sr['number_of_slices_with_bbox_and_seg'] = len(mask_list)
  # Add in the SeriesInstanceUID_seg
  # Add in the SeriesInstanceUID_bbox
  df_seg_and_sr['SeriesInstanceUID_seg'] = len(df_seg_and_sr) * [SeriesInstanceUID_seg]
  df_seg_and_sr['SeriesInstanceUID_bbox'] = len(df_seg_and_sr) * [SeriesInstanceUID_bbox]

  # Add to overall dataframe
  df_overlap_all = pd.concat([df_overlap_all, df_seg_and_sr], axis=0)



# Add the ohif url
viewer_url = ["https://viewer.imaging.datacommons.cancer.gov/v3/viewer/?StudyInstanceUIDs=" + f
              for f in df_overlap_all['StudyInstanceUID'].values]
df_overlap_all['viewer_url'] = viewer_url

# Add in the series specific viewer_url
series_viewer_url_list = ["https://viewer.imaging.datacommons.cancer.gov/v3/viewer/?StudyInstanceUIDs=" +
                          str(study) + "&SeriesInstanceUIDs=" + str(ref_series) + ',' + str(seg_series) + ',' + str(bbox_series)
                          for study,ref_series,seg_series,bbox_series in
                          zip(df_overlap_all['StudyInstanceUID'].values,
                              df_overlap_all['ReferencedSeriesInstanceUID'].values,
                              df_overlap_all['SeriesInstanceUID_seg'].values,
                              df_overlap_all['SeriesInstanceUID_bbox'].values)]
df_overlap_all['series_viewer_url'] = series_viewer_url_list

# Keep specific columns
df_overlap_all = df_overlap_all[['PatientID', 'StudyInstanceUID', 'ReferencedSeriesInstanceUID', 'SegmentLabel',
                                 'num_of_slices_with_bbox', 'num_of_slices_with_seg',
                                 'overlap_metric', 'number_of_slices_with_bbox_and_seg',
                                 'SeriesInstanceUID_seg', 'SeriesInstanceUID_bbox',
                                 'viewer_url', 'series_viewer_url']]
df_overlap_all = df_overlap_all.sort_values(by=['PatientID', 'StudyInstanceUID', 'ReferencedSeriesInstanceUID'])

# Save
df_overlap_all.to_csv("/content/sybil_and_nlstseg_overlap_per_series.csv")


Num series: 402
*** On index: 1 ***
num_slices_in_seg: 8
*** On index: 2 ***
num_slices_in_seg: 7
*** On index: 3 ***
num_slices_in_seg: 3
*** On index: 4 ***
num_slices_in_seg: 17
*** On index: 5 ***
num_slices_in_seg: 10
*** On index: 6 ***
num_slices_in_seg: 9
*** On index: 7 ***
num_slices_in_seg: 7
*** On index: 8 ***
num_slices_in_seg: 5
*** On index: 9 ***
num_slices_in_seg: 5
*** On index: 10 ***
num_slices_in_seg: 5
*** On index: 11 ***
num_slices_in_seg: 7
*** On index: 12 ***
num_slices_in_seg: 11
*** On index: 13 ***
num_slices_in_seg: 12
*** On index: 14 ***
num_slices_in_seg: 13
*** On index: 15 ***
num_slices_in_seg: 4
*** On index: 16 ***
num_slices_in_seg: 10
*** On index: 17 ***
num_slices_in_seg: 6
*** On index: 18 ***
num_slices_in_seg: 10
*** On index: 19 ***
num_slices_in_seg: 3
*** On index: 20 ***
num_slices_in_seg: 8
*** On index: 21 ***
num_slices_in_seg: 3
*** On index: 22 ***
num_slices_in_seg: 6
*** On index: 23 ***
num_slices_in_seg: 9
*** On index: 24 ***

In [34]:
df_overlap_all.head()

Unnamed: 0,PatientID,StudyInstanceUID,ReferencedSeriesInstanceUID,SegmentLabel,num_of_slices_with_bbox,num_of_slices_with_seg,overlap_metric,number_of_slices_with_bbox_and_seg,SeriesInstanceUID_seg,SeriesInstanceUID_bbox,viewer_url,series_viewer_url
0,100147,1.2.840.113654.2.55.31958452963320032523273261...,1.2.840.113654.2.55.15708941008648745210499888...,[Lesion 1 - Tumor - Upper lobe of right lung],8,8,0.949978,8,0 1.2.276.0.7230010.3.1.3.481037312.9241.17...,0 1.2.826.0.1.3680043.8.498.801370129816951...,https://viewer.imaging.datacommons.cancer.gov/...,https://viewer.imaging.datacommons.cancer.gov/...
0,100158,1.2.840.113654.2.55.81185422866512279860334872...,1.2.840.113654.2.55.31060976780967844152296392...,[Lesion 1 - Tumor - Upper lobe of right lung],7,7,1.0,7,1 1.2.276.0.7230010.3.1.3.481037312.10206.1...,1 1.2.826.0.1.3680043.8.498.393819214465834...,https://viewer.imaging.datacommons.cancer.gov/...,https://viewer.imaging.datacommons.cancer.gov/...
0,100242,1.2.840.113654.2.55.22835224307907880875083018...,1.2.840.113654.2.55.38995485391900019876570761...,[Lesion 1 - Tumor - Upper lobe of left lung],3,3,1.0,3,2 1.2.276.0.7230010.3.1.3.481037312.11172.1...,2 1.2.826.0.1.3680043.8.498.536518511118447...,https://viewer.imaging.datacommons.cancer.gov/...,https://viewer.imaging.datacommons.cancer.gov/...
0,100570,1.2.840.113654.2.55.88862626250387223376057356...,1.2.840.113654.2.55.32380467633296345717423514...,[Lesion 1 - Tumor - Upper lobe of left lung],7,10,0.98503,7,4 1.2.276.0.7230010.3.1.3.481037312.14098.1...,4 1.2.826.0.1.3680043.8.498.600859253326803...,https://viewer.imaging.datacommons.cancer.gov/...,https://viewer.imaging.datacommons.cancer.gov/...
0,100658,1.2.840.113654.2.55.17917436607731247122565007...,1.2.840.113654.2.55.26482353810847540114117846...,[Lesion 1 - Tumor - Lower lobe of left lung],5,9,1.0,5,5 1.2.276.0.7230010.3.1.3.481037312.15064.1...,5 1.2.826.0.1.3680043.8.498.639873790739774...,https://viewer.imaging.datacommons.cancer.gov/...,https://viewer.imaging.datacommons.cancer.gov/...
