<a href="https://colab.research.google.com/github/alecseiterr/pleural_effusion/blob/main/Dmitrii_Utkin/dataset_features_and_labels_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Copy and run locally due to large dataset size. All imported packages must be installed.


In [1]:
import SimpleITK as sitk
import numpy as np
import pandas as pd
import glob as glob
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Dataset locations on the author's machine. These absolute paths are machine-specific
# and must be edited before the notebook is run anywhere else.
# The trailing comment on the first two lines shows an example entry below that root.
labels_path = "/Users/dutking/LOCAL/AI_uni/radlogix/dataset/effusions_052023"  # /LUNG1-001/LUNG1-001_effusion_first_reviewer.nii.gz
features_path = "/Users/dutking/LOCAL/AI_uni/radlogix/dataset/features"  # /LUNG1-001/09-18-2008-StudyID-NA-69331/0.000000-NA-82046
csv_path = "/Users/dutking/LOCAL/AI_uni/radlogix/_documents/Thoracic and Pleural Effusion Segmentations April 2020.csv"

In [3]:
# Load the dataset description table and key it by patient so that it can be
# joined on index with the per-patient reports built further down.
df = pd.read_csv(csv_path).set_index("PatientID")

# Gathering features information


Creating dataframe to check amount of slices.


In [4]:
# Empty frame indexed by PatientID; one "Feature.Slices" value per patient is
# filled in by the scan loop in the next cell.
features_df = pd.DataFrame(columns=["PatientID", "Feature.Slices"]).set_index(
    "PatientID"
)

Filling dataframe with actual data.


In [5]:
features_folders = sorted(glob.glob(f"{features_path}/*"))

for folder in features_folders:
    # The patient folder name is the patient id (e.g. ".../features/LUNG1-001").
    # Named `patient_id` rather than `id` so the builtin `id` is not shadowed
    # for the rest of the notebook.
    patient_id = folder.split("/")[-1]

    # Layout is <patient>/<study>/<series>/ -- descend two levels, taking the
    # first entry at each level. Sorting makes the pick deterministic, matching
    # how label files are chosen in collect_labels_data.
    study_folder = sorted(glob.glob(f"{folder}/*"))[0]
    dicom_folder = sorted(glob.glob(f"{study_folder}/*"))[0]

    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(dicom_folder)
    reader.SetFileNames(dicom_names)
    image = reader.Execute()

    # Third component of the image size is used as the slice count
    # (it is compared against the csv "Dim.z" column below).
    slices = image.GetSize()[2]
    features_df.loc[patient_id, "Feature.Slices"] = int(slices)

In [6]:
# Left join: keep every patient listed in the csv and attach the slice count
# measured from the DICOM series (missing where no series was read).
csv_slice_counts = df.loc[:, "Dim.z"]
measured_slice_counts = features_df.astype(np.int16)

features_merged_df = pd.merge(
    left=csv_slice_counts,
    right=measured_slice_counts,
    how="left",
    left_index=True,
    right_index=True,
)
features_merged_df.head(5)

Unnamed: 0_level_0,Dim.z,Feature.Slices
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
LUNG1-001,134.0,134.0
LUNG1-002,111.0,111.0
LUNG1-003,,
LUNG1-004,114.0,114.0
LUNG1-005,91.0,91.0


Check for differences in information in csv and actual files.


In [7]:
# Strict all-or-nothing comparison of the measured slice counts with the csv values.
measured_column = features_merged_df["Feature.Slices"]
csv_column = features_merged_df["Dim.z"]
measured_column.equals(csv_column)

False

In [8]:
# Rows where the two counts differ. NaN never compares equal, so rows with a
# missing value on either side are listed as well.
slice_count_differs = features_merged_df["Dim.z"].ne(
    features_merged_df["Feature.Slices"]
)
features_merged_df[slice_count_differs]

Unnamed: 0_level_0,Dim.z,Feature.Slices
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
LUNG1-003,,
LUNG1-014,,
LUNG1-021,,
LUNG1-031,,
LUNG1-058,,
LUNG1-061,,
LUNG1-069,,
LUNG1-074,,
LUNG1-083,113.0,
LUNG1-085,,


Checking if amount of slices in csv and actual is equal.


In [9]:
# Restrict the comparison to patients that have both a csv entry and a measured
# series, then compare the two columns as integers.
both_counts_present_df = features_merged_df.dropna(subset=["Feature.Slices", "Dim.z"])
clean_features_merged_df = both_counts_present_df.astype(np.int16)

clean_features_merged_df["Feature.Slices"].equals(clean_features_merged_df["Dim.z"])

True

Marking features as valid if both csv data and actual file exist.


In [10]:
# A feature is valid (1) only when the patient has both a csv slice count and a
# measured one; otherwise the flag is left as NaN.
has_csv_count = features_merged_df["Dim.z"].notna()
has_measured_count = features_merged_df["Feature.Slices"].notna()
features_merged_df["Valid.Feature"] = np.where(
    has_csv_count & has_measured_count, 1, np.nan
)

# "Dim.z" is dropped here; the original csv frame `df` (which still has it) is
# merged back in when the final report is built.
features_merged_df = features_merged_df.drop(columns=["Dim.z"])
features_merged_df.head(15)

Unnamed: 0_level_0,Feature.Slices,Valid.Feature
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1
LUNG1-001,134.0,1.0
LUNG1-002,111.0,1.0
LUNG1-003,,
LUNG1-004,114.0,1.0
LUNG1-005,91.0,1.0
LUNG1-006,114.0,1.0
LUNG1-007,129.0,1.0
LUNG1-008,114.0,1.0
LUNG1-009,105.0,1.0
LUNG1-010,91.0,1.0


# Gathering labels information


## Creating dataframe for labels


List of previously excluded labels based on radiologist's reports.


Shrinking dataframe to rows with existing effusion events.


In [11]:
# Keep only the patients whose csv row records an effusion event.
has_effusion_event = df["Effusion.Event"] == 1.0
labels_df = df.loc[has_effusion_event]
labels_df.head(5)

Unnamed: 0_level_0,Carcinoma.Laterality,GTV1,GTV2,GTV3,GTV4,GTV5,GTV6,Tumor.Location,Effusion.Event,Primary.Effusion.Reviewer,...,RO1-RO3.Thorax.DSC,Rad1-Rad3.Thorax.DSC,Rad1-Rad2.Thorax.DSC,RO2-Rad2.Thorax.DSC,Dim.x,Dim.y,Dim.z,Voxel.Space.x,Voxel.Space.y,Voxel.space.z
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LUNG1-001,L,139.06,,,,,,3.0,1.0,Rad4,...,,,,,512.0,512.0,134.0,0.976563,0.976563,3.0
LUNG1-002,R,340.3,,,,,,3.0,1.0,Rad4,...,,,,,512.0,512.0,111.0,0.977,0.977,3.0
LUNG1-005,R,78.62,,,,,,1.0,1.0,Rad4,...,,,,,512.0,512.0,91.0,0.977,0.977,3.0
LUNG1-008,R,37.48,,,,,,3.0,1.0,Rad4,...,,,,,512.0,512.0,114.0,0.977,0.977,3.0
LUNG1-013,L,13.25,,,,,,1.0,1.0,Rad4,...,,,,,512.0,512.0,134.0,0.976563,0.976563,3.0


## Helper functions


Converting of image to array of slices.


In [12]:
def get_image_array(path):
    """
    Read a NIfTI image from disk and convert it to an array.

    args:
        path: str, location of the NIfTI file
    return:
        array produced by sitk.GetArrayFromImage for the image that was read
    """
    return sitk.GetArrayFromImage(sitk.ReadImage(path, imageIO="NiftiImageIO"))

Getting amount of nonzero voxels in every slice.


In [13]:
def get_values_by_slices(arr):
    """
    Count the non-zero voxels in every slice of a volume.

    args:
        arr: array whose first axis enumerates the slices
    return:
        1D integer ndarray, one non-zero voxel count per slice
    """
    volume = np.asarray(arr)
    # Reduce over every axis except the first, so each slice collapses to a
    # single count. Any non-zero value (positive or negative) counts once.
    within_slice_axes = tuple(range(1, volume.ndim))
    return np.count_nonzero(volume, axis=within_slice_axes)

Counting gaps:

1. Setting image boundaries
2. Searching between the boundaries for slices whose voxel count does not exceed the threshold (0s included)


In [14]:
def get_gaps(arr, threshold):
    """
    Find runs of (almost) empty slices enclosed by the labelled region.

    The labelled region runs from the first to the last slice with a non-zero
    voxel count. Every maximal run of consecutive slices strictly inside that
    region whose count is <= threshold is reported as one gap.

    args:
        arr: 1D array of numbers of voxels in the slice
        threshold: int, maximum amount of voxels treated as gap
    return:
        array: [ndarray] -- one array of 1-based slice numbers per gap;
        empty list if there is no gap or arr has no non-zero entry
    """
    nonzero_positions = np.flatnonzero(arr)
    if nonzero_positions.size == 0:
        # Nothing is labelled, so there is no region that could contain a gap.
        return []

    first = int(nonzero_positions[0])
    last = int(nonzero_positions[-1])

    gaps = []
    current_gap = []

    # Only slices strictly between the two boundary slices can belong to a gap.
    for idx in range(first + 1, last):
        if arr[idx] <= threshold:
            current_gap.append(idx + 1)  # slice numbers are reported 1-based
        elif current_gap:
            gaps.append(np.array(current_gap))
            current_gap = []

    # A gap that runs right up to the last labelled slice is still open here
    # and has to be flushed, otherwise it would be silently lost.
    if current_gap:
        gaps.append(np.array(current_gap))

    return gaps

Counting outliers:

1. Setting image boundaries
2. Counting consecutive non-empty slices
3. Comparing the number of consecutive non-empty slices with the threshold


In [15]:
def get_outliers(arr, threshold):
    """
    Find short, isolated runs of non-empty slices.

    Every maximal run of consecutive non-zero entries whose length is
    <= threshold is reported as one outlier.

    args:
        arr: 1D array
        threshold: int, maximum amount of slices treated as outliers
    return:
        array: [ndarray] -- one array of 1-based slice numbers per outlier;
        empty list if there is none or arr has no non-zero entry
    """
    outliers = []
    current_run = []

    def _flush():
        # Record the run collected so far if it is short enough to be an outlier.
        if 0 < len(current_run) <= threshold:
            outliers.append(np.array(current_run))

    # Zeros before the first / after the last non-zero entry never open a run,
    # so no explicit boundary computation is needed -- and an all-zero array
    # simply yields no outliers instead of raising.
    for idx, item in enumerate(arr):
        if item != 0:
            current_run.append(idx + 1)  # slice numbers are reported 1-based
        else:
            _flush()
            current_run = []

    # The array may end while a run is still open.
    _flush()

    return outliers

Getting data about gaps and outliers, amount of slices.


In [16]:
def is_empty(image_array):
    """
    Tell whether an image array contains no non-zero voxel at all.

    args:
        image_array: array of voxel values
    return:
        bool: True if every element is zero, False otherwise
    """
    # np.any rather than a sum: a sum of signed values can cancel out to 0 even
    # though non-zero voxels are present.
    return not np.any(image_array)

In [17]:
def check_label(path, gap_threshold=5, outlier_threshold=3):
    """
    Analyse one label file for gaps and outliers.

    args:
        path: str, location of the label image
        gap_threshold: int, maximum amount of voxels in a slice for it to be
            treated as part of a gap (passed to get_gaps)
        outlier_threshold: int, maximum length of a run of non-empty slices
            for it to be treated as an outlier (passed to get_outliers)
    return:
        dict with keys
            "gaps": result of get_gaps
            "outliers": result of get_outliers
            "amount_of_slices": size of the first axis of the image array
    """
    image_array = get_image_array(path)
    values_by_slices = get_values_by_slices(image_array)
    return {
        "gaps": get_gaps(values_by_slices, gap_threshold),
        "outliers": get_outliers(values_by_slices, outlier_threshold),
        "amount_of_slices": image_array.shape[0],
    }

Define if label is suspicious based on presence of gaps or outliers.


In [18]:
def is_suspicious(label):
    """
    Tell whether a label report contains any finding.

    args:
        label: dict with "gaps" and "outliers" entries (as built by check_label)
    return:
        bool: True if at least one gap or one outlier is listed
    """
    return any(len(label[finding]) > 0 for finding in ("gaps", "outliers"))

Getting data about current file:

1. reviewer
2. patient id


In [19]:
def get_patien_id_form_path_data(path):
    """
    Extract the patient id from the path of a patient folder.

    The id is the last component of the path, so the result does not depend on
    how deep the dataset root sits in the file system.

    args:
        path: str, "/"-separated path of the patient folder
            (e.g. ".../effusions_052023/LUNG1-001"), not of a file inside it
    return:
        str: name of the patient folder
    """
    # A trailing separator would otherwise produce an empty last component.
    patient_id = path.rstrip("/").split("/")[-1]
    return patient_id

Building report on amount of label slices, gaps and outliers:

1. Creating report dataframe
2. Checking label
3. Adding row to report


In [21]:
def collect_labels_data(path):
    """
    Build a per-patient report on the label files found under `path`.

    Each sub-folder of `path` is treated as one patient; the first file
    (alphabetically) inside it is analysed with check_label. Patients whose
    folder holds no file, or whose label is completely empty, get no row.

    args:
        path: str, folder that contains one sub-folder per patient
    return:
        DataFrame indexed by patient id ("PatientID") with columns
            "Valid.Label": 1 if no gaps/outliers were found, 0 otherwise
            "Label.Slices": amount of slices in the label
            "Label.Gaps": gaps reported by check_label (only set if any)
            "Label.Outliers": outliers reported by check_label (only set if any)
    """
    report_df = pd.DataFrame(
        columns=[
            "Valid.Label",
            "Label.Slices",
            "Label.Gaps",
            "Label.Outliers",
        ]
    )

    for folder in sorted(glob.glob(f"{path}/*")):
        files = sorted(glob.glob(f"{folder}/*"))
        if not files:
            # No label file for this patient: nothing to report.
            continue

        file = files[0]
        if is_empty(get_image_array(file)):
            continue

        patient_id = get_patien_id_form_path_data(folder)
        label_data = check_label(file)

        # Assigning to a not-yet-existing row label creates the row.
        report_df.loc[patient_id, "Label.Slices"] = int(label_data["amount_of_slices"])

        if is_suspicious(label_data):
            report_df.loc[patient_id, "Valid.Label"] = 0
            if len(label_data["gaps"]) > 0:
                report_df.loc[patient_id, "Label.Gaps"] = label_data["gaps"]

            if len(label_data["outliers"]) > 0:
                report_df.loc[patient_id, "Label.Outliers"] = label_data["outliers"]
        else:
            report_df.loc[patient_id, "Valid.Label"] = 1

    # Name the index after the rows have been added, so the report is keyed the
    # same way as the other PatientID-indexed frames it is merged with.
    report_df.index.name = "PatientID"
    return report_df

## Building final report


Building report on labels.


In [22]:
# Analyse every patient folder found under labels_path.
labels_report = collect_labels_data(labels_path)

In [23]:
# Per-column summary of the label report.
labels_report.describe()

Unnamed: 0,Valid.Label,Label.Slices,Label.Gaps,Label.Outliers
count,77,77,20,7
unique,2,37,20,7
top,1,134,"[[84, 85, 86, 87], [89]]","[[86], [88]]"
freq,57,23,1,1


Combining dataframes on features and labels.


In [24]:
# Left join on the patient index: every patient from the features report is kept,
# label columns stay empty where no label report row exists.
features_labels_df = features_merged_df.merge(
    labels_report,
    how="left",
    left_index=True,
    right_index=True,
)
features_labels_df.head(5)

Unnamed: 0_level_0,Feature.Slices,Valid.Feature,Valid.Label,Label.Slices,Label.Gaps,Label.Outliers
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LUNG1-001,134.0,1.0,0.0,134.0,"[[84, 85, 86, 87], [89]]","[[86], [88]]"
LUNG1-002,111.0,1.0,1.0,111.0,,
LUNG1-003,,,,,,
LUNG1-004,114.0,1.0,,,,
LUNG1-005,91.0,1.0,0.0,91.0,"[[26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, ...",


In [25]:
# Patients without a measured CT series are removed from the combined report.
features_labels_df = features_labels_df.dropna(axis=0, subset=["Feature.Slices"])

# Export only the labels that were flagged as suspicious (Valid.Label == 0).
flagged_labels_df = features_labels_df.loc[features_labels_df["Valid.Label"] == 0]
flagged_labels_df.reset_index().to_csv(
    "./_docs/features_and_labels_report_UPDATED_LABELS.csv", index=False
)

Checking consistency along amount of slices.


In [26]:
# Only rows that carry both slice counts can be compared.
slice_count_columns = ["Feature.Slices", "Label.Slices"]
clean_features_labels_df = features_labels_df.dropna(subset=slice_count_columns)

clean_features_labels_df.head(5)

Unnamed: 0_level_0,Feature.Slices,Valid.Feature,Valid.Label,Label.Slices,Label.Gaps,Label.Outliers
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LUNG1-001,134.0,1.0,0,134,"[[84, 85, 86, 87], [89]]","[[86], [88]]"
LUNG1-002,111.0,1.0,1,111,,
LUNG1-005,91.0,1.0,0,91,"[[26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, ...",
LUNG1-008,114.0,1.0,1,114,,
LUNG1-013,134.0,1.0,0,134,"[[79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, ...","[[77, 78], [107], [109, 110]]"


Comparing slices for features and first review.


In [27]:
# Cast both columns to the same integer type so that only the values are compared.
feature_slice_counts = clean_features_labels_df["Feature.Slices"].astype(np.int16)
label_slice_counts = clean_features_labels_df["Label.Slices"].astype(np.int16)
feature_slice_counts.equals(label_slice_counts)

True

Merging report with initial dataset csv file.


In [33]:
# Attach the feature/label report columns to the original csv rows (left join on
# the patient index, so every csv row is kept).
final_df = df.merge(
    features_labels_df,
    how="left",
    left_index=True,
    right_index=True,
)

final_df.head(10)

Unnamed: 0_level_0,Carcinoma.Laterality,GTV1,GTV2,GTV3,GTV4,GTV5,GTV6,Tumor.Location,Effusion.Event,Primary.Effusion.Reviewer,...,Dim.z,Voxel.Space.x,Voxel.Space.y,Voxel.space.z,Feature.Slices,Valid.Feature,Valid.Label,Label.Slices,Label.Gaps,Label.Outliers
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LUNG1-001,L,139.06,,,,,,3.0,1.0,Rad4,...,134.0,0.976563,0.976563,3.0,134.0,1.0,0.0,134.0,"[[84, 85, 86, 87], [89]]","[[86], [88]]"
LUNG1-002,R,340.3,,,,,,3.0,1.0,Rad4,...,111.0,0.977,0.977,3.0,111.0,1.0,1.0,111.0,,
LUNG1-003,,,,,,,,,,,...,,,,,,,,,,
LUNG1-004,L,86.5,7.06,70.38,,,,3.0,0.0,,...,114.0,0.976563,0.976563,3.0,114.0,1.0,,,,
LUNG1-005,R,78.62,,,,,,1.0,1.0,Rad4,...,91.0,0.977,0.977,3.0,91.0,1.0,0.0,91.0,"[[26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, ...",
LUNG1-006,L,75.9,,,,,,1.0,0.0,,...,114.0,0.977,0.977,3.0,114.0,1.0,,,,
LUNG1-007,R,9.62,15.38,,,,,1.0,0.0,,...,129.0,0.976563,0.976563,3.0,129.0,1.0,,,,
LUNG1-008,R,37.48,,,,,,3.0,1.0,Rad4,...,114.0,0.977,0.977,3.0,114.0,1.0,1.0,114.0,,
LUNG1-009,R,91.85,32.48,37.24,83.87,,,3.0,0.0,,...,105.0,0.977,0.977,3.0,105.0,1.0,,,,
LUNG1-010,R,15.5,5.72,,,,,1.0,0.0,,...,91.0,0.977,0.977,3.0,91.0,1.0,,,,


Saving clean version to build dataset.


In [34]:
# Row filters:
#   * the CT series was found and matches the csv (Valid.Feature is 1),
#   * the label was not flagged as suspicious -- rows with a missing Valid.Label
#     also pass the `!= 0` test and are kept,
#   * the slice count lies within [88, 136].
has_valid_feature = final_df["Valid.Feature"] > 0
label_not_flagged = final_df["Valid.Label"] != 0
feature_slices = final_df["Feature.Slices"]
slices_out_of_range = (feature_slices > 136) | (feature_slices < 88)

# .copy() gives clean_df its own data, so the assignment below modifies clean_df
# only and does not raise SettingWithCopyWarning.
clean_df = final_df.loc[
    has_valid_feature & label_not_flagged & ~slices_out_of_range
].copy()

# correction from radlogix radiologist
# Guarded: assigning to a missing row label would append a new, otherwise empty row.
if "LUNG1-170" in clean_df.index:
    clean_df.loc["LUNG1-170", "Effusion.Event"] = 0.0

clean_df = clean_df.reset_index()
clean_df.to_csv("./_docs/clean_df_on_latest_ds.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df.drop(


In [44]:
# Number of clean rows whose csv slice count lies strictly between 100 and 140.
dim_z = clean_df["Dim.z"]
dim_z_in_range = (dim_z < 140) & (dim_z > 100)
print(len(clean_df.loc[dim_z_in_range]))

290


Saving to csv.


In [47]:
# Turn the PatientID index back into a regular column, then write the full report.
final_df = final_df.reset_index()
final_df.to_csv("./_docs/dataset_features_and_labels_report_UPDATED.csv", index=False)