# Local image processing of raw images from Kaggle

This notebook processes the complete dataset from Kaggle. It reduces the number of images (cuts) in each CT scan to the same fixed number for all scans. (The cuts that are kept are selected proportionally.)

CTs with fewer cuts than the desired amount are skipped.

The reduced dataset is saved in a new folder (defined by us), along with a new labels file from which the skipped CTs have also been removed.

In [1]:
import os

import pandas as pd

%load_ext autoreload
%autoreload 2
from kaggle.kaggle_local_image_processing import get_images_information, copy_n_images_proportionally, get_reduced_df

In [2]:
# Root folder that holds the Kaggle download. It can be overridden through the
# ABD_TRAUMA_DATA_DIR environment variable so the notebook is not tied to one
# machine; when the variable is unset or empty the original location is used.
data_dir = os.environ.get("ABD_TRAUMA_DATA_DIR") or "/home/filiptkac/Downloads"

# CSV file with the labels of the original dataset.
labels_path = f"{data_dir}/train.csv"
# Folder with the original CT images.
images_path = f"{data_dir}/abd-trauma-dataset"
# Destination folder for the reduced dataset.
images_path_reduced = f"{data_dir}/abd-trauma-dataset-reduced"

In [3]:
# Number of images every CT scan is reduced to; scans with fewer images than
# this are reported below and skipped later on.
minimum_images = 100

# Inspect the original dataset: smallest / largest image count found in a CT
# and the collection of CTs that have fewer than `minimum_images` images.
images_info = get_images_information(images_path, minimum_images)
min_images, max_images, patients_too_few_images = images_info

print(
    f"minimum number of images in CT: {min_images}, "
    f"maximum number of images in CT: {max_images}"
)
print(
    f"number of CT scans with less than {minimum_images} images: "
    f"{len(patients_too_few_images)}"
)

100%|██████████| 3147/3147 [00:03<00:00, 789.97it/s]

minimum number of images in CT: 46, maximum number of images in CT: 1727
number of CT scans with less than 100 images: 261





In [137]:
# Build the reduced dataset in `images_path_reduced`. As described in the
# notebook intro, every kept CT is cut down to `minimum_images` images and the
# CTs listed in `patients_too_few_images` are skipped.
copy_n_images_proportionally(
    images_path,
    images_path_reduced,
    minimum_images,
    patients_too_few_images,
)

100%|██████████| 3147/3147 [02:33<00:00, 20.48it/s]


In [4]:
# Sanity check: the reduced folder should contain one entry for every entry of
# the original folder, minus the CTs that were skipped for having too few images.
original_patients = os.listdir(images_path)
reduced_patients = os.listdir(images_path_reduced)

reduced_count = len(reduced_patients)
expected_reduced_count = len(original_patients) - len(patients_too_few_images)

print(f"reduced patients amount: {reduced_count}")
print(f"expected reduced patients amount: {expected_reduced_count}")

reduced patients amount: 2886
expected reduced patients amount: 2886


In [151]:
# Load the labels of the complete (not yet reduced) dataset and preview the
# first five rows (the default of `head`).
df = pd.read_csv(labels_path)
df.head()

Unnamed: 0,patient_id,bowel_healthy,bowel_injury,extravasation_healthy,extravasation_injury,kidney_healthy,kidney_low,kidney_high,liver_healthy,liver_low,liver_high,spleen_healthy,spleen_low,spleen_high,any_injury
0,10004,1,0,0,1,0,1,0,1,0,0,0,0,1,1
1,10005,1,0,1,0,1,0,0,1,0,0,1,0,0,0
2,10007,1,0,1,0,1,0,0,1,0,0,1,0,0,0
3,10026,1,0,1,0,1,0,0,1,0,0,1,0,0,0
4,10051,1,0,1,0,1,0,0,1,0,0,0,1,0,1


In [153]:
# Remove the skipped CTs from the labels; the row counts before and after are
# printed so they can be compared with the folder counts.
print(df.shape[0])
reduced_df = get_reduced_df(df, patients_too_few_images)
print(reduced_df.shape[0])

# Store the reduced labels next to the original labels file. Deriving the
# location from `labels_path` keeps a single source of truth for the data
# folder instead of a second hard-coded absolute path.
# NOTE(review): with its default arguments `to_csv` also writes the DataFrame
# index as an unnamed first column - confirm that consumers of this file expect it.
labels_path_reduced = os.path.join(os.path.dirname(labels_path), "train_reduced.csv")
reduced_df.to_csv(labels_path_reduced)

3147
2886


In [155]:
reduced_patient_ids = reduced_df['patient_id'].values

# Every patient that is still present in the reduced labels must have a
# matching entry in the reduced image folder; report the ones that do not.
for patient_id in reduced_patient_ids:
    patient_dir = f"{images_path_reduced}/{patient_id}"
    if os.path.exists(patient_dir):
        continue
    print(f"{patient_dir} does not exist!!")

print("folder analysis complete")

folder analysis complete


In [160]:
# Label values of the first row of the reduced labels: every column except
# the patient id.
first_row = reduced_df.iloc[0]
labels = first_row.drop('patient_id')
print(labels.values)

[1 0 0 1 0 1 0 1 0 0 0 0 1 1]
