# Local image processing of raw images from kaggle

This notebook processes the complete dataset from kaggle. It reduces the total amount of samples in one scan to a uniform amount for all the scans. (The cuts are selected in proportion)

CTs with fewer cuts than the desired amount are skipped.

The reduced dataset is saved in a new folder (defined by us) along with the new labels which also have the skipped CTs removed.

In [2]:
import os

import pandas as pd

%load_ext autoreload
%autoreload 2
from kaggle.kaggle_local_image_processing import get_images_information, copy_n_images_proportionally, get_reduced_df

In [ ]:
labels_path = "/home/filiptkac/Downloads/train.csv"
images_path = "/home/filiptkac/Downloads/abd-trauma-dataset"
images_path_reduced = "/home/filiptkac/Downloads/abd-trauma-dataset-reduced-64"
labels_path_reduced = "/home/filiptkac/Downloads/train_reduced_64.csv"

In [ ]:
minimum_images = 64
min_images, max_images, patients_too_few_images = get_images_information(images_path, minimum_images)

print(f"minimum number of images in CT: {min_images}, maximum number of images in CT: {max_images}")
print(f"number of CT scans with less than {minimum_images} images: {len(patients_too_few_images)}")

In [ ]:
copy_n_images_proportionally(images_path, images_path_reduced, minimum_images, patients_too_few_images)

In [ ]:
original_patients = os.listdir(images_path)
reduced_patients = os.listdir(images_path_reduced)
print(f"reduced patients amount: {len(reduced_patients)}")
print(f"expected reduced patients amount: {len(original_patients) - len(patients_too_few_images)}")

In [ ]:
df = pd.read_csv(labels_path)
df.head(5)

In [ ]:
print(df.shape[0])
reduced_df = get_reduced_df(df, patients_too_few_images)
print(reduced_df.shape[0])
reduced_df.to_csv(labels_path_reduced)

In [ ]:
reduced_patient_ids = reduced_df['patient_id'].values

# make sure there are existing scans for all the patients in the new dataframe
for pid in reduced_patient_ids:
    if not os.path.exists(f"{images_path_reduced}/{pid}"):
        print(f"{images_path_reduced}/{pid} does not exist!!")

print("folder analysis complete")

In [9]:
labels = reduced_df.iloc[0].drop('patient_id')
print(labels.values)

[1 0 0 1 0 1 0 1 0 0 0 0 1 1]


In [10]:
print(df.head(-1))

columns_to_drop = ['Unnamed: 0','any_injury', 'spleen_high', 'spleen_low', 'liver_low', 'liver_high', 'kidney_high', 'kidney_low', 'bowel_injury', 'extravasation_injury']

df.drop(columns=columns_to_drop, inplace=True)
df.head(-1)
df = df.rename(columns={'bowel_healthy': 'bowel',
                        'extravasation_healthy': 'extravasation',
                        'kidney_healthy': 'kidney',
                        'liver_healthy': 'liver',
                        'spleen_healthy': 'spleen'})

print(df.columns.values)

df.to_csv('reduced_labels.csv', index=False)

      patient_id  bowel_healthy  bowel_injury  extravasation_healthy  \
0          10004              1             0                      0   
1          10005              1             0                      1   
2          10007              1             0                      1   
3          10026              1             0                      1   
4          10051              1             0                      1   
...          ...            ...           ...                    ...   
3141        9860              1             0                      1   
3142        9951              1             0                      1   
3143        9960              1             0                      1   
3144        9961              1             0                      1   
3145        9980              1             0                      1   

      extravasation_injury  kidney_healthy  kidney_low  kidney_high  \
0                        1               0           1          

KeyError: "['Unnamed: 0'] not found in axis"