### Data Project 4 | Parte I: Data Extraction
# Clasificación de imágenes de Rayos X
Grupo Los GermaÑoles: Maria, Franzi y Nacho

## Libraries

In [None]:
!pip3 install pydicom

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydicom
  Downloading pydicom-2.3.1-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.3.1


In [None]:
# Data Access and Management
from google.colab import drive
import zipfile
import os
import glob
import shutil

# Image Processing
from PIL import Image
import pydicom
import cv2

# Data Manipulation and Analysis
import pandas as pd
import numpy as np
import random

# Data Visualization
from matplotlib import pyplot as plt
import seaborn as sns

print('Libraries successfully installed.')

Libraries successfully installed.


## Load data from Kaggle
We use the Kaggle API token method to download data from a Kaggle competition in Google Colab:

1. Go to your Kaggle account settings page on the Kaggle website.

2. Scroll down to the section labeled "API" and click on the "Create New API Token" button. This downloads a file named "kaggle.json" to your computer, which you then upload to Google Drive.

3. Make sure the "kaggle.json" file sits at the top level of your Google Drive ("My Drive"); the code below copies it from there into the Google Colab environment.

4. Continue by executing the following code:

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

# Copy the kaggle.json file to the correct location
!mkdir -p ~/.kaggle
!cp '/content/drive/My Drive/kaggle.json' ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Create a folder where to save the downloaded data
destination_path = '/content/dataproject/'
os.makedir(destination_path)

# Download the competition data & unzip it
if not os.path.exists('/content/edem-mda-2022-23-data-project-4.zip'):
  !kaggle competitions download -c edem-mda-2022-23-data-project-4
  print('Data downloaded successfully.')
  with zipfile.ZipFile('/content/edem-mda-2022-23-data-project-4.zip', 'r') as data_to_unzip:
    data_to_unzip.extractall(destination_path)
    print('Data extracted successfully.')
else: print('Data already downloaded and extracted successfully.')

Mounted at /content/drive
Downloading edem-mda-2022-23-data-project-4.zip to /content
  6% 1.14G/19.1G [00:51<13:37, 23.6MB/s]
User cancelled operation
Data downloaded successfully.


BadZipFile: ignored

# EDA

## Get a first glance at images

In [None]:
# Get a list of all train and test DICOM files.
# The archive is extracted into /content/dataproject/, so the folders are addressed with
# absolute paths instead of relying on the current working directory.
# Sorting makes the file order (and anything sampled from it) reproducible.
all_train_files = sorted(glob.glob('/content/dataproject/train/*.dcm'))
all_test_files = sorted(glob.glob('/content/dataproject/test/*.dcm'))

In [None]:
# NOTE: the preview below is disabled. The whole cell is a bare string literal, so none of the
# code inside it is executed. Remove the surrounding triple quotes to show 4 random train and
# 4 random test images.
'''# Randomly select 4 train and 4 test files
random_files = random.sample(all_train_files, 4) + random.sample(all_test_files, 4)

# Display the selected DICOM images
plt.figure(figsize=(12, 10))
plt.suptitle("8 Random Images from the Train & Test Dataset")

for i, file in enumerate(random_files):
    ds = pydicom.dcmread(file)
    plt.subplot(2, 4, i+1)
    plt.imshow(ds.pixel_array, cmap=plt.cm.gray)
    plt.title(f'Image {i+1}')
    plt.axis('off')

plt.tight_layout()
plt.show()'''

## Get metadata of images

In [None]:
# Get the first DICOM file in the folder train (sorted, so the choice is reproducible)
train_dir = '/content/dataproject/train'
file_name = next((file for file in sorted(os.listdir(train_dir)) if file.endswith('.dcm')), None)
if file_name is None:
    # Fail with a clear message instead of a TypeError from os.path.join(train_dir, None)
    raise FileNotFoundError(f'No .dcm files found in {train_dir}')
file_path = os.path.join(train_dir, file_name)
ds = pydicom.dcmread(file_path)

# Print the available metadata attributes
print(f"Metadata attributes for file: {file_name}")
metadata_list_all = dir(ds)
print(metadata_list_all)

In [None]:
 # Create dictionary to store the attribute values we need
metadata_list = ['SOPInstanceUID', 'Columns', 'Rows']

# One list per metadata attribute; the lists are filled in the same order as all_train_files
metadata_values = {attribute_name: [] for attribute_name in metadata_list}

# Iterate over all train files
for file_path in all_train_files:
    # Only the header is needed here, so skip decoding the (large) pixel data
    ds = pydicom.dcmread(file_path, stop_before_pixels=True)

    # Store each attribute value, or None when the DICOM object does not have it
    for attribute_name in metadata_list:
        metadata_values[attribute_name].append(getattr(ds, attribute_name, None))

# Create a pandas DataFrame from the metadata values
df_metadata = (
    pd.DataFrame(metadata_values)
    .rename(columns={'Columns': 'Width', 'Rows': 'Height'})
)

# Show the first rows of the resulting DataFrame
df_metadata.head(5)

## Get target labels (body parts)

In [None]:
# Load train.csv (extracted next to the image folders) to get target labels
train_data = pd.read_csv('/content/dataproject/train.csv')

# Define the bodyparts dictionary (numeric target -> body part name)
bodyparts_dict = {
    0: 'Abdomen',
    1: 'Tobillo',
    2: 'Columna cervical',
    3: 'Tórax',
    4: 'Clavículas',
    5: 'Codo',
    6: 'Pies',
    7: 'Dedos',
    8: 'Antebrazo',
    9: 'Mano',
    10: 'Cadera',
    11: 'Rodilla',
    12: 'Pierna',
    13: 'Columna lumbar',
    14: 'Otros',
    15: 'Pelvis',
    16: 'Hombro',
    17: 'Senos paranasales',
    18: 'Cráneo',
    19: 'Muslo',
    20: 'Columna torácica',
    21: 'Muñeca'
}

# Add bodypart labels/ target to dataframe.
# Columns added by a previous run of this cell are dropped first; otherwise a re-run would
# produce Target_x / Target_y and the lookup of 'Target' below would fail.
label_columns = [column for column in train_data.columns if column != 'SOPInstanceUID'] + ['BodyPart']
df_metadata = (
    df_metadata
    .drop(columns=label_columns, errors='ignore')
    .merge(train_data, on='SOPInstanceUID', how='left')
)
df_metadata['BodyPart'] = df_metadata['Target'].map(bodyparts_dict)
df_metadata.head()

## Descriptive Analytics

In [None]:
# Column names, dtypes and non-null counts of the metadata table
df_metadata.info()

In [None]:
# Summary statistics for all columns, numeric and non-numeric alike (include='all')
df_metadata.describe(include='all')

In [None]:
# How often does each (Height, Width) combination occur?
size_columns = ["Height", "Width"]
df_metadata.value_counts(subset=size_columns)

In [None]:
# Distribution of 'Height' and of 'Width': one histogram figure per dimension
for dimension, hist_title in (('Height', 'Distribution of Height'),
                              ('Width', 'Distribution of Width')):
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data=df_metadata, x=dimension, ax=ax)
    ax.set_title(hist_title)
    ax.set_xlabel(dimension)
    ax.set_ylabel('Count')
    plt.show()

# Relationship between 'Height' and 'Width'
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df_metadata, x='Height', y='Width', s=200, ax=ax)
ax.set_title('Relationship between Height and Width')
ax.set_xlabel('Height')
ax.set_ylabel('Width')
plt.show()

In [None]:
# Number of images per body part
bodypart_column = df_metadata['BodyPart']
bodypart_column.value_counts()

In [None]:
# Bar chart of the body part distribution, most frequent class first
bodypart_order = df_metadata['BodyPart'].value_counts().index
sns.countplot(data=df_metadata, x="BodyPart", order=bodypart_order)
plt.xticks(rotation='vertical')
plt.show()

# Transform images

1. Load DICOM images & clip intensity outliers (pixel values outside the 1st–99th percentile)
2. Change image format from .dcm to .jpg
3. Resize images to a width of 512 px, keeping the aspect ratio




In [None]:
# Define function to load DICOM images & clip intensity outliers (1st/99th percentile)
def load_image(sample_path):
    """Read a DICOM file and return its pixels as a float array scaled to [0, 1].

    Pixel values below the 1st or above the 99th percentile are clipped to those
    percentiles before scaling. Images whose PhotometricInterpretation is
    'MONOCHROME1' are inverted.

    Parameters
    ----------
    sample_path : str
        Path of the DICOM file to read.

    Returns
    -------
    numpy.ndarray
        float64 array with the shape of the DICOM pixel array and values in [0, 1].
        An image with a single distinct value after clipping is returned as all zeros.
    """
    ds = pydicom.dcmread(sample_path)

    # Work on a float copy: integer arithmetic such as `img - img.min()` can overflow
    # for signed 16-bit pixel data, and the decoded pixel array is left untouched.
    img = ds.pixel_array.astype(np.float64)

    # Clip intensity outliers
    p01, p99 = np.percentile(img, [1, 99])
    img = np.clip(img, p01, p99)

    # Min-max scaling; a constant image would otherwise divide by zero and yield NaN
    low = img.min()
    value_range = img.max() - low
    if value_range == 0:
        return np.zeros_like(img)
    img = (img - low) / value_range

    # Invert MONOCHROME1 images (a missing attribute is treated as "no inversion")
    if getattr(ds, 'PhotometricInterpretation', None) == 'MONOCHROME1':
        img = 1.0 - img
    return img

In [None]:
# Define function to change the data format
def change_format(imputdir, outdir):
    """Convert every DICOM file in ``imputdir`` to an 8-bit JPEG in ``outdir``.

    Parameters
    ----------
    imputdir : str
        Folder containing the .dcm files (with or without a trailing slash).
    outdir : str
        Folder the .jpg files are written to; created if it does not exist.

    Raises
    ------
    IOError
        If OpenCV reports that a JPEG could not be written.
    """
    os.makedirs(outdir, exist_ok=True)  # create output folder; a re-run must not fail
    # Only DICOM files are converted; anything else in the folder is ignored
    img_list = sorted(f for f in os.listdir(imputdir) if f.lower().endswith('.dcm'))
    print(f'Start of converting process [to .jpg] for folder {imputdir}...')
    count = 0
    for f in img_list:
        img = load_image(os.path.join(imputdir, f))  # read dicom image, scaled to [0, 1]
        out_path = os.path.join(outdir, os.path.splitext(f)[0] + '.jpg')
        # cv2.imwrite signals failure through its return value instead of raising
        if not cv2.imwrite(out_path, (img * 255).astype('uint8')):  # write jpeg image
            raise IOError(f'Could not write {out_path}')
        count += 1
    print(f'{count} images converted successfully.')
    print(f'\n End of converting process [to .jpg] for folder {imputdir}!')

In [None]:
# Define function to resize images
from PIL import Image

def change_size(imputdir, outdir, width=512):
    """Resize every image in ``imputdir`` to a fixed width and save it in ``outdir``.

    The height is scaled by the same factor as the width, so the aspect ratio is kept.
    Files keep their names.

    Parameters
    ----------
    imputdir : str
        Folder containing the images to resize.
    outdir : str
        Folder the resized images are written to; created if it does not exist.
    width : int, optional
        Target width in pixels (default 512).
    """
    os.makedirs(outdir, exist_ok=True)
    print(f'Start of resizing process [to {width} width while keeping aspect ratio] for folder {imputdir}...')
    for f in sorted(os.listdir(imputdir)):
        source_path = os.path.join(imputdir, f)
        if not os.path.isfile(source_path):
            continue  # skip sub-folders
        with Image.open(source_path) as img:
            original_width, original_height = img.size
            # At least 1 px high, even for extremely wide images
            new_height = max(1, int(original_height * width / original_width))
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
            resized = img.resize((width, new_height), Image.LANCZOS)
            resized.save(os.path.join(outdir, f))
    print(f'\n End of resizing process [to {width} width while keeping aspect ratio] for folder {imputdir}!')

In [None]:
# Convert the DICOM images of both splits (train & test) to .jpg
conversion_jobs = [
    ('/content/dataproject/train/', '/content/dataproject/train_jpg/'),
    ('/content/dataproject/test/', '/content/dataproject/test_jpg/'),
]
for dicom_dir, jpg_dir in conversion_jobs:
    change_format(dicom_dir, jpg_dir)



1 images converted successfully.
2 images converted successfully.
3 images converted successfully.
4 images converted successfully.
5 images converted successfully.
6 images converted successfully.
7 images converted successfully.
8 images converted successfully.
9 images converted successfully.
10 images converted successfully.
11 images converted successfully.
12 images converted successfully.
13 images converted successfully.
14 images converted successfully.
15 images converted successfully.
16 images converted successfully.
17 images converted successfully.
18 images converted successfully.
19 images converted successfully.
20 images converted successfully.
21 images converted successfully.
22 images converted successfully.
23 images converted successfully.
24 images converted successfully.
25 images converted successfully.
26 images converted successfully.
27 images converted successfully.
28 images converted successfully.
29 images converted successfully.
30 images converted suc



31 images converted successfully.
32 images converted successfully.
33 images converted successfully.
34 images converted successfully.
35 images converted successfully.
36 images converted successfully.
37 images converted successfully.
38 images converted successfully.
39 images converted successfully.
40 images converted successfully.
41 images converted successfully.
42 images converted successfully.
43 images converted successfully.
44 images converted successfully.
45 images converted successfully.
46 images converted successfully.
47 images converted successfully.
48 images converted successfully.
49 images converted successfully.
50 images converted successfully.
51 images converted successfully.
52 images converted successfully.
53 images converted successfully.
54 images converted successfully.
55 images converted successfully.
56 images converted successfully.
57 images converted successfully.
58 images converted successfully.
59 images converted successfully.
60 images conv



695 images converted successfully.
696 images converted successfully.
697 images converted successfully.
698 images converted successfully.
699 images converted successfully.
700 images converted successfully.
701 images converted successfully.
702 images converted successfully.
703 images converted successfully.
704 images converted successfully.
705 images converted successfully.
706 images converted successfully.
707 images converted successfully.
708 images converted successfully.
709 images converted successfully.
710 images converted successfully.
711 images converted successfully.
712 images converted successfully.
713 images converted successfully.
714 images converted successfully.
715 images converted successfully.
716 images converted successfully.
717 images converted successfully.
718 images converted successfully.
719 images converted successfully.
720 images converted successfully.
721 images converted successfully.
722 images converted successfully.
723 images converted

In [None]:
# Resize the JPEG copies into train/ and test/.
# Those folders still hold the raw DICOM files at this point. They are removed first so that
# the output folders can be created fresh and contain only the resized JPEGs (otherwise the
# raw DICOMs would end up in the final zip as well).
for split in ('train', 'test'):
    jpg_dir = f'/content/dataproject/{split}_jpg/'
    resized_dir = f'/content/dataproject/{split}/'
    # Never delete the raw data unless the converted copies are really there
    if not os.path.isdir(jpg_dir) or not os.listdir(jpg_dir):
        raise FileNotFoundError(f'{jpg_dir} is missing or empty - run the .jpg conversion first.')
    shutil.rmtree(resized_dir, ignore_errors=True)
    change_size(jpg_dir, resized_dir)

Start of resizing process [to 512 width while keeping aspect ratio] for folder /content/train/...

 End of resizing process [to 512 width while keeping aspect ratio] for folder /content/train/!
Start of resizing process [to 512 width while keeping aspect ratio] for folder /content/test/...

 End of resizing process [to 512 width while keeping aspect ratio] for folder /content/test/!


In [None]:
# Test the resizing (the resized images are written to train/, not train_512/)
image_path = os.path.join(
    '/content/dataproject/train',
    '1.2.826.0.1.3680043.8.498.10025629581362719970278200333618114258.jpg',
)
with Image.open(image_path) as image:
    image_size = image.size
assert image_size[0] == 512, f'Unexpected width after resizing: {image_size[0]}'
image_size

(512, 623)

In [None]:
# Remove old/ unnecessary files.
# Done in Python so that a real failure raises instead of being followed by a success message,
# and so that re-running the cell (paths already gone) is harmless.
obsolete_paths = [
    '/content/edem-mda-2022-23-data-project-4.zip',
    '/content/dataproject/train_jpg',
    '/content/dataproject/test_jpg',
]
for obsolete_path in obsolete_paths:
    if os.path.isdir(obsolete_path):
        shutil.rmtree(obsolete_path)
    elif os.path.exists(obsolete_path):
        os.remove(obsolete_path)
print('The old folders train_jpg & test_jpg, as well as the zip-file successfully removed.')

The old folders train_jpg & test_jpg successfully removed.


# Group train images by target label (body part) & move them in subfolders
The data augmentation depends on the target label and is therefore handled differently for each body part. For this reason, the images are grouped into subfolders according to the body part they portray.

In [None]:
# Read train.csv
df_train = pd.read_csv('/content/dataproject/train.csv')

# Specify the train directory
train_directory = '/content/dataproject/train'

moved_count = 0
missing_files = []

# The file name is in the first column and the class in the second column.
# .iloc selects by position explicitly (plain row[0] on a labelled Series is deprecated).
file_names = df_train.iloc[:, 0].astype(str)
file_classes = df_train.iloc[:, 1].astype(str)

for file_name, file_class in zip(file_names, file_classes):
    # Define full file name
    full_file_name = f'{file_name}.jpg'

    # Create the class folder if it does not exist yet
    class_folder_path = os.path.join(train_directory, file_class)
    os.makedirs(class_folder_path, exist_ok=True)

    # Move the file to its class folder
    source_path = os.path.join(train_directory, full_file_name)
    if os.path.exists(source_path):
        shutil.move(source_path, os.path.join(class_folder_path, full_file_name))
        moved_count += 1
    else:
        missing_files.append(full_file_name)

# Print a short summary instead of two lines per image
print(f'{moved_count} files moved into {file_classes.nunique()} class folders in {train_directory}.')
if missing_files:
    print(f'{len(missing_files)} files listed in train.csv do not exist, e.g. {missing_files[:5]}')

Directory  14  created.
The file  1.2.826.0.1.3680043.8.498.10062189329714053601496804394945741428.jpg  moved to  /content/train/14 .
Directory  3  created.
The file  1.2.826.0.1.3680043.8.498.53411283183733547704967879802673908605.jpg  moved to  /content/train/3 .
Directory  3  already exists.
The file  1.2.826.0.1.3680043.8.498.12955484645689261949928855045724356601.jpg  moved to  /content/train/3 .
Directory  3  already exists.
The file  1.2.826.0.1.3680043.8.498.10408703585974384892701383895533277303.jpg  moved to  /content/train/3 .
Directory  9  created.
The file  1.2.826.0.1.3680043.8.498.30255106562321206269035908958425925148.jpg  moved to  /content/train/9 .
Directory  0  created.
The file  1.2.826.0.1.3680043.8.498.80404681981834387091293069933458998586.jpg  moved to  /content/train/0 .
Directory  2  created.
The file  1.2.826.0.1.3680043.8.498.16743022096024694044879854041924202666.jpg  moved to  /content/train/2 .
Directory  2  already exists.
The file  1.2.826.0.1.3680043.

# Save images in GDrive for further classification

In [None]:
# Zip all contents.
# shutil.make_archive uses absolute paths (independent of the current working directory),
# does not print one line per file and raises on failure. The archive entries keep the
# 'dataproject/' prefix.
archive_path = shutil.make_archive(
    '/content/dataproject_images',  # '.zip' is appended automatically
    'zip',
    root_dir='/content',
    base_dir='dataproject',
)
print('All contents successfully zipped.')

  adding: dataproject_images/ (stored 0%)
  adding: dataproject_images/test/ (stored 0%)
  adding: dataproject_images/test/1.2.826.0.1.3680043.8.498.96118226411137430398376066700344314247.jpg (deflated 0%)
  adding: dataproject_images/test/1.2.826.0.1.3680043.8.498.67789794435741873870140624318123072500.jpg (deflated 0%)
  adding: dataproject_images/test/1.2.826.0.1.3680043.8.498.99979421603039894766639159445609592178.jpg (deflated 0%)
  adding: dataproject_images/test/1.2.826.0.1.3680043.8.498.81324253828056400238835697704001729028.jpg (deflated 0%)
  adding: dataproject_images/test/1.2.826.0.1.3680043.8.498.93414988560157361543313648560973656452.jpg (deflated 1%)
  adding: dataproject_images/test/1.2.826.0.1.3680043.8.498.89471984874914105305851256436952722376.jpg (deflated 2%)
  adding: dataproject_images/test/1.2.826.0.1.3680043.8.498.89317032683934050993384638790227083288.jpg (deflated 0%)
  adding: dataproject_images/test/1.2.826.0.1.3680043.8.498.85815712905123921161690933811847

In [None]:
# Move zip-file to GDrive.
# shutil.move raises if the move fails (e.g. Drive not mounted), so the success message
# below is only printed when the file has really been moved.
shutil.move('/content/dataproject_images.zip', '/content/drive/MyDrive/dataproject_images.zip')
print('Zip file moved successfully to GDrive. \n Now continue with the second or third colab notebook.')