In [1]:
import os
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator
from PIL import Image
import numpy as np
import random
import shutil

In [2]:
# Root folder of the dataset; the cells below treat each sub-folder as one class.
DATASET_FOLDER = "NSCLC_Final"
desktop_path = os.path.expanduser(DATASET_FOLDER)

In [3]:
# Show which dataset root is being scanned
print(desktop_path)

# Class labels are the sub-folders of the dataset root. Stray files and hidden
# folders (e.g. ".ipynb_checkpoints") are skipped because they are not classes,
# and the result is sorted because os.listdir returns entries in arbitrary order.
class_labels = sorted(
    entry
    for entry in os.listdir(desktop_path)
    if not entry.startswith('.') and os.path.isdir(os.path.join(desktop_path, entry))
)
num_classes = len(class_labels)

# Report what was found
print("Types of class labels found:", num_classes)
print("Class labels:", class_labels)

NSCLC_Final
Types of class labels found: 5
Class labels: ['Adenocarcinoma', 'Large_cell_carcinoma', 'Normal', 'Not_otherwise_specified', 'Squamous_cell_carcinoma']


In [4]:
# Build a DataFrame with one row per image: its class label and its file path.
# Entries are sorted (os.listdir order is arbitrary) and anything that is not a
# regular file is skipped, so a nested folder can never be recorded as an image.
data = []
for label in class_labels:
    folder_path = os.path.join(desktop_path, label)
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        if os.path.isfile(file_path):
            data.append({'Labels': label, 'image': file_path})

# Name the columns explicitly so the frame keeps both of them even when no image was found
df = pd.DataFrame(data, columns=['Labels', 'image'])
print(df.head())

           Labels                                              image
0  Adenocarcinoma    NSCLC_Final\Adenocarcinoma\Adenocarcinoma_1.png
1  Adenocarcinoma   NSCLC_Final\Adenocarcinoma\Adenocarcinoma_10.png
2  Adenocarcinoma  NSCLC_Final\Adenocarcinoma\Adenocarcinoma_100.png
3  Adenocarcinoma  NSCLC_Final\Adenocarcinoma\Adenocarcinoma_1000...
4  Adenocarcinoma  NSCLC_Final\Adenocarcinoma\Adenocarcinoma_1001...


In [5]:
# Overall dataset size, followed by the number of images per class label
total_images = len(df)
print("Total number of images in the dataset: ", total_images)

label_count = df['Labels'].value_counts()
print(label_count)

Total number of images in the dataset:  51215
Labels
Squamous_cell_carcinoma    18769
Large_cell_carcinoma       13655
Not_otherwise_specified     7643
Adenocarcinoma              6018
Normal                      5130
Name: count, dtype: int64


In [6]:
# Random geometric augmentations: rotation, shifts, flips, zoom and shear
augmentation_settings = {
    'rotation_range': 30,
    'width_shift_range': 0.1,
    'height_shift_range': 0.1,
    'horizontal_flip': True,
    'vertical_flip': True,
    'zoom_range': 0.1,
    'shear_range': 0.1,
    'fill_mode': 'nearest',
}
datagen = ImageDataGenerator(**augmentation_settings)

In [None]:
# Augment one class folder at a time: edit the settings below (and the
# save_prefix further down) and re-run the cell for each class that needs more samples.
directory = 'NSCLC_Radiomics_50k_R_C/Not_otherwise_specified'
save_dir = 'NSCLC_Radiomics_50k_R_C/Not_otherwise_specified'
max_images = 12000
os.makedirs(save_dir, exist_ok=True)

# Collect the source images before generating anything: save_dir is the same
# folder as directory here, so freshly written files must not be picked up.
source_paths = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(directory)
    for file in files
    if file.endswith('.png')
)

# datagen.flow() yields batches forever, so looping over it until max_images is
# reached would create every augmented image from the first file only. Instead
# the budget is spread evenly over all source images.
base_copies, extra_copies = divmod(max_images, len(source_paths)) if source_paths else (0, 0)

i = 0  # number of augmented images requested so far
for position, img_path in enumerate(source_paths):
    copies = base_copies + (1 if position < extra_copies else 0)
    if copies == 0:
        break  # budget exhausted; the remaining images get no augmentation

    # Close the file handle as soon as the pixels are in memory
    with Image.open(img_path) as img:
        img_array = np.array(img)

    # flow() expects (samples, height, width, channels): add the channel axis
    # for 2-D (grayscale) arrays and keep existing channels otherwise
    if img_array.ndim == 2:
        img_array = img_array[:, :, np.newaxis]
    img_array = img_array[np.newaxis, ...]

    augmented = datagen.flow(img_array, batch_size=1, save_to_dir=save_dir,
                             save_prefix='Not_otherwise_specified_', save_format='png')
    for _ in range(copies):
        next(augmented)  # drawing a batch writes one augmented image to save_dir
        i += 1


In [None]:
# Overall dataset size, followed by the number of images per class label.
# NOTE(review): `df` is not rebuilt in this cell, so files written by the
# augmentation cell above are not reflected in these counts - confirm this is intended.
total_images = len(df)
print("Total number of images in the dataset: ", total_images)

label_count = df['Labels'].value_counts()
print(label_count)

In [None]:
# Create up to max_images augmented samples of the Large_cell_carcinoma class
directory = 'NSCLC/Large_cell_carcinoma'
save_dir = 'Practice/Large_cell_carcinoma'
max_images = 500
os.makedirs(save_dir, exist_ok=True)

# Snapshot of the source images, in a deterministic order
source_paths = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(directory)
    for file in files
    if file.endswith('.png')
)

# datagen.flow() yields batches forever, so looping over it until max_images is
# reached would create every augmented image from the first file only. Instead
# the budget is spread evenly over all source images.
base_copies, extra_copies = divmod(max_images, len(source_paths)) if source_paths else (0, 0)

i = 0  # number of augmented images requested so far
for position, img_path in enumerate(source_paths):
    copies = base_copies + (1 if position < extra_copies else 0)
    if copies == 0:
        break  # budget exhausted; the remaining images get no augmentation

    # Close the file handle as soon as the pixels are in memory
    with Image.open(img_path) as img:
        img_array = np.array(img)

    # flow() expects (samples, height, width, channels): add the channel axis
    # for 2-D (grayscale) arrays and keep existing channels otherwise
    if img_array.ndim == 2:
        img_array = img_array[:, :, np.newaxis]
    img_array = img_array[np.newaxis, ...]

    augmented = datagen.flow(img_array, batch_size=1, save_to_dir=save_dir,
                             save_prefix='Large_cell_carcinoma_', save_format='png')
    for _ in range(copies):
        next(augmented)  # drawing a batch writes one augmented image to save_dir
        i += 1


In [None]:
# Create up to max_images augmented samples of the Normal class
directory = 'NSCLC/Normal'
save_dir = 'Practice/Normal'
max_images = 500
os.makedirs(save_dir, exist_ok=True)

# Snapshot of the source images, in a deterministic order
source_paths = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(directory)
    for file in files
    if file.endswith('.png')
)

# datagen.flow() yields batches forever, so looping over it until max_images is
# reached would create every augmented image from the first file only. Instead
# the budget is spread evenly over all source images.
base_copies, extra_copies = divmod(max_images, len(source_paths)) if source_paths else (0, 0)

i = 0  # number of augmented images requested so far
for position, img_path in enumerate(source_paths):
    copies = base_copies + (1 if position < extra_copies else 0)
    if copies == 0:
        break  # budget exhausted; the remaining images get no augmentation

    # Close the file handle as soon as the pixels are in memory
    with Image.open(img_path) as img:
        img_array = np.array(img)

    # flow() expects (samples, height, width, channels): add the channel axis
    # for 2-D (grayscale) arrays and keep existing channels otherwise
    if img_array.ndim == 2:
        img_array = img_array[:, :, np.newaxis]
    img_array = img_array[np.newaxis, ...]

    augmented = datagen.flow(img_array, batch_size=1, save_to_dir=save_dir,
                             save_prefix='Normal_', save_format='png')
    for _ in range(copies):
        next(augmented)  # drawing a batch writes one augmented image to save_dir
        i += 1


In [None]:
# Create up to max_images augmented samples of the Not_otherwise_specified class.
# NOTE(review): the other per-class cells save under 'Practice/'; here the
# augmented files go back into the source folder - confirm this is intended.
directory = 'NSCLC/Not_otherwise_specified'
save_dir = 'NSCLC/Not_otherwise_specified'
max_images = 500
os.makedirs(save_dir, exist_ok=True)

# Collect the source images before generating anything: save_dir is the same
# folder as directory here, so freshly written files must not be picked up.
source_paths = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(directory)
    for file in files
    if file.endswith('.png')
)

# datagen.flow() yields batches forever, so looping over it until max_images is
# reached would create every augmented image from the first file only. Instead
# the budget is spread evenly over all source images.
base_copies, extra_copies = divmod(max_images, len(source_paths)) if source_paths else (0, 0)

i = 0  # number of augmented images requested so far
for position, img_path in enumerate(source_paths):
    copies = base_copies + (1 if position < extra_copies else 0)
    if copies == 0:
        break  # budget exhausted; the remaining images get no augmentation

    # Close the file handle as soon as the pixels are in memory
    with Image.open(img_path) as img:
        img_array = np.array(img)

    # flow() expects (samples, height, width, channels): add the channel axis
    # for 2-D (grayscale) arrays and keep existing channels otherwise
    if img_array.ndim == 2:
        img_array = img_array[:, :, np.newaxis]
    img_array = img_array[np.newaxis, ...]

    # The prefix now matches the class being augmented (it previously read
    # 'Squamous_cell_carcinoma_', which mislabelled the generated files)
    augmented = datagen.flow(img_array, batch_size=1, save_to_dir=save_dir,
                             save_prefix='Not_otherwise_specified_', save_format='png')
    for _ in range(copies):
        next(augmented)  # drawing a batch writes one augmented image to save_dir
        i += 1


In [None]:
# Create up to max_images augmented samples of the Squamous_cell_carcinoma class
directory = 'NSCLC/Squamous_cell_carcinoma'
save_dir = 'Practice/Squamous_cell_carcinoma'
max_images = 500
os.makedirs(save_dir, exist_ok=True)

# Snapshot of the source images, in a deterministic order
source_paths = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(directory)
    for file in files
    if file.endswith('.png')
)

# datagen.flow() yields batches forever, so looping over it until max_images is
# reached would create every augmented image from the first file only. Instead
# the budget is spread evenly over all source images.
base_copies, extra_copies = divmod(max_images, len(source_paths)) if source_paths else (0, 0)

i = 0  # number of augmented images requested so far
for position, img_path in enumerate(source_paths):
    copies = base_copies + (1 if position < extra_copies else 0)
    if copies == 0:
        break  # budget exhausted; the remaining images get no augmentation

    # Close the file handle as soon as the pixels are in memory
    with Image.open(img_path) as img:
        img_array = np.array(img)

    # flow() expects (samples, height, width, channels): add the channel axis
    # for 2-D (grayscale) arrays and keep existing channels otherwise
    if img_array.ndim == 2:
        img_array = img_array[:, :, np.newaxis]
    img_array = img_array[np.newaxis, ...]

    # The prefix now matches the class being augmented (it previously read
    # 'Not_otherwise_specified_', which mislabelled the generated files)
    augmented = datagen.flow(img_array, batch_size=1, save_to_dir=save_dir,
                             save_prefix='Squamous_cell_carcinoma_', save_format='png')
    for _ in range(copies):
        next(augmented)  # drawing a batch writes one augmented image to save_dir
        i += 1

In [None]:
# Point the dataset root at the "Data" folder
data_root_name = "Data"
desktop_path = os.path.expanduser(data_root_name)

In [None]:
# Build a DataFrame with one row per image: its class label and its file path.
# Entries are sorted (os.listdir order is arbitrary) and anything that is not a
# regular file is skipped, so a nested folder can never be recorded as an image.
# NOTE(review): class_labels is not recomputed in this cell - confirm that the
# current desktop_path contains the same class folders.
data = []
for label in class_labels:
    folder_path = os.path.join(desktop_path, label)
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        if os.path.isfile(file_path):
            data.append({'Labels': label, 'image': file_path})

# Name the columns explicitly so the frame keeps both of them even when no image was found
df = pd.DataFrame(data, columns=['Labels', 'image'])
print(df.head())

In [None]:
import os
from PIL import Image
import pandas as pd

# Function to resize and save images into their respective class folders
def resize_save_image(row, target_size, output_folder):
    """Resize an image, centre-crop it to 90% and save it under its class folder.

    Parameters
    ----------
    row : mapping
        Must provide ``row['image']`` (path of the source image) and
        ``row['Labels']`` (class label, used as the sub-folder name).
    target_size : tuple
        ``(width, height)`` passed to ``Image.resize``.
    output_folder : str
        Root folder; the result is written to
        ``<output_folder>/<label>/<source file name>``.
    """
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same filter.
    # The context manager releases the source file handle once it has been read.
    with Image.open(row['image']) as img:
        img_resized = img.resize(target_size, Image.LANCZOS)

    # Centred crop box covering 90% of the resized width and height
    width, height = img_resized.size
    crop_width = int(width * 0.9)
    crop_height = int(height * 0.9)
    left = (width - crop_width) // 2
    top = (height - crop_height) // 2
    right = (width + crop_width) // 2
    bottom = (height + crop_height) // 2

    img_cropped = img_resized.crop((left, top, right, bottom))

    # Keep the original file name and store it in a folder named after the label
    class_label = row['Labels']
    filename = os.path.basename(row['image'])
    class_folder = os.path.join(output_folder, class_label)
    os.makedirs(class_folder, exist_ok=True)
    save_path = os.path.join(class_folder, filename)
    img_cropped.save(save_path)

# Size every image is resized to before the crop, and the root folder for the results
target_size = (330, 330)
output_folder = os.path.expanduser("NSCLC_Radiomics_50k_R_C")  # Update the folder path as needed

# Make sure the destination root exists before anything is written
os.makedirs(output_folder, exist_ok=True)

# Process every record of the DataFrame `df` (columns 'Labels' and 'image')
for _, record in df.iterrows():
    resize_save_image(record, target_size=target_size, output_folder=output_folder)

print("Image resizing and cropping completed.")

In [18]:
# Source folder (one sub-folder per class) and destination folder for the split
src_dir = "NSCLC_Radiomics_50k_R_C"
dest_dir = "NSCLC_Radiomics_25k_splitted"
# Define the classes
classes = ['Adenocarcinoma', 'Large_cell_carcinoma', 'Normal', 'Not_otherwise_specified', 'Squamous_cell_carcinoma']
# Define the train:test:validation ratio (same order as split_names)
ratios = [0.7, 0.15, 0.15]
split_names = ["train", "test", "validation"]

# A dedicated, seeded generator makes the split reproducible: re-running the cell
# copies every image into the same split again instead of scattering a new random
# assignment over the folders left behind by the previous run.
split_rng = random.Random(42)

# NOTE(review): if src_dir holds augmented copies of the same source image, a
# per-file shuffle can place them in different splits - confirm this is acceptable.
for c in classes:
    # Sort first: os.listdir order is arbitrary, which would defeat the fixed seed
    image_files = sorted(f for f in os.listdir(os.path.join(src_dir, c)) if f.endswith('.png'))
    split_rng.shuffle(image_files)

    # Calculate the number of images for each split
    num_images = len(image_files)
    num_train = int(num_images * ratios[0])
    num_test = int(num_images * ratios[1])
    num_val = num_images - num_train - num_test  # validation takes the rounding remainder

    # Slice boundaries of the shuffled list, in the same order as split_names
    boundaries = [0, num_train, num_train + num_test, num_images]

    # Copy (the source files are left in place) each slice into its split folder
    for split_name, start, stop in zip(split_names, boundaries, boundaries[1:]):
        split_folder = os.path.join(dest_dir, split_name, c)
        os.makedirs(split_folder, exist_ok=True)  # also creates dest_dir and the split folder
        for i in range(start, stop):
            src_file = os.path.join(src_dir, c, image_files[i])
            dest_file = os.path.join(split_folder, image_files[i])
            shutil.copy(src_file, dest_file)

print("Images organized successfully!")

Images organized successfully!
