<a href="https://colab.research.google.com/github/dibadabir/Project1_A24/blob/main/2nd_model_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os
import shutil
from google.colab import drive
import pandas as pd

# Mount Google Drive at /content/drive; every path below lives under that mount point.
drive.mount('/content/drive')

# Define paths
# source_folder is read as a module-level name by copy_images(), which looks up
# "<image_id>.jpg" directly inside it.
source_folder = "/content/drive/MyDrive/Discipline Specific/HAM 10000"
# One destination folder per binary class; also used later as the preprocessing inputs.
dest_cancerous = "/content/drive/MyDrive/2nd Model/Cancerous_Lesions"
dest_non_cancerous = "/content/drive/MyDrive/2nd Model/Non_Cancerous_Lesions"

# Create destination folders if they don't exist
for folder in [dest_cancerous, dest_non_cancerous]:
    os.makedirs(folder, exist_ok=True)

# Load the metadata table. It is NOT balanced at this point: per-label balancing
# is done further down by balanced_sample(). The code below relies on two columns:
# 'image_id' (file stem of each image) and 'dx' (diagnosis label).
file_path = "/content/drive/MyDrive/Discipline Specific/HAM10000_metadata.csv"
df = pd.read_csv(file_path)

# Define cancerous and non-cancerous categories
# These are values of the 'dx' column; each label is sampled separately, so the
# two classes end up with 3 x N and 4 x N rows respectively.
# NOTE(review): the grouping of dx codes into cancerous / non-cancerous is a
# project decision — confirm against the dataset's label description.
cancerous_labels = ['mel', 'bcc', 'akiec']
non_cancerous_labels = ['nv', 'bkl', 'df', 'vasc']

# Function to copy images
def copy_images(df_subset, destination_folder, source_dir=None):
    """Copy the ``.jpg`` file of every row in ``df_subset`` into ``destination_folder``.

    The file name is built as ``"<image_id>.jpg"`` from the ``image_id`` column.
    Files that do not exist in the source folder are skipped, but (unlike a
    silent skip) the number of skipped files is reported so that an incomplete
    copy is visible to the person running the notebook.

    Args:
        df_subset: DataFrame with an ``image_id`` column.
        destination_folder: Existing folder the images are copied into.
        source_dir: Folder to copy from. When omitted, the notebook-level
            ``source_folder`` variable is used, which is what existing callers
            rely on.

    Returns:
        int: Number of files actually copied.
    """
    if source_dir is None:
        # Backward-compatible default: the module-level path defined in the setup cell.
        source_dir = source_folder

    copied = 0
    missing = []
    # Only the image_id column is needed, so iterate it directly instead of
    # materialising every row with iterrows().
    for image_id in df_subset['image_id']:
        image_filename = f"{image_id}.jpg"  # Assuming images are in .jpg format
        source_path = os.path.join(source_dir, image_filename)

        if not os.path.exists(source_path):
            missing.append(image_filename)
            continue

        shutil.copy(source_path, os.path.join(destination_folder, image_filename))
        copied += 1

    if missing:
        print(
            f"Warning: {len(missing)} of {len(df_subset)} images were not found in "
            f"'{source_dir}' and were skipped (first missing: {missing[0]})."
        )
    return copied

# Select balanced samples (115 per category)
def balanced_sample(df, labels, sample_size=115):
    """Return ``sample_size`` randomly drawn rows for each label, stacked in label order.

    Rows are matched on the ``dx`` column and drawn without replacement using a
    fixed seed (42), so repeated calls on the same frame give the same rows.
    pandas raises ``ValueError`` if a label has fewer than ``sample_size`` rows.
    """
    draws_per_label = []
    for label in labels:
        rows_for_label = df.loc[df['dx'] == label]
        draws_per_label.append(
            rows_for_label.sample(n=sample_size, replace=False, random_state=42)
        )
    return pd.concat(draws_per_label)

# Draw 115 rows per dx label: 3 cancerous labels -> 345 rows, 4 non-cancerous labels -> 460 rows.
# The two classes are therefore balanced per label, not per class.
balanced_cancerous = balanced_sample(df, cancerous_labels, sample_size=115)
balanced_non_cancerous = balanced_sample(df, non_cancerous_labels, sample_size=115)

# Copy images to respective folders
# copy_images only copies files that actually exist in the source folder.
copy_images(balanced_cancerous, dest_cancerous)
copy_images(balanced_non_cancerous, dest_non_cancerous)

# Printed unconditionally: this message does not guarantee that every sampled
# image was found and copied.
print("Images copied successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Images copied successfully!


In [7]:
import os
import random
from google.colab import drive
import cv2
import numpy as np
from tqdm import tqdm


# Step 4: Define the DullRazor hair removal function
def remove_hair_dullrazor(image, kernel_size=17, threshold=10, inpaint_radius=3):
    """Remove hair-like structures from ``image`` (DullRazor-style).

    Pipeline: convert to grayscale, apply a black-hat morphological transform
    with a square kernel, threshold the response into a binary hair mask, and
    inpaint the masked pixels of the original image with Telea's method.

    The defaults reproduce the previously hard-coded values, so existing calls
    behave as before; the parameters exist so the filter can be tuned for
    images of a different resolution or hair thickness.

    Args:
        image: Colour image as returned by ``cv2.imread`` (BGR channel order is
            assumed by the grayscale conversion).
        kernel_size: Side length, in pixels, of the square structuring element.
        threshold: Black-hat response above which a pixel is treated as hair.
        inpaint_radius: Neighbourhood radius passed to ``cv2.inpaint``.

    Returns:
        The inpainted image.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_size, kernel_size))
    blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, kernel)
    # 255 marks the pixels to be reconstructed by the inpainting step.
    _, hair_mask = cv2.threshold(blackhat, threshold, 255, cv2.THRESH_BINARY)
    return cv2.inpaint(image, hair_mask, inpaintRadius=inpaint_radius, flags=cv2.INPAINT_TELEA)

# Step 5: Define augmentation functions
def adjust_brightness_contrast(image, brightness=10, contrast=10):
    """Apply a random brightness and contrast change and return a uint8 image.

    ``brightness`` and ``contrast`` are the maximum magnitudes: the applied
    values are drawn as integers from ``[-brightness, brightness]`` and
    ``[-contrast, contrast]`` using the global ``random`` module state.
    """
    # Draw order (brightness first, then contrast) determines the result under a seeded RNG.
    brightness_shift = random.randint(-brightness, brightness)
    contrast_shift = random.randint(-contrast, contrast)

    gain = contrast_shift / 127 + 1
    # Widen to a signed type first so the arithmetic below cannot wrap around.
    pixels = np.asarray(image).astype(np.int16)
    adjusted = pixels * gain - contrast_shift + brightness_shift

    # Clamp to the valid 8-bit range before converting back.
    return np.clip(adjusted, 0, 255).astype(np.uint8)

def flip_image(image):
    """Return ``image`` flipped horizontally via ``cv2.flip`` (flip code 1)."""
    horizontal_flip_code = 1
    return cv2.flip(image, horizontal_flip_code)

def rotate_image(image, angle):
    """Rotate ``image`` by ``angle`` about its centre, keeping the original width and height.

    The centre is computed with integer division, the scale factor is fixed at
    1.0, and ``angle`` is passed unchanged to ``cv2.getRotationMatrix2D``.
    """
    height, width = image.shape[:2]
    pivot = (width // 2, height // 2)
    rotation = cv2.getRotationMatrix2D(pivot, angle, 1.0)
    # Output canvas is (width, height): same size as the input.
    return cv2.warpAffine(image, rotation, (width, height))

# Step 6: Define the image processing function
def process_and_save_images(input_folder, total_images=1000, output_dir=None):
    """Preprocess the ``.jpg`` files in ``input_folder`` and write at most ``total_images`` files.

    For every readable image: remove hair (``remove_hair_dullrazor``), resize to
    224x224, then save the result followed by two augmented variants (horizontal
    flip and random brightness/contrast). Outputs are named ``0.jpg``,
    ``1.jpg``, ... in the order they are written, so the original file names
    are not preserved.

    Args:
        input_folder: Folder whose ``.jpg`` files are processed, in sorted name
            order so the numbering is the same on every run.
        total_images: Maximum number of files to write; processing stops as
            soon as this many have been written.
        output_dir: Folder to write into (created if missing). When omitted,
            the notebook-level ``output_folder`` variable is used, which is what
            existing callers rely on.

    Returns:
        int: Number of image files written.

    Raises:
        NameError: If ``output_dir`` is omitted and no notebook-level
            ``output_folder`` variable exists.
    """
    if output_dir is None:
        # Backward compatibility: earlier versions read the caller's module-level
        # `output_folder` implicitly. Prefer passing output_dir explicitly.
        try:
            output_dir = output_folder
        except NameError:
            raise NameError(
                "output_dir was not given and no notebook-level 'output_folder' is defined"
            ) from None
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir order is arbitrary; sorting keeps the output numbering reproducible.
    selected_images = sorted(f for f in os.listdir(input_folder) if f.endswith(".jpg"))

    processed_count = 0
    for image_file in tqdm(selected_images):
        # Checked before any work so that total_images <= 0 writes nothing.
        if processed_count >= total_images:
            break

        input_path = os.path.join(input_folder, image_file)
        image = cv2.imread(input_path)

        if image is None:
            print(f"Error reading image: {image_file}. Skipping...")
            continue

        # Remove hair
        image_no_hair = remove_hair_dullrazor(image)

        # Resize
        resized_image = cv2.resize(image_no_hair, (224, 224))

        # No explicit normalisation here: images are written back as 8-bit JPEGs,
        # so scaling to [0, 1] and straight back to [0, 255] could only lose
        # precision. Scale to [0, 1] when loading the data for training instead.

        # Processed original first, then its augmented variants.
        variants = [
            resized_image,
            flip_image(resized_image),
            adjust_brightness_contrast(resized_image),
        ]

        for variant in variants:
            if processed_count >= total_images:
                break
            output_path = os.path.join(output_dir, f"{processed_count}.jpg")
            # cv2.imwrite reports failure through its return value rather than raising.
            if not cv2.imwrite(output_path, variant):
                print(f"Error writing image: {output_path}. Skipping...")
                continue
            processed_count += 1

    return processed_count



In [9]:
# Step 2: Specify paths
# Output folders for the preprocessed (hair-removed, resized, augmented) images, one per class.
cancerous_output = "/content/drive/MyDrive/2nd Model/Cancerous_Lesions_Preprocessed"
non_cancerous_output = "/content/drive/MyDrive/2nd Model/Non_Cancerous_Lesions_Preprocessed"

# Check if input and output folders exist, then preprocess each class in turn.
# NOTE: dest_cancerous / dest_non_cancerous are defined in the first cell, which
# must have been run in this kernel session.
# NOTE: the loop variable name `output_folder` is load-bearing. The call below
# passes no output location, and process_and_save_images picks up the
# module-level `output_folder` set by this loop — do not rename it.
for input_folder, output_folder in [(dest_cancerous, cancerous_output), (dest_non_cancerous, non_cancerous_output)]:
  if not os.path.exists(input_folder):
      raise FileNotFoundError(f"The input folder '{input_folder}' does not exist.")
  if not os.path.exists(output_folder):
      os.makedirs(output_folder, exist_ok=True)

  # total_images caps how many files are written for this class.
  process_and_save_images(input_folder, total_images=1000)



# Step 8: Count the number of images in the output folder
def count_images_in_folder(folder):
    """Print how many entries in ``folder`` have a name ending in ``.jpg``."""
    jpg_total = sum(1 for entry in os.listdir(folder) if entry.endswith(".jpg"))
    print(f"\nTotal images in '{folder}': {jpg_total}")

# Report how many .jpg files each preprocessed class folder now contains.
count_images_in_folder(cancerous_output)
count_images_in_folder(non_cancerous_output)

 97%|█████████▋| 333/345 [01:05<00:02,  5.12it/s]
 72%|███████▏  | 333/460 [01:07<00:25,  4.95it/s]

Total images in '/content/drive/MyDrive/2nd Model/Cancerous_Lesions_Preprocessed': 1000
Total images in '/content/drive/MyDrive/2nd Model/Non_Cancerous_Lesions_Preprocessed': 1000



