In [None]:
pip install kagglehub

## Importing dataset

In [1]:
import os
from PIL import Image
import glob

# Specify the path to the dataset in your local repository
dataset_path = '/Users/apple/Desktop/PG/SEM-3/BIOS-511/bios-final/data'

# Check if the directory exists
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"The specified path {dataset_path} does not exist.")

# List all image files in the directory (assuming jpg and png images)
image_files = glob.glob(os.path.join(dataset_path, '**/*.jpg'), recursive=True)
image_files += glob.glob(os.path.join(dataset_path, '**/*.png'), recursive=True)

# Check if images are found
if not image_files:
    print("No images found in the specified directory.")
else:
    print(f"Found {len(image_files)} images in the dataset.")

# Load and display a sample image to confirm the dataset is accessible
sample_image = Image.open(image_files[0])
sample_image.show()  # Opens the first image in your default image viewer

# Optional: Load all images (for further processing)
images = [Image.open(img_path) for img_path in image_files]

print("Dataset successfully loaded from local repository.")


Found 9208 images in the dataset.
Dataset successfully loaded from local repository.


In [2]:
import os
import shutil
import random
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [3]:
import os

dataset_path = '/Users/apple/Desktop/PG/SEM-3/BIOS-511/bios-final/data'
output_path = '/Users/apple/Desktop/PG/SEM-3/BIOS-511/bios-final/processed_data'

if not os.path.exists(output_path):
    os.makedirs(output_path)

## Finetuning and Augmenting Dataset

In [4]:
classes = ['Normal', 'Pneumonia-Bacterial', 'Pneumonia-Viral', 'COVID-19']

In [6]:
class_counts = {}
for cls in classes:
    class_dir = os.path.join(dataset_path, cls)
    class_counts[cls] = len(os.listdir(class_dir))

max_count = max(class_counts.values())
print("Class counts before balancing:", class_counts)

Class counts before balancing: {'Normal': 3270, 'Pneumonia-Bacterial': 3001, 'Pneumonia-Viral': 1656, 'COVID-19': 1281}


In [7]:
def augment_images(image_paths, num_new_images, save_dir):
    datagen = ImageDataGenerator(
        rotation_range=15,
        width_shift_range=0.05,
        height_shift_range=0.05,
        shear_range=0.05,
        zoom_range=0.05,
        horizontal_flip=True,
        fill_mode='nearest'
    )

    num_generated = 0
    while num_generated < num_new_images:
        for img_path in image_paths:
            img = Image.open(img_path)
            img = img.resize((224, 224))
            x = np.array(img)
            x = x.reshape((1,) + x.shape)

            prefix = os.path.splitext(os.path.basename(img_path))[0]
            for batch in datagen.flow(x, batch_size=1, save_to_dir=save_dir, save_prefix=prefix, save_format='jpg'):
                num_generated += 1
                if num_generated >= num_new_images:
                    break
        if num_generated >= num_new_images:
            break


In [8]:
target_count = 3500  # Adjust as needed

for cls in classes:
    class_dir = os.path.join(dataset_path, cls)
    images = [os.path.join(class_dir, fname) for fname in os.listdir(class_dir)]
    current_count = len(images)
    save_dir = os.path.join(output_path, cls)
    os.makedirs(save_dir, exist_ok=True)
    
    # Copy existing images
    for img_path in images:
        shutil.copy(img_path, save_dir)
    
    # Augment images if needed
    if current_count < target_count:
        num_new_images = target_count - current_count
        augment_images(images, num_new_images, save_dir)
    elif current_count > target_count:
        # Randomly select images to match target_count
        images_to_keep = random.sample(images, target_count)
        for img_path in images:
            if img_path not in images_to_keep:
                os.remove(os.path.join(save_dir, os.path.basename(img_path)))
    
    print(f"Class {cls} balanced to {target_count} images.")


Class Normal balanced to 3500 images.
Class Pneumonia-Bacterial balanced to 3500 images.
Class Pneumonia-Viral balanced to 3500 images.
Class COVID-19 balanced to 3500 images.


In [9]:
train_dir = os.path.join(output_path, 'train')
val_dir = os.path.join(output_path, 'validation')
test_dir = os.path.join(output_path, 'test')

for cls in classes:
    class_dir = os.path.join(output_path, cls)
    images = [os.path.join(class_dir, fname) for fname in os.listdir(class_dir)]
    
    train_images, temp_images = train_test_split(images, test_size=0.3, random_state=42)
    val_images, test_images = train_test_split(temp_images, test_size=0.5, random_state=42)
    
    # Function to copy images to destination
    def copy_images(image_list, destination):
        dest_dir = os.path.join(destination, cls)
        os.makedirs(dest_dir, exist_ok=True)
        for img_path in image_list:
            shutil.move(img_path, os.path.join(dest_dir, os.path.basename(img_path)))
    
    copy_images(train_images, train_dir)
    copy_images(val_images, val_dir)
    copy_images(test_images, test_dir)


##  Creating Generators

In [10]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [11]:
img_height, img_width = 224, 224  # Input size for the model
batch_size = 32

train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    width_shift_range=0.05,
    height_shift_range=0.05,
    shear_range=0.05,
    zoom_range=0.05,
    horizontal_flip=True,
    fill_mode='nearest'
)

val_test_datagen = ImageDataGenerator(rescale=1./255)


In [12]:
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical'
)

validation_generator = val_test_datagen.flow_from_directory(
    val_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical'
)

test_generator = val_test_datagen.flow_from_directory(
    test_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False  # Important for evaluation
)


Found 15950 images belonging to 4 classes.
Found 3419 images belonging to 4 classes.
Found 3420 images belonging to 4 classes.


## Model