<a href="https://colab.research.google.com/github/gvvishal/-Intelligent-Clothing-Stock-Management-System-using-AI-and-SQL/blob/main/PR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  STEP 1: Mount Google Drive


In [28]:
from google.colab import drive
drive.mount('/content/drive')

# Set dataset path
dataset_path = "/content/drive/MyDrive/Binary_Classification"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#  STEP 2: Check Dataset Structure

In [29]:
import os

for split in ["Train", "Test"]:
    for cls in ["Tumor", "No_Tumor"]:
        folder = os.path.join(dataset_path, split, cls)
        print(f"{split}/{cls}: {len(os.listdir(folder))} images")




Train/Tumor: 3904 images
Train/No_Tumor: 1058 images
Test/Tumor: 755 images
Test/No_Tumor: 139 images


#  STEP 3: Remove Duplicate Images

In [30]:
import hashlib
from collections import defaultdict
from PIL import Image

def hash_image(image_path):
    """Return a hash for an image (so duplicates can be detected)."""
    try:
        with Image.open(image_path) as img:
            img = img.convert("RGB").resize((256, 256))  # normalize size
            return hashlib.md5(img.tobytes()).hexdigest()
    except Exception as e:
        print(f"Error reading {image_path}: {e}")
        return None

# Step 3.1: Collect hashes
hash_map = defaultdict(list)

for split in ["Train", "Test"]:
    for cls in ["Tumor", "No_Tumor"]:
        folder = os.path.join(dataset_path, split, cls)
        for fname in os.listdir(folder):
            fpath = os.path.join(folder, fname)
            img_hash = hash_image(fpath)
            if img_hash:
                hash_map[img_hash].append(fpath)

# Step 3.2: Decide which duplicates to delete
to_delete = []

for img_hash, files in hash_map.items():
    if len(files) > 1:
        train_files = [f for f in files if "/Train/" in f]
        test_files  = [f for f in files if "/Test/" in f]

        # Rule 1: Keep 1 in Train, delete rest
        if len(train_files) > 1:
            to_delete.extend(train_files[1:])

        # Rule 2: If same image appears in Train and Test → delete Test copies
        if train_files and test_files:
            to_delete.extend(test_files)

        # Rule 3: If only in Test, keep 1, delete rest
        if not train_files and len(test_files) > 1:
            to_delete.extend(test_files[1:])

# Step 3.3: Delete duplicates
for f in to_delete:
    try:
        os.remove(f)
        print(f"Deleted duplicate: {f}")
    except Exception as e:
        print(f"Error deleting {f}: {e}")

print(f"\n✅ Done! Removed {len(to_delete)} duplicates.")


✅ Done! Removed 0 duplicates.


In [35]:
import os

for split in ["Train", "Test"]:
    for cls in ["Tumor", "No_Tumor"]:
        folder = os.path.join(dataset_path, split, cls)
        print(f"{split}/{cls}: {len(os.listdir(folder))} images")




Train/Tumor: 3904 images
Train/No_Tumor: 1058 images
Test/Tumor: 755 images
Test/No_Tumor: 139 images


#  STEP 4: Image Generators (Data Loading)


In [31]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

train_dir = os.path.join(dataset_path, "Train")
test_dir  = os.path.join(dataset_path, "Test")

IMG_SIZE = (224, 224)
BATCH_SIZE = 32

# Train with augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    validation_split=0.2
)

# Validation (no augmentation, only rescale)
val_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

# Test (no augmentation)
test_datagen = ImageDataGenerator(rescale=1./255)

# Generators
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="binary",
    subset="training"
)

val_generator = val_datagen.flow_from_directory(
    train_dir,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="binary",
    subset="validation"
)

test_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="binary",
    shuffle=False
)


Found 3971 images belonging to 2 classes.
Found 991 images belonging to 2 classes.
Found 894 images belonging to 2 classes.


In [23]:

for split in ["Train", "Test"]:
    for cls in ["Tumor", "No_Tumor"]:
        folder = os.path.join(dataset_path, split, cls)
        print(f"{split}/{cls}: {len(os.listdir(folder))} images")


Train/Tumor: 3904 images
Train/No_Tumor: 1058 images
Test/Tumor: 755 images
Test/No_Tumor: 139 images


#STEP 5: Handle Class Imbalance

In [33]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Count training images
tumor_count = len(os.listdir(os.path.join(train_dir, "Tumor")))
no_tumor_count = len(os.listdir(os.path.join(train_dir, "No_Tumor")))

print(f"Train/Tumor: {tumor_count}, Train/No_Tumor: {no_tumor_count}")

# Assign class weights (more weight to minority class)
classes = np.array([0]*no_tumor_count + [1]*tumor_count)  # 0=No_Tumor, 1=Tumor
weights = compute_class_weight(class_weight="balanced", classes=np.unique(classes), y=classes)
class_weight = dict(zip(np.unique(classes), weights))

print("Class Weights:", class_weight)



Train/Tumor: 3904, Train/No_Tumor: 1058
Class Weights: {np.int64(0): np.float64(2.3449905482041586), np.int64(1): np.float64(0.6355020491803278)}


In [37]:
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import Sequence
import os
import random
from PIL import Image

# =========================
# 1. Parameters
# =========================
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
train_dir = os.path.join(dataset_path, "Train")
classes = ["No_Tumor", "Tumor"]  # class order

# =========================
# 2. Custom Generator with Oversampling
# =========================
class BalancedDataGenerator(Sequence):
    def __init__(self, directory, classes, batch_size, img_size, augmentor=None):
        self.directory = directory
        self.classes = classes
        self.batch_size = batch_size
        self.img_size = img_size
        self.augmentor = augmentor

        # Collect file paths per class
        self.file_paths = []
        self.labels = []
        for idx, cls in enumerate(classes):
            cls_folder = os.path.join(directory, cls)
            files = [os.path.join(cls_folder, f) for f in os.listdir(cls_folder)]
            self.file_paths.extend(files)
            self.labels.extend([idx] * len(files))

        # Separate minority and majority class indices
        counts = [self.labels.count(i) for i in range(len(classes))]
        self.max_count = max(counts)
        self.indices_per_class = {i: [j for j, lbl in enumerate(self.labels) if lbl == i] for i in range(len(classes))}

    def __len__(self):
        # Total batches per epoch
        return int(np.ceil(self.max_count * len(self.classes) / self.batch_size))

    def __getitem__(self, idx):
        batch_paths = []
        batch_labels = []

        # Oversample minority class
        for cls_idx in range(len(self.classes)):
            cls_indices = self.indices_per_class[cls_idx]
            sampled = np.random.choice(cls_indices, self.max_count, replace=True)
            batch_paths.extend([self.file_paths[i] for i in sampled])
            batch_labels.extend([cls_idx] * len(sampled))

        # Shuffle batch
        combined = list(zip(batch_paths, batch_labels))
        random.shuffle(combined)
        batch_paths, batch_labels = zip(*combined)

        # Take only batch_size images
        batch_paths = batch_paths[:self.batch_size]
        batch_labels = batch_labels[:self.batch_size]

        # Load images and apply augmentation if any
        batch_x = []
        for p in batch_paths:
            img = Image.open(p).convert("RGB").resize(self.img_size)
            img_array = np.array(img) / 255.0
            if self.augmentor:
                img_array = self.augmentor.random_transform(img_array)
            batch_x.append(img_array)

        return np.array(batch_x), np.array(batch_labels)

# =========================
# 3. Define Augmentor
# =========================
train_augmentor = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode="nearest"
)

# =========================
# 4. Create Balanced Generator
# =========================
balanced_train_generator = BalancedDataGenerator(
    directory=train_dir,
    classes=classes,
    batch_size=BATCH_SIZE,
    img_size=IMG_SIZE,
    augmentor=train_augmentor
)


In [38]:
import numpy as np

classes = ["No_Tumor", "Tumor"]
total_counts = np.zeros(len(classes), dtype=int)

for i in range(len(balanced_train_generator)):
    _, y_batch = balanced_train_generator[i]
    for lbl in y_batch:
        total_counts[lbl] += 1

for idx, cls in enumerate(classes):
    print(f"{cls}: {total_counts[idx]} effective samples in one epoch")


No_Tumor: 3879 effective samples in one epoch
Tumor: 3929 effective samples in one epoch
