<a href="https://colab.research.google.com/github/fikrifaizz/Brain-Tumor-MRI-Classification/blob/main/notebooks/02_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import os
import shutil
import random
import numpy as np
import pandas as pd
from pathlib import Path
from PIL import Image
from sklearn.model_selection import train_test_split

# One seed shared by every random number generator this notebook imports, so a
# Restart & Run All gives the same results each time.
seed = 42
random.seed(seed)      # Python's built-in `random` module
np.random.seed(seed)   # NumPy's legacy global RNG

In [3]:
# Dataset locations, expressed relative to the notebook's working directory.
BASE_DIR = Path('../data')
RAW_DIR = BASE_DIR.joinpath('raw', 'Brain Tumor MRI Dataset')
PROCESSED_DIR = BASE_DIR.joinpath('processed')

# Intended share of the images for the train / validation / test splits.
TRAIN_RATIO, VAL_RATIO, TEST_RATIO = 0.80, 0.1, 0.1

# Echo the configuration so the recorded output documents what was used.
print(
    f"Raw data directory: {RAW_DIR}",
    f"Processed data directory: {PROCESSED_DIR}",
    f"Split ratios - Train: {TRAIN_RATIO}, Val: {VAL_RATIO}, Test: {TEST_RATIO}",
    sep="\n",
)

Raw data directory: ../data/raw/Brain Tumor MRI Dataset
Processed data directory: ../data/processed
Split ratios - Train: 0.8, Val: 0.1, Test: 0.1


In [5]:
# Output folders for the three splits; each receives one sub-folder per class.
train_dir = PROCESSED_DIR / 'train'
val_dir = PROCESSED_DIR / 'val'
test_dir = PROCESSED_DIR / 'test'

# Only files with these extensions are split and copied, so stray entries in a
# class folder (hidden files, sub-folders) never end up in a split.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

# Fail early on a misconfigured split instead of silently producing odd sizes.
if abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) > 1e-9:
    raise ValueError("TRAIN_RATIO, VAL_RATIO and TEST_RATIO must sum to 1")

# train_test_split is applied twice: first train vs. hold-out, then the
# hold-out is divided into validation and test. Both fractions are derived
# from the configured ratios rather than hard-coded.
holdout_ratio = VAL_RATIO + TEST_RATIO
test_share_of_holdout = TEST_RATIO / holdout_ratio


def copy_images(filenames, src_dir, dst_dir):
    """Copy every file named in `filenames` from `src_dir` into `dst_dir`.

    `dst_dir` is created (including parents) when it does not exist yet.
    Existing files with the same name are overwritten by `shutil.copy`.
    """
    dst_dir.mkdir(parents=True, exist_ok=True)
    for filename in filenames:
        shutil.copy(src_dir / filename, dst_dir / filename)


for folder in [train_dir, val_dir, test_dir]:
    folder.mkdir(parents=True, exist_ok=True)

# Sorted so the processing order does not depend on the filesystem.
classes = sorted(entry.name for entry in RAW_DIR.iterdir() if entry.is_dir())

for cls in classes:
    cls_path = RAW_DIR / cls

    # Directory listing order is arbitrary; sorting makes the seeded shuffle
    # below assign the same files to the same split on every machine.
    images = sorted(
        entry.name for entry in cls_path.iterdir()
        if entry.is_file() and entry.name.lower().endswith(IMAGE_EXTENSIONS)
    )

    # Splitting inside each class keeps the class balance equal across splits.
    train_images, temp_images = train_test_split(
        images, test_size=holdout_ratio, random_state=seed
    )
    val_images, test_images = train_test_split(
        temp_images, test_size=test_share_of_holdout, random_state=seed
    )

    # The three splits must partition the class: nothing lost, nothing shared.
    assert len(train_images) + len(val_images) + len(test_images) == len(images)
    assert len(set(train_images) | set(val_images) | set(test_images)) == len(images)

    # NOTE: files already present from an earlier run with different ratios or
    # a different seed are not removed; clear PROCESSED_DIR before re-splitting
    # with a changed configuration to avoid leakage between splits.
    copy_images(train_images, cls_path, train_dir / cls)
    copy_images(val_images, cls_path, val_dir / cls)
    copy_images(test_images, cls_path, test_dir / cls)

print("Train-Validation-Test split Have Done!")

Train-Validation-Test split Have Done!


In [8]:
# Split label -> directory holding that split's per-class sub-folders.
splits = dict(train=train_dir, val=val_dir, test=test_dir)

# Extensions (matched case-insensitively) that count as an image.
valid_ext = ('.png', '.jpg', '.jpeg')

# One record per (split, class) pair with the number of images found on disk.
data = []
for split_name, split_dir in splits.items():
    for cls in os.listdir(split_dir):
        cls_path = os.path.join(split_dir, cls)
        if os.path.isdir(cls_path):  # ignore stray files next to class folders
            files = [f for f in os.listdir(cls_path) if f.lower().endswith(valid_ext)]
            data.append({"split": split_name, "class": cls, "count": len(files)})

df = pd.DataFrame(data)

# Wide table: one row per class, one column per split.
counts_by_class = df.pivot(index="class", columns="split", values="count")
print("===== Total Images by Class by Split =====")
print(counts_by_class)

# Overall number of images in each split.
counts_by_split = df.groupby("split")["count"].sum()
print("\n===== Total Images by Split =====")
print(counts_by_split)


===== Total Images by Class by Split =====
split       test  train  val
class                       
Glioma       163   1296  162
Meningioma   165   1316  164
No Tumor     200   1600  200
Pituitary    176   1405  176

===== Total Images by Split =====
split
test      704
train    5617
val       702
Name: count, dtype: int64
