In [27]:
import kagglehub

# Fetch the latest published version of the dataset from Kaggle.
DATASET_HANDLE = "navoneel/brain-mri-images-for-brain-tumor-detection"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'brain-mri-images-for-brain-tumor-detection' dataset.
Path to dataset files: /kaggle/input/brain-mri-images-for-brain-tumor-detection


In [28]:
from pathlib import Path
import shutil

# Read-only location of the downloaded dataset.
dataset_dir = Path("/kaggle/input/brain-mri-images-for-brain-tumor-detection")

# Writable working copy; later cells rename files in here.
new_dir = Path("/kaggle/preprocess/brain-mri-dataset")

# shutil.copytree() raises FileExistsError when the destination already
# exists, so re-running this cell used to fail. Copy only when the working
# directory is missing; an existing copy (possibly already renamed by later
# cells) is left untouched.
# NOTE(review): a partially copied tree from an interrupted run is not
# detected here — delete new_dir manually in that case.
if not new_dir.exists():
    shutil.copytree(dataset_dir, new_dir)

new_dir


PosixPath('/kaggle/preprocess/brain-mri-dataset')

In [29]:
# Normalise JPEG file extensions to ".jpg"
yes_dir = new_dir / 'yes'
no_dir = new_dir / 'no'
total_dir = new_dir / 'brain_tumor_dataset'

extensions = ["*.jpg", "*.JPG", "*.jpeg"]

# Compare suffixes case-insensitively so variants such as ".JPEG" or ".Jpg"
# are normalised too, not only the exact spellings listed above.
jpeg_suffixes = {Path(pattern).suffix.lower() for pattern in extensions}

# Snapshot the tree before renaming anything: mutating directory entries
# while the rglob() generator is still being consumed is not guaranteed to
# be safe. The loop variable is deliberately not called `path`, which would
# shadow the dataset path returned by kagglehub in the first cell.
for img_path in sorted(new_dir.rglob('*')):
    if not img_path.is_file():
        continue
    if img_path.suffix.lower() not in jpeg_suffixes:
        continue
    if img_path.suffix == '.jpg':
        # Already normalised; renaming a file onto itself is pointless.
        continue

    new_path = img_path.with_suffix('.jpg')

    # On POSIX, Path.rename() silently replaces an existing target, which
    # would drop an image. Keep both files and report the clash instead.
    if new_path.exists():
        print(f"Skipping {img_path}: {new_path} already exists")
        continue

    img_path.rename(new_path)

In [43]:
# Exploratory Data Analysis

import os
from PIL import Image, ImageMath
import pandas as pd
import cv2
import numpy as np

def _load_labelled_images(directory, filenames, label):
    """Load every file in `filenames` from `directory` as a labelled record.

    Args:
        directory: Folder that contains the image files.
        filenames: File names (relative to `directory`) to open.
        label: Integer class label stored with every image.

    Returns:
        A list of ``{"Image": np.ndarray, "Label": label}`` dicts, one per
        file, in the order of `filenames`.
    """
    records = []
    for name in filenames:
        # The context manager closes the underlying file handle as soon as
        # the pixel data has been copied into the array.
        with Image.open(os.path.join(directory, name)) as img:
            records.append({"Image": np.array(img), "Label": label})
    return records


# let yes = 1
# let no = 0

# os.listdir() returns entries in arbitrary order; sort so that the row
# order of `df` is reproducible between runs.
yes_files = sorted(os.listdir(yes_dir))
no_files = sorted(os.listdir(no_dir))

img_data = (
    _load_labelled_images(yes_dir, yes_files, 1)
    + _load_labelled_images(no_dir, no_files, 0)
)

df = pd.DataFrame(img_data)

# Only the last expression of a cell is displayed, so the null counts have
# to be printed explicitly to be visible.
print(df.isnull().sum())
df['Label'].value_counts(normalize=True)
# there is a class imbalance, with more yes than no
# there is a class imbalance, with more yes than no


Unnamed: 0_level_0,proportion
Label,Unnamed: 1_level_1
1,0.612648
0,0.387352


In [32]:
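# preprocessing: resize to 224 x 224, normalize to [0, 1], apply Gaussian blur
# NOTE: the target format is greyscale, 224 x 224, but preprocess() below does
# not convert to greyscale — colour inputs keep their channels.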

def preprocess(
    image: np.ndarray,
    size: tuple[int, int] = (224, 224),
    blur_kernel: tuple[int, int] = (3, 3),
) -> np.ndarray:
    """Resize an image, scale it by 1/255 and apply a light Gaussian blur.

    Args:
        image: Image array accepted by ``cv2.resize``. The division by 255
            assumes 8-bit pixel values — TODO confirm for all inputs.
        size: Target ``(width, height)`` handed to ``cv2.resize``.
        blur_kernel: Gaussian kernel ``(width, height)`` handed to
            ``cv2.GaussianBlur``; OpenCV requires positive, odd values.

    Returns:
        The processed image as a ``float32`` array.

    NOTE(review): no greyscale conversion happens here, so inputs with
    different channel counts yield outputs with different shapes — confirm
    whether callers convert beforehand.
    """
    # Resize first, while the data is still in its original dtype.
    resized = cv2.resize(image, size)
    # Scale to 0 - 1 (for 8-bit input).
    normalized = resized.astype(np.float32) / 255.0
    # Noise reduction; sigma 0 lets OpenCV derive it from the kernel size.
    blurred = cv2.GaussianBlur(normalized, blur_kernel, 0)

    return blurred

# preprocessing: resize, normalize, apply blur
# target format: greyscale, 224 x 224

# Before training, iterate through the df and run preprocess() on each image.

# Closes any OpenCV GUI windows; nothing in the visible code opens one, so
# this is purely defensive.
cv2.destroyAllWindows() # in case we called it

Preprocessed image at /kaggle/processed/yes/Y96.jpg
Preprocessed image at /kaggle/processed/yes/Y56.jpg
Preprocessed image at /kaggle/processed/yes/Y251.jpg
Preprocessed image at /kaggle/processed/yes/Y32.jpg
Preprocessed image at /kaggle/processed/yes/Y163.jpg
Preprocessed image at /kaggle/processed/yes/Y254.jpg
Preprocessed image at /kaggle/processed/yes/Y243.jpg
Preprocessed image at /kaggle/processed/yes/Y19.jpg
Preprocessed image at /kaggle/processed/yes/Y61.jpg
Preprocessed image at /kaggle/processed/yes/Y250.jpg
Preprocessed image at /kaggle/processed/yes/Y17.jpg
Preprocessed image at /kaggle/processed/yes/Y120.jpg
Preprocessed image at /kaggle/processed/yes/Y159.jpg
Preprocessed image at /kaggle/processed/yes/Y42.jpg
Preprocessed image at /kaggle/processed/yes/Y4.jpg
Preprocessed image at /kaggle/processed/yes/Y50.jpg
Preprocessed image at /kaggle/processed/yes/Y77.jpg
Preprocessed image at /kaggle/processed/yes/Y248.jpg
Preprocessed image at /kaggle/processed/yes/Y3.jpg
Prepro