In [1]:
import os
%pwd
os.chdir("../")
%pwd

'd:\\Data Science\\END to END Proj\\KidneyDiseaseMLOPS'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataPreprocessingConfig:
    root_dir: Path
    processed_data_file: Path
    image_size: int


In [3]:
from KidneyClassifier.utils.common import read_yaml, create_directories
from pathlib import Path
from KidneyClassifier.constant import*
class ConfigurationManager:
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        config = self.config.data_preprocessing
        create_directories([config.root_dir])

        return DataPreprocessingConfig(
            root_dir=Path(config.root_dir),
            processed_data_file=Path(config.processed_data_file),
            image_size=config.image_size
        )


In [4]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from tqdm import tqdm
from KidneyClassifier.utils.common import*

# Define class names explicitly here
CLASS_NAMES = ['Cyst', 'Normal', 'Stone', 'Tumor']

class DataPreprocessing:
    def __init__(self, config, data_dir: str):
        self.config = config
        self.data_dir = data_dir
        create_directories([self.config.root_dir])

    def load_and_preprocess_images(self):
        images = []
        labels = []

        for idx, class_name in enumerate(CLASS_NAMES):
            class_path = os.path.join(self.data_dir, class_name)
            if not os.path.exists(class_path):
                raise FileNotFoundError(f"{class_path} not found")

            for img_name in tqdm(os.listdir(class_path), desc=f"Loading {class_name}"):
                img_path = os.path.join(class_path, img_name)
                img = cv2.imread(img_path)
                if img is None:
                    continue
                img = cv2.resize(img, (self.config.image_size, self.config.image_size))
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img = img / 255.0
                images.append(img)
                labels.append(idx)

        return np.array(images), np.array(labels)

    def apply_smote_and_split(self, X, y):
        X_flat = X.reshape(len(X), -1)
        smote = SMOTE()
        X_resampled, y_resampled = smote.fit_resample(X_flat, y)
        X_resampled = X_resampled.reshape(-1, self.config.image_size, self.config.image_size, 3)

        X_train, X_test, y_train, y_test = train_test_split(
            X_resampled, y_resampled, test_size=0.2, random_state=42
        )

        np.savez_compressed(self.config.processed_data_file, 
                            X_train=X_train, X_test=X_test, 
                            y_train=y_train, y_test=y_test)
        return X_train, X_test, y_train, y_test


In [6]:
from KidneyClassifier.config.configuration import ConfigurationManager

STAGE_NAME = "Data Preprocessing"

def main():
    config = ConfigurationManager()
    preprocessing_config = config.get_data_preprocessing_config()
    
    # Path to folder containing subfolders for each class
    raw_data_path = Path(config.get_data_ingestion_config().unzip_dir) / "CT-KIDNEY-DATASET"

    data_preprocessor = DataPreprocessing(preprocessing_config, data_dir=str(raw_data_path))
    X, y = data_preprocessor.load_and_preprocess_images()
    X_train, X_test, y_train, y_test = data_preprocessor.apply_smote_and_split(X, y)
    print(f"Preprocessing complete. Train shape: {X_train.shape}, Test shape: {X_test.shape}")

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        raise e


[2025-07-24 20:31:08,348: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-24 20:31:08,348: INFO: common: yaml file: params.yaml loaded successfully]
[2025-07-24 20:31:08,348: INFO: common: created directory at: artifacts]
[2025-07-24 20:31:08,363: INFO: common: created directory at: artifacts/data_preprocessing]
[2025-07-24 20:31:08,363: INFO: common: created directory at: artifacts/data_ingestion]
[2025-07-24 20:31:08,372: INFO: common: created directory at: artifacts\data_preprocessing]


Loading Cyst: 100%|██████████| 3709/3709 [01:25<00:00, 43.36it/s]
Loading Normal: 100%|██████████| 5077/5077 [01:53<00:00, 44.78it/s]
Loading Stone: 100%|██████████| 1377/1377 [00:31<00:00, 44.21it/s]
Loading Tumor: 100%|██████████| 2283/2283 [00:47<00:00, 48.14it/s]


Preprocessing complete. Train shape: (16246, 28, 28, 3), Test shape: (4062, 28, 28, 3)
