In [1]:
import os
# Show the directory the kernel is currently running in.
%pwd
# Move one level up so the relative paths used by later cells are resolved
# from the parent folder (the recorded output shows the project folder).
# NOTE: this is a relative move and therefore not idempotent -- re-running
# this cell climbs one more directory each time. Restart the kernel before
# re-running.
os.chdir("../")
# Confirm the new working directory.
%pwd


'd:\\Data Science\\END to END Proj\\BloodCellClassification'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    and read by ``DataTransformation``. ``frozen=True`` makes attribute
    assignment after construction raise ``FrozenInstanceError``.
    """

    # Output directory of the stage: created by DataTransformation and used
    # as the destination of the train/val/test CSV files.
    root_dir: Path
    # One list of image directories per class. ConfigurationManager fills
    # each list with Path objects.
    EOSINOPHIL_dirs: list
    # NOTE: singular "_dir" (unlike its siblings); it mirrors the config key
    # of the same name, so renaming it requires changing the config as well.
    LYMPHOCYTE_dir: list
    MONOCYTE_dirs: list
    NEUTROPHIL_dirs: list
    # Passed as target_size=(img_height, img_width) to the image generators.
    img_height: int
    img_width: int
    # Batch size handed to every generator.
    batch_size: int
    # test_size of the first train_test_split (applied to the full dataframe).
    test_size: float
    # test_size of the second train_test_split, i.e. it is applied to the
    # rows that remain after the test split, not to the whole dataset.
    val_size: float
    # random_state used for both splits.
    seed: int

In [3]:
from src.BloodCellClassifier.constant import *
from src.BloodCellClassifier.utils.common import read_yaml,create_directories 

In [4]:
class ConfigurationManager:
    """Reads the project's YAML settings and exposes them as typed config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Assemble the settings of the data-transformation stage.

        Directory entries come from the ``data_transformation`` section of the
        config file and are converted to ``Path`` objects; the image size,
        batch size, split fractions and seed come from the params file.

        Returns:
            A populated ``DataTransformationConfig``.
        """
        section = self.config.data_transformation
        params = self.params

        def to_paths(entries):
            # Wrap every configured directory string in a Path.
            return list(map(Path, entries))

        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            EOSINOPHIL_dirs=to_paths(section.EOSINOPHIL_dirs),
            LYMPHOCYTE_dir=to_paths(section.LYMPHOCYTE_dir),
            MONOCYTE_dirs=to_paths(section.MONOCYTE_dirs),
            NEUTROPHIL_dirs=to_paths(section.NEUTROPHIL_dirs),
            img_height=params.IMG_HEIGHT,
            img_width=params.IMG_WIDTH,
            batch_size=params.BATCH_SIZE,
            test_size=params.TEST_SIZE,
            val_size=params.VAL_SIZE,
            seed=params.SEED,
        )

In [13]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import pandas as pd
import numpy as np
from BloodCellClassifier import logger

class DataTransformation:
    """Index the class image folders, split the index and build Keras generators.

    Typical use: ``create_dataframe`` -> ``split_data`` -> ``get_data_generators``.
    """

    # Extensions kept when scanning a class directory. Intended to match the
    # formats Keras' dataframe iterator accepts, so the dataframe (and any CSV
    # saved from it) lists the same files the generators will actually load.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.ppm', '.tif', '.tiff')

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage config and make sure its output directory exists.

        Args:
            config: Settings of the stage (directories, image size, batch
                size, split fractions, seed).
        """
        self.config = config
        # Alphabetical order, i.e. the same order Keras would infer from the
        # label column; passed explicitly to every generator below.
        self.class_labels = ['EOSINOPHIL', 'LYMPHOCYTE', 'MONOCYTE', 'NEUTROPHIL']
        os.makedirs(self.config.root_dir, exist_ok=True)

    def _list_image_files(self, dir_path):
        """Return the sorted paths of the regular image files directly inside ``dir_path``.

        ``os.listdir`` yields entries in arbitrary order; sorting makes the
        dataframe -- and therefore the seeded split -- reproducible across
        machines and filesystems. Sub-directories and files whose extension is
        not in ``IMAGE_EXTENSIONS`` (e.g. ``Thumbs.db``) are skipped.
        """
        candidates = (os.path.join(dir_path, name) for name in os.listdir(dir_path))
        return sorted(
            fpath for fpath in candidates
            if fpath.lower().endswith(self.IMAGE_EXTENSIONS) and os.path.isfile(fpath)
        )

    def create_dataframe(self):
        """Build a dataframe with one row per image file.

        Returns:
            pandas.DataFrame with the string columns ``filepaths`` and
            ``labels``; rows are grouped by class and sorted by path within
            each directory.

        Raises:
            Exception: any error raised while listing the directories (for
                example a missing folder) is logged and re-raised unchanged.
        """
        try:
            # Map directory lists to their class labels
            dir_class_map = {
                'EOSINOPHIL': self.config.EOSINOPHIL_dirs,
                'LYMPHOCYTE': self.config.LYMPHOCYTE_dir,
                'MONOCYTE': self.config.MONOCYTE_dirs,
                'NEUTROPHIL': self.config.NEUTROPHIL_dirs
            }

            records = [
                (fpath, class_name)
                for class_name, dir_list in dir_class_map.items()
                for dir_path in dir_list
                for fpath in self._list_image_files(dir_path)
            ]
            bloodCell_df = pd.DataFrame(records, columns=["filepaths", "labels"])

            logger.info(f"Created dataframe with {len(bloodCell_df)} samples")
            logger.info("Class distribution:\n" + str(bloodCell_df["labels"].value_counts()))

            return bloodCell_df

        except Exception as e:
            logger.error(f"Error creating dataframe: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def split_data(self, df):
        """Split ``df`` into train, validation and test dataframes.

        The test split takes ``config.test_size`` of all rows. The validation
        split then takes ``config.val_size`` of the *remaining* rows, so
        ``val_size`` is relative to the post-test training portion. Both
        splits use ``config.seed`` as ``random_state``.

        NOTE(review): the splits are not stratified by label; consider
        ``stratify=`` if the classes ever become imbalanced.

        Args:
            df: Dataframe produced by ``create_dataframe``.

        Returns:
            Tuple ``(train_set, val_set, test_images)``.

        Raises:
            Exception: errors from ``train_test_split`` are logged and
                re-raised unchanged.
        """
        try:
            train_images, test_images = train_test_split(
                df,
                test_size=self.config.test_size,
                random_state=self.config.seed
            )
            train_set, val_set = train_test_split(
                train_images,
                test_size=self.config.val_size,
                random_state=self.config.seed
            )

            logger.info(f"Train set size: {len(train_set)}")
            logger.info(f"Validation set size: {len(val_set)}")
            logger.info(f"Test set size: {len(test_images)}")

            return train_set, val_set, test_images

        except Exception as e:
            logger.error(f"Error splitting data: {e}")
            raise

    def _flow(self, image_gen, frame, shuffle=False):
        """Create one dataframe-backed generator with the settings shared by all splits."""
        return image_gen.flow_from_dataframe(
            dataframe=frame,
            x_col="filepaths",
            y_col="labels",
            target_size=(self.config.img_height, self.config.img_width),
            color_mode='rgb',
            # Fix the label -> index mapping so every split encodes classes
            # identically, even if a split happens to miss a class.
            classes=list(self.class_labels),
            class_mode="categorical",
            batch_size=self.config.batch_size,
            shuffle=shuffle,
            # Only seed when shuffling is requested; unshuffled generators
            # keep the library default (no seed), as before.
            seed=self.config.seed if shuffle else None
        )

    def get_data_generators(self, train_set, val_set, test_images, shuffle_train=False):
        """Wrap the three splits in Keras image generators.

        All generators apply MobileNetV2 ``preprocess_input`` and resize
        images to ``(config.img_height, config.img_width)``.

        Args:
            train_set: Training dataframe.
            val_set: Validation dataframe.
            test_images: Test dataframe.
            shuffle_train: When True, the training generator reshuffles its
                rows (seeded with ``config.seed``). Defaults to False, which
                keeps the previous fixed-order behaviour; validation and test
                generators are never shuffled.

        Returns:
            Tuple ``(train_gen, val_gen, test_gen)``.

        Raises:
            Exception: errors raised while building the generators are logged
                and re-raised unchanged.
        """
        try:
            image_gen = ImageDataGenerator(
                preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input
            )

            # Same creation order as before (train, test, validation) so the
            # "Found N validated image filenames" messages keep their order.
            train_gen = self._flow(image_gen, train_set, shuffle=shuffle_train)
            test_gen = self._flow(image_gen, test_images)
            val_gen = self._flow(image_gen, val_set)

            logger.info("Created data generators successfully")
            logger.info(f"Classes: {train_gen.class_indices}")

            return train_gen, val_gen, test_gen

        except Exception as e:
            logger.error(f"Error creating data generators: {e}")
            raise

In [15]:
try:
    # Build the stage from the YAML-backed configuration.
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)

    # Index every image, then carve out the train / validation / test splits.
    bloodCell_df = data_transformation.create_dataframe()
    train_set, val_set, test_images = data_transformation.split_data(bloodCell_df)

    # Wrap each split in a Keras generator.
    train_gen, val_gen, test_gen = data_transformation.get_data_generators(
        train_set, val_set, test_images
    )

    # Persist the splits next to the other stage artifacts; root_dir was
    # already created by the DataTransformation constructor.
    for csv_name, split_df in (
        ("train_set.csv", train_set),
        ("val_set.csv", val_set),
        ("test_images.csv", test_images),
    ):
        split_df.to_csv(
            os.path.join(data_transformation_config.root_dir, csv_name),
            index=False,
        )

except Exception as e:
    logger.exception(f"Error in data transformation pipeline: {e}")
    raise e

[2025-07-31 18:37:52,506: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-31 18:37:52,506: INFO: common: yaml file: params.yaml loaded successfully]
[2025-07-31 18:37:52,506: INFO: common: created directory at: artifacts]
[2025-07-31 18:37:52,583: INFO: 3887899035: Created dataframe with 9957 samples]
[2025-07-31 18:37:52,585: INFO: 3887899035: Class distribution:
labels
NEUTROPHIL    2499
EOSINOPHIL    2497
LYMPHOCYTE    2483
MONOCYTE      2478
Name: count, dtype: int64]
[2025-07-31 18:37:52,593: INFO: 3887899035: Train set size: 5575]
[2025-07-31 18:37:52,594: INFO: 3887899035: Validation set size: 1394]
[2025-07-31 18:37:52,594: INFO: 3887899035: Test set size: 2988]
Found 5575 validated image filenames belonging to 4 classes.
Found 2988 validated image filenames belonging to 4 classes.
Found 1394 validated image filenames belonging to 4 classes.
[2025-07-31 18:37:53,571: INFO: 3887899035: Created data generators successfully]
[2025-07-31 18:37:53,571: INFO