In [1]:
import os

In [2]:
%pwd

'd:\\project\\Predict-Lung-Disease\\research'

In [3]:
# Move from the notebook's folder (research/) up to the project root so that the
# relative paths used later (config file, artifacts/ directories) resolve.
# NOTE(review): not idempotent - re-running this cell climbs one more level up.
os.chdir("../")

In [4]:
%pwd

'd:\\project\\Predict-Lung-Disease'

In [10]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataSplitConfig:
    """Immutable settings for the train/validation/test split stage.

    Frozen to match DataIngestionConfig: a config object is built once by
    ConfigurationManager and should not be modified afterwards.

    Attributes:
        root_dir: Root directory of the data-split stage.
        train_dir: Destination directory for the training subset.
        val_dir: Destination directory for the validation subset.
        test_dir: Destination directory for the test subset.
        split_ratios: Ratios indexed as (train, val, test) by DataSplitter.
    """
    root_dir: Path
    train_dir: Path
    val_dir: Path
    test_dir: Path
    split_ratios: tuple

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    Attributes:
        root_dir: Root directory of the ingestion stage (created by
            ConfigurationManager.get_data_ingestion_config).
        source_URL: Location of the source data, kept as a plain string.
        local_data_file: Presumably the local path of the downloaded archive
            - TODO confirm against the ingestion component.
        unzip_dir: Directory that holds the extracted dataset; this notebook
            reads the "Lung X-Ray Image" folder beneath it.
    """
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

In [6]:
from src.cnnClassifier.constants import *
from src.cnnClassifier.utils.common import read_yaml, create_directories

In [11]:
class ConfigurationManager:
    """Read the project's YAML files and build typed per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Load the configuration and parameter YAML files.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings and create the stage's root directory.

        Returns:
            DataIngestionConfig whose path fields are ``Path`` objects, matching
            the dataclass annotations and get_data_split_config.
        """
        config = self.config.data_ingestion
        create_directories([config.root_dir])

        # Wrap path-like values in Path so the object honours its declared field
        # types; source_URL is a URL, not a filesystem path, and stays a string.
        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            source_URL=config.source_URL,
            local_data_file=Path(config.local_data_file),
            unzip_dir=Path(config.unzip_dir),
        )

    def get_data_split_config(self) -> DataSplitConfig:
        """Build the data-split settings and create its output directories.

        Returns:
            DataSplitConfig with ``Path`` directories and the split ratios as a tuple.
        """
        config = self.config.data_split
        create_directories([config.root_dir, config.train_dir, config.val_dir, config.test_dir])

        return DataSplitConfig(
            root_dir=Path(config.root_dir),
            train_dir=Path(config.train_dir),
            val_dir=Path(config.val_dir),
            test_dir=Path(config.test_dir),
            split_ratios=tuple(config.split_ratios),
        )

In [17]:
import math
import os
import shutil

from sklearn.model_selection import train_test_split

from src.cnnClassifier import logger

In [18]:
class DataSplitter:
    """Split a class-per-folder dataset into train/val/test folders by copying files.

    For every class sub-directory of the source directory, the files are divided
    according to ``config.split_ratios`` (train, val, test) and copied into
    ``<train_dir | val_dir | test_dir>/<class_name>``.
    """

    # Fixed seed so that the same (sorted) file list always yields the same split.
    _RANDOM_STATE = 42

    def __init__(self, config: DataSplitConfig):
        """Store the split configuration (output directories and ratios)."""
        self.config = config

    def split_data(self, source_dir):
        """Split every class folder under ``source_dir`` and copy the files.

        Classes that are too small to give each subset at least one file are
        skipped with a warning instead of aborting the whole run.

        Args:
            source_dir: Directory whose immediate sub-directories are the classes.

        Raises:
            ValueError: If ``split_ratios`` cannot yield three non-empty subsets
                (train ratio outside (0, 1), or a non-positive val/test ratio).
        """
        ratios = self.config.split_ratios
        train_ratio, val_ratio, test_ratio = ratios[0], ratios[1], ratios[2]
        # Fail before anything is copied rather than part-way through the classes.
        if not 0 < train_ratio < 1 or val_ratio <= 0 or test_ratio <= 0:
            raise ValueError(
                f"Invalid split_ratios {ratios!r}: expected 0 < train < 1 and val, test > 0."
            )

        destinations = (self.config.train_dir, self.config.val_dir, self.config.test_dir)

        # os.listdir order is platform-dependent; sorting keeps the seeded split
        # reproducible across machines and runs.
        for class_name in sorted(os.listdir(source_dir)):
            class_path = os.path.join(source_dir, class_name)
            if not os.path.isdir(class_path):  # only directories are classes
                continue

            # Keep regular files only: a nested directory cannot be copied with
            # shutil.copy and must not count towards the class size.
            files = sorted(
                entry
                for entry in os.listdir(class_path)
                if os.path.isfile(os.path.join(class_path, entry))
            )

            n_train, n_val, n_test = self._split_counts(
                len(files), train_ratio, val_ratio, test_ratio
            )
            if min(n_train, n_val, n_test) < 1:
                logger.warning(
                    f"Lớp {class_name} chỉ có {len(files)} file, "
                    "không đủ để chia train/val/test, bỏ qua chia dữ liệu."
                )
                continue

            # Two-step split: carve off the hold-out set, then divide it into
            # val and test. Integer sizes make the counts exact.
            train_files, holdout_files = train_test_split(
                files, test_size=n_val + n_test, random_state=self._RANDOM_STATE
            )
            val_files, test_files = train_test_split(
                holdout_files, test_size=n_test, random_state=self._RANDOM_STATE
            )

            # Copy each subset into <subset_dir>/<class_name>.
            for subset_dir, subset_files in zip(
                destinations, (train_files, val_files, test_files)
            ):
                self._copy_files(class_path, subset_files, os.path.join(subset_dir, class_name))

    @staticmethod
    def _split_counts(n_total, train_ratio, val_ratio, test_ratio):
        """Return ``(n_train, n_val, n_test)`` file counts for ``n_total`` files.

        The hold-out size is the ceiling of ``n_total * (1 - train_ratio)`` and the
        test size is the ceiling of its ``test / (val + test)`` share. Products are
        rounded first so float noise (``1 - 0.7 == 0.30000000000000004``) cannot
        push a ceiling one file too high.
        """
        n_holdout = math.ceil(round(n_total * (1 - train_ratio), 9))
        n_test = math.ceil(round(n_holdout * test_ratio / (val_ratio + test_ratio), 9))
        return n_total - n_holdout, n_holdout - n_test, n_test

    @staticmethod
    def _copy_files(source_class_dir, files, dest_class_dir):
        """Copy the named ``files`` from ``source_class_dir`` into ``dest_class_dir``.

        The destination directory is created when it does not exist yet.
        """
        os.makedirs(dest_class_dir, exist_ok=True)
        for file in files:
            src_path = os.path.join(source_class_dir, file)
            dest_path = os.path.join(dest_class_dir, file)
            shutil.copy(src_path, dest_path)

In [16]:
# Driver cell: build both stage configs, then split the extracted dataset.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_split_config = config.get_data_split_config()

    data_splitter = DataSplitter(config=data_split_config)

    # The class folders live in "Lung X-Ray Image" under the ingestion unzip_dir.
    source_dir = os.path.join(data_ingestion_config.unzip_dir, "Lung X-Ray Image")
    data_splitter.split_data(source_dir)
except Exception:
    # Record the failure through the project logger, then re-raise the original
    # exception unchanged (a bare raise adds no extra traceback frame).
    logger.exception("Data split stage failed")
    raise

[2024-11-17 10:48:01,723: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-11-17 10:48:01,726: INFO: common: yaml file: params.yaml loaded successfully]
[2024-11-17 10:48:01,728: INFO: common: creating directory at: artifacts/data_ingestion]
[2024-11-17 10:48:01,730: INFO: common: creating directory at: artifacts/data_split]
[2024-11-17 10:48:01,731: INFO: common: creating directory at: artifacts/data_split/train]
[2024-11-17 10:48:01,733: INFO: common: creating directory at: artifacts/data_split/val]
[2024-11-17 10:48:01,737: INFO: common: creating directory at: artifacts/data_split/test]
