In [9]:
import os

In [10]:
# Switch to the project root so the relative paths used by later cells
# (config\config.yaml, params.yaml, artifacts/) resolve.
# The default is the original machine-specific location; set the
# ML_OPS_PROJECT_ROOT environment variable to run the notebook elsewhere
# without editing this cell.
os.chdir(
    os.environ.get(
        "ML_OPS_PROJECT_ROOT",
        "C:\\Users\\user\\Desktop\\BHS\\Coding\\3_ML_Ops_Pipeline_AWS",
    )
)

In [11]:
%pwd

'C:\\Users\\user\\Desktop\\BHS\\Coding\\3_ML_Ops_Pipeline_AWS'

In [12]:
# src\cnnClassifier\entity\config_entity.py
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    The path-like fields are annotated as ``Path``; the dataclass does not
    convert or validate them, so whatever the caller passes in is stored as is.
    """

    # Working directory of the ingestion stage.
    root_dir: Path
    # Location the dataset archive is downloaded from.
    source_URL: str
    # Path of the downloaded archive on disk.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path
    # How the extracted data should be laid out: "original" leaves it untouched,
    # "reformat" requests reorganization. Defaults to "original", the same
    # fallback used when the parameter is missing from the params file.
    storage_format: str = "original"

In [13]:
from cnnClassifier.constants import *
from cnnClassifier.utils.common import read_yaml, create_directories

In [14]:
# src\cnnClassifier\config\configuration.py
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: location of the main configuration YAML.
            params_filepath: location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root must exist before any stage writes into it.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the loaded configuration.

        Creates the stage's root directory as a side effect. The storage
        format is read from the parameters file and falls back to
        ``"original"`` when ``STORAGE_FORMAT`` is not set.

        Returns:
            A ``DataIngestionConfig`` populated from the ``data_ingestion``
            section of the configuration.
        """
        ingestion_section = self.config.data_ingestion
        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_URL=ingestion_section.source_URL,
            local_data_file=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
            storage_format=self.params.get("STORAGE_FORMAT", "original"),
        )
      

In [15]:
import os
import zipfile
import gdown
from cnnClassifier import logger
from cnnClassifier.utils.common import get_size
from pathlib import Path
import shutil

In [33]:
# src\cnnClassifier\components\data_ingestion.py
class DataIngestion:
    """Download, extract and optionally reorganize the dataset archive.

    All locations come from the config object passed in: ``source_URL`` is
    fetched into ``local_data_file``, extracted into ``unzip_dir`` and, when
    ``storage_format`` is ``"reformat"``, flattened into
    ``root_dir/<dataset name>/<class name>/``.
    """

    # Split folders expected inside the extracted dataset directory.
    _SPLIT_DIRS = ("train", "validation")

    def __init__(self, config: "DataIngestionConfig"):
        self.config = config

    def download_file(self) -> str:
        """
        Fetch the archive from ``source_URL`` into ``local_data_file``.

        Returns:
            The path the archive was written to, as a string.

        Raises:
            RuntimeError: if gdown signals a failed download by returning None.
        """
        dataset_url = self.config.source_URL
        # Hand gdown a plain string so that a pathlib.Path setting is also
        # treated as a destination path.
        zip_download_dir = str(self.config.local_data_file)

        # Create the configured directories rather than a hard-coded path.
        os.makedirs(self.config.root_dir, exist_ok=True)
        parent_dir = os.path.dirname(zip_download_dir)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        logger.info(f"Downloading data from {dataset_url} into file {zip_download_dir}")
        downloaded = gdown.download(dataset_url, zip_download_dir)
        if downloaded is None:
            raise RuntimeError(f"Failed to download data from {dataset_url}")
        logger.info(f"Downloaded data from {dataset_url} into file {zip_download_dir}")

        return zip_download_dir

    def extract_zip_file(self):
        """
        Extracts the downloaded zip file into the target directory.
        """
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(self.config.unzip_dir)
        logger.info(f"Extracted zip file to {self.config.unzip_dir}.")

    @staticmethod
    def _is_inside(path, parent) -> bool:
        """Return True when ``path`` is ``parent`` itself or located below it."""
        path_real = os.path.normcase(os.path.realpath(path))
        parent_real = os.path.normcase(os.path.realpath(parent))
        try:
            return os.path.commonpath([path_real, parent_real]) == parent_real
        except ValueError:
            # commonpath refuses paths that cannot share a prefix
            # (e.g. different Windows drives), so they are not nested.
            return False

    def _find_dataset_name(self, base_dir) -> str:
        """Pick the dataset folder inside ``base_dir``.

        Only directories are considered, in sorted order. A directory that
        contains every expected split folder wins; otherwise the first
        directory is returned so the caller can report which split is missing.
        """
        candidates = sorted(
            entry for entry in os.listdir(base_dir)
            if os.path.isdir(os.path.join(base_dir, entry))
        )
        if not candidates:
            raise ValueError(f"No dataset directory found in {base_dir}!")

        for entry in candidates:
            if all(
                os.path.isdir(os.path.join(base_dir, entry, split))
                for split in self._SPLIT_DIRS
            ):
                return entry
        return candidates[0]

    def organize_reformat(self):
        """
        Organizes extracted dataset into the correct flat structure:
        - <root_dir>/DATASET_NAME/class1/
        - <root_dir>/DATASET_NAME/class2/

        Files from the ``train`` and ``validation`` splits are merged per
        class. When both splits contain a file with the same name, the later
        one is stored with the split name as a prefix instead of overwriting
        the earlier one. The extraction directory is deleted afterwards.

        Raises:
            ValueError: if no dataset directory or an expected split directory
                is missing, or if the target directory lies inside the
                extraction directory (it would be deleted by the cleanup).
        """
        try:
            # Base directory (e.g., artifacts/data_ingestion/extracted)
            base_dir = self.config.unzip_dir

            dataset_name = self._find_dataset_name(base_dir)
            dataset_dir = os.path.join(base_dir, dataset_name)

            # Target directory for the reorganized structure
            target_dir = os.path.join(self.config.root_dir, dataset_name)

            # The cleanup below removes base_dir recursively; refuse to run if
            # that would also wipe the freshly reorganized data.
            if self._is_inside(target_dir, base_dir):
                raise ValueError(
                    f"Target directory {target_dir} lies inside {base_dir}, which is "
                    f"deleted after reorganizing; use a separate unzip_dir."
                )

            # Validate every split up front so a missing folder cannot leave
            # the dataset half-moved.
            for split in self._SPLIT_DIRS:
                sub_dir = os.path.join(dataset_dir, split)
                if not os.path.exists(sub_dir):
                    raise ValueError(f"Expected directory {sub_dir} does not exist!")

            os.makedirs(target_dir, exist_ok=True)

            # Destinations written during this run, used to detect files that
            # share a name across splits.
            used_destinations = set()

            for split in self._SPLIT_DIRS:
                sub_dir = os.path.join(dataset_dir, split)

                for class_name in os.listdir(sub_dir):
                    class_path = os.path.join(sub_dir, class_name)
                    if not os.path.isdir(class_path):
                        continue

                    class_target_dir = os.path.join(target_dir, class_name)
                    os.makedirs(class_target_dir, exist_ok=True)

                    for file_name in os.listdir(class_path):
                        dst_file_path = os.path.join(class_target_dir, file_name)
                        attempt = 0
                        while os.path.normcase(dst_file_path) in used_destinations:
                            attempt += 1
                            prefix = split if attempt == 1 else f"{split}_{attempt}"
                            dst_file_path = os.path.join(
                                class_target_dir, f"{prefix}_{file_name}"
                            )
                        used_destinations.add(os.path.normcase(dst_file_path))

                        shutil.move(os.path.join(class_path, file_name), dst_file_path)

            # Cleanup: Remove extracted directory and its contents
            shutil.rmtree(base_dir)
            logger.info(f"Deleted extracted directory {base_dir} and its contents.")
            logger.info(f"Dataset successfully reorganized to target structure at {target_dir}.")

        except Exception as e:
            logger.error(f"Error while reorganizing dataset: {e}")
            raise

    def organize(self):
        """
        Organizes the dataset based on the specified storage format.

        ``"reformat"`` runs :meth:`organize_reformat`; ``"original"`` leaves
        the extracted data untouched.

        Raises:
            ValueError: for any other storage format.
        """
        storage_format = self.config.storage_format
        if storage_format == "reformat":
            self.organize_reformat()
        elif storage_format == "original":
            logger.info("Dataset is already in the correct format. Skipping reformatting.")
        else:
            raise ValueError(f"Unsupported storage format: {storage_format}")



In [34]:
# src\cnnClassifier\pipeline\stage_01_data_ingestion.py
# Run the data-ingestion stage end to end: build the stage configuration,
# then download, extract and organize the dataset.
try:
    # Loads the YAML files with the default paths and creates the artifacts root.
    config = ConfigurationManager()
    # Creates the ingestion root directory and returns the stage settings.
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    # The three steps depend on each other and must run in this order.
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
    data_ingestion.organize()
except Exception as e:
    # Log the failure, then re-raise so the cell still stops with the error.
    logger.error(f"Error during data ingestion: {e}")
    raise e

[2024-11-27 18:15:11,401: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-11-27 18:15:11,415: INFO: common: yaml file: params.yaml loaded successfully]
[2024-11-27 18:15:11,416: INFO: common: created directory at: artifacts]
[2024-11-27 18:15:11,417: INFO: common: created directory at: artifacts/data_ingestion]
[2024-11-27 18:15:11,418: INFO: 634175990: Downloading data from https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip into file artifacts/data_ingestion/data.zip]


Downloading...
From: https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip
To: C:\Users\user\Desktop\BHS\Coding\3_ML_Ops_Pipeline_AWS\artifacts\data_ingestion\data.zip
100%|██████████| 68.6M/68.6M [00:01<00:00, 40.9MB/s]


[2024-11-27 18:15:16,405: INFO: 634175990: Downloaded data from https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip into file artifacts/data_ingestion/data.zip]
Extracted zip file to artifacts/data_ingestion/extracted.
Dataset is already in the correct format. Skipping reformatting.
