In [None]:
import os
from dotenv import load_dotenv
import kaggle
import BreastCancerClassifier.constants
from BreastCancerClassifier.utils.common import read_yaml, create_directories

# Load environment variables from a file .env
load_dotenv()  # Be sure to set the .env file beforehand

# Get the path to the Kaggle API configuration directory from the .env file
kaggle_config_dir = os.getenv("KAGGLE_CONFIG_DIR")

if kaggle_config_dir:
    # Setting the environment variable for the Kaggle API configuration directory
    os.environ['KAGGLE_CONFIG_DIR'] = kaggle_config_dir
    print(f'Path to Kaggle API configuration directory: {kaggle_config_dir}')
else:
    raise ValueError("The KAGGLE_CONFIG_DIR variable is not set in the .env file")

def download_kaggle_dataset(dataset, download_path='.'):
    try:
        
        kaggle.api.authenticate()
        kaggle.api.dataset_download_files(dataset, path=download_path, unzip=True)
        print(f'Dataset {dataset} has been successfully downloaded and saved in: {download_path}')
    except Exception as e:
        print(f'Error while downloading: {e}')

# Function to check if the dataset has already been downloaded
def is_dataset_downloaded(download_path):
    if os.path.exists(download_path) and len(os.listdir(download_path)) > 0:
        print("Dataset is already downloaded.")
        return True
    else:
        print("Dataset is not downloaded. Proceeding to download.")
        return False

# Definition of the dataset and download path
data_set = 'ronanpickell/b200c-lego-classification-dataset'
download_path = 'data/lego-dataset'  # Path must comply with Linux/WSL

# Download dataset if not already downloaded
if not is_dataset_downloaded(download_path):
    api.dataset_download_files(dataset, path=download_path, unzip=True)
    print("Dataset downloaded successfully.")


In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mohamedhanyyy/chest-ctscan-images")

print("Path to dataset files:", path)

In [None]:
configuration.username = 'your_user_name'
configuration.password = 'your_password'
KAGGLE_CONFIG_DIR='/mnt/Your_Path/.kaggle' # For Ubuntu virtual enviroment
# KAGGLE_CONFIG_DIR='/Your_Path/.kaggle' # for Windows virtual enviroment 

In [None]:
from pathlib import Path

SCRIPT_DIR = Path().resolve()  # Resolves to the current working directory
CONFIG_FILE_PATH = SCRIPT_DIR / "config" / "config.yaml"
PARAMS_FILE_PATH = SCRIPT_DIR / "params.yaml"

print(SCRIPT_DIR)  # See what the current working directory is
print(CONFIG_FILE_PATH)


In [1]:
from pathlib import Path
import os
from box import ConfigBox  # For using Box configurations 

CONFIG_FILE_PATH = Path("config/config.yaml") # set constant path to config.yaml
PARAMS_FILE_PATH = Path("params.yaml") # set constant path to params.yaml

In [None]:
from dataclasses import dataclass
from pathlib import Path
# Create a configuration manager for data ingestion
from BreastCancerClassifier.utils.common import read_yaml, create_directories


@dataclass(frozen=True)
class DataIngestionConfig:
    """Data Ingestion configuration."""
    root_dir: Path
    kaggle_source: str


In [12]:
config_filepath = CONFIG_FILE_PATH
config = read_yaml(config_filepath)
type(config)


[2025-02-25 13:11:10,090]: INFO: common: read_yaml: yaml file: config/config.yaml loaded successfully!


box.config_box.ConfigBox

In [None]:
from pathlib import Path
from BreastCancerClassifier.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Manages project configurations, including data ingestion settings."""

    def __init__(
        self,
        config_filepath:  Path = CONFIG_FILE_PATH, # Path to configuration file - importet from constants
        params_filepath:  Path = PARAMS_FILE_PATH  # Path to parameters file - importet from constants
    ) -> None:
        """
        Initializes the ConfigurationManager.

        Args:
            config_filepath: Path to the configuration YAML file.
            params_filepath: Path to the parameters YAML file.
        """


        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        #Check if artifacts_root exist inside self.config
        if 'artifacts_root' not in self.config:
          raise KeyError("'artifacts_root' key not found in the configuration file.")

        create_directories([self.config.artifacts_root]) #Use dictionary acces to avoid future errors.

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Retrieves and constructs the data ingestion configuration.

        Returns:
            A DataIngestionConfig object.

        Raises:
            KeyError: If necessary keys are missing from the configuration.
        """

        #Error Handling - Check if 'data_ingestion' exist before accessing.
        if 'data_ingestion' not in self.config:
            raise KeyError("'data_ingestion' key not found in the configuration file.")

        config = self.config.data_ingestion

        # Error Handling: Check if necessary keys are present within 'data_ingestion'
        required_keys = ['root_dir', 'kaggle_source']
        for key in required_keys:
            if key not in config:
                raise KeyError(f"'{key}' key not found in 'data_ingestion' section of the configuration file.")

        #Type validation
        if not isinstance(config['root_dir'], str):
             raise TypeError("'root_dir' in 'data_ingestion' must be a string.")
        if not isinstance(config['kaggle_source'], str):
             raise TypeError("'kaggle_source' in 'data_ingestion' must be a string.")

        create_directories([config.root_dir])

        #Adding default parameters.
        data_ingestion_config = DataIngestionConfig(
            root_dir=Path(config.root_dir),
            kaggle_source=config.kaggle_source,
        )

        return data_ingestion_config


In [4]:
os.chdir("../")
%pwd

'/mnt/c/Users/DataScience/Documents/GitHub/Chest_Cancer_Classification_MLOps'

In [16]:
import os
import dotenv
import kaggle
from BreastCancerClassifier import logger
from BreastCancerClassifier.utils.common import get_size


class DataIngestion:
    """
    Handles the downloading of datasets for the Breast Cancer Classifier project,
    including validations, logging, and data size calculations.
    """

    def __init__(self, config):
        """
        Initialization with configuration provided.

        :param config: Configuration object for data ingestion.
        """
        self.config = config

    def is_dataset_downloaded(self) -> bool:
        """
        Check if the dataset has already been downloaded.

        :return: True if dataset exists and is non-empty, False otherwise.
        """
        if os.path.exists(self.config.root_dir) and len(os.listdir(self.config.root_dir)) > 0:
            logger.info("Dataset is already downloaded.")
            return True
        else:
            logger.info("Dataset is not downloaded. Proceeding to download.")
            return False

    def validate_kaggle_credentials(self) -> None:
        """
        Validate Kaggle API credentials. Logs an error and raises an exception if credentials are missing.
        """
        dotenv.load_dotenv()

        kaggle_username = os.getenv("KAGGLE_USERNAME")
        kaggle_key = os.getenv("KAGGLE_KEY")

        if not kaggle_username or not kaggle_key:
            error_message = """
            Kaggle API credentials not found. Ensure the following environment variables are set:
              - KAGGLE_USERNAME
              - KAGGLE_KEY
            Add your credentials to a `.env` file or export them as environment variables.
            """
            logger.error(error_message)
            raise EnvironmentError(error_message)

        logger.info("Kaggle API credentials imported successfully.")

    def validate_config(self) -> None:
        """
        Validate the essential configurations from the config file.
        """
        if not self.config.kaggle_source:
            error_message = "The 'kaggle_source' variable is missing in the configuration file."
            logger.error(error_message)
            raise ValueError(error_message)

        if not os.path.exists(self.config.root_dir):
            logger.info(f"Root directory '{self.config.root_dir}' does not exist. Creating it.")
            os.makedirs(self.config.root_dir, exist_ok=True)

    def download_file(self) -> None:
        """
        Handles dataset download from Kaggle and ensures it is available locally.
        """
        logger.info("Starting dataset download process...")
        self.validate_config()

        # Check if dataset is already downloaded
        if self.is_dataset_downloaded():
            dataset_size = get_size(Path(self.config.root_dir))
            logger.info(f"Dataset already downloaded. Size: {dataset_size} kB")
            return

        # Validate Kaggle API credentials
        self.validate_kaggle_credentials()

        # Attempt to download the dataset
        try:
            logger.info(f"Downloading dataset from Kaggle: {self.config.kaggle_source}")
            kaggle.api.authenticate()
            kaggle.api.dataset_download_files(
                dataset=self.config.kaggle_source,
                path=self.config.root_dir,
                unzip=True
            )
            logger.info(f"Dataset '{self.config.kaggle_source}' has been successfully downloaded to '{self.config.root_dir}'.")
        except Exception as e:
            logger.error(f"Error occurred while downloading the dataset: {e}")
            raise

        # Verify download
        if self.is_dataset_downloaded():
            dataset_size = get_size(Path(self.config.root_dir))
            logger.info(f"Dataset download complete. Size: {dataset_size} kB.")
        else:
            error_message = f"Dataset download failed. The directory '{self.config.root_dir}' is empty."
            logger.error(error_message)
            raise RuntimeError(error_message)


In [8]:
data_ingestion_config.root_dir

'artifacts/data_ingestion/.kaggle'

In [14]:
get_size(Path(""))

'file size: ~ 4 KB'

In [18]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
except Exception as e:
    raise e

[2025-02-25 13:22:07,734]: INFO: common: read_yaml: yaml file: config/config.yaml loaded successfully!
<class 'box.config_box.ConfigBox'>
<class 'str'>
artifacts
[2025-02-25 13:22:07,736]: INFO: common: create_directories: Created directory at: artifacts
[2025-02-25 13:22:07,740]: INFO: common: create_directories: Created directory at: artifacts/data_ingestion/.kaggle
[2025-02-25 13:22:07,741]: INFO: 3773441251: download_file: Starting dataset download process...
[2025-02-25 13:22:07,746]: INFO: 3773441251: is_dataset_downloaded: Dataset is already downloaded.
[2025-02-25 13:22:07,748]: INFO: 3773441251: download_file: Dataset already downloaded. Size: file size: ~ 4 KB kB


In [31]:
from textwrap import dedent
STAGE_NAME = "Data Ingestion stage"


def start_stage_logger(stage_name: str, length: int = 40, symbol :str = "#") -> str:
    stage_start = dedent(f"""\
        {length * symbol}
        {stage_name.upper().center(lengthl)} 
        {length * symbol}
        """)
    return stage_start

print(start_stage_logger(STAGE_NAME))

NameError: name 'lengthl' is not defined

In [73]:
" abc ".join("efg")

'e abc f abc g'

In [None]:
from textwrap import dedent

STAGE_NAME = "Data Ingestion stage"


def start_stage_logger(stage_name: str, length: int = 40, symbol: str = "#") -> str:

    # ANSI escape codes for color
    GREEN = '\033[92m'  # Green

    stage_name_start = "".join([STAGE_NAME, " started"])
    
    stage_start = dedent(f"""\
        {GREEN}{length * symbol}
        {stage_name_start.upper().center(length," ")}
        {length * symbol}\
        """)
    return stage_start

print(start_stage_logger(STAGE_NAME))


[92m########################################
      DATA INGESTION STAGE STARTED      
########################################        
 Data Ingestion stagesData Ingestion stagetData Ingestion stageaData Ingestion stagerData Ingestion stagetData Ingestion stageeData Ingestion staged
