### Stage 1: Data Ingestion

- [ ] Define Configuration for Interacting with Kaggle Public API
- [ ] Download Kaggle Dataset using Kaggle Credentials and Save Data to `data` directory
- [ ] Create a Pipeline that automates data ingestion for any publically available Kaggle Dataset


#### Configuration

In [1]:
import os
os.chdir('../')
print(f'Current Working Directory: {os.getcwd()}')

Current Working Directory: /mnt/e/Projects/DeepGlobeRoadExtraction


In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    # Kaggle Credentials from secrets.yaml
    username: str
    token: str
    # config.yaml
    download_dir: Path
    dataset_id: str

from DeepGlobeRoadExtraction import CONFIG_FILE_PATH, SECRETS_FILE_PATH
from DeepGlobeRoadExtraction.utils.common import read_yaml, create_directories

class ConfigurationManager:
    def __init__(self, config_filepath = CONFIG_FILE_PATH, secrets_filepath = SECRETS_FILE_PATH) -> None:
        self.config = read_yaml(config_filepath)
        self.secrets = read_yaml(secrets_filepath)
        create_directories([self.config.data_ingestion.download_dir])
    
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion
        secrets = self.secrets.kaggle
        cfg = DataIngestionConfig(
            download_dir=Path(config.download_dir),
            dataset_id=config.dataset_id,
            username=secrets.username,
            token=secrets.token
        )
        return cfg
    
cfg = ConfigurationManager().get_data_ingestion_config()
cfg

[2024-06-29 19:31:16,323: INFO: common: yaml file: config.yaml loaded successfully]
[2024-06-29 19:31:16,332: INFO: common: yaml file: secrets.yaml loaded successfully]
[2024-06-29 19:31:16,334: INFO: common: created directory at: data]


DataIngestionConfig(username='adityasharma47', token='7a55ff734d90892c9d530089ea61e0f7', download_dir=PosixPath('data'), dataset_id='balraj98/deepglobe-road-extraction-dataset')

#### Components

In [3]:
import os
import subprocess
import json
from DeepGlobeRoadExtraction import logger
from pathlib import Path

class DataIngestionComponents:
    def __init__(self, config: DataIngestionConfig) -> None:
        self.config = config

    def initialise_kaggle(self):
        logger.info(f'---------- Initialising Kaggle Account ----------')
        # Set Path for Kaggle Configration File
        KAGGLE_CONFIG_DIR = os.path.join(os.path.expandvars('$HOME'), '.kaggle')
        KAGGLE_CONFIG_FILE = os.path.join(KAGGLE_CONFIG_DIR, 'kaggle.json')
        
        # Check if kaggle.json already exists and is not empty
        if os.path.exists(KAGGLE_CONFIG_FILE) and os.path.getsize(KAGGLE_CONFIG_FILE) > 0:
            logger.warning(f'---> Kaggle Account Credentials Found ==> {KAGGLE_CONFIG_FILE}. Remove this file and re-initialse if API token is invalid or has expired.')
            return
        
        # Otherwise create .kaggle directory
        os.makedirs(KAGGLE_CONFIG_DIR, exist_ok=True)
        
        try:
            username = self.config.username
            token = self.config.token
            api_dict = {'username': username, 'key': token}
            
            # Create a kaggle.json file inside .kaggle folder and add your credentials
            with open(KAGGLE_CONFIG_FILE, "w", encoding="utf-8") as f:
                json.dump(api_dict, f)
            
            # Change File Permissions
            cmd = f"chmod 600 {KAGGLE_CONFIG_FILE}"
            output = subprocess.check_output(cmd.split(" "))
            output = output.decode(encoding="utf-8")
        except Exception as e:
            logger.error('Failed to Initialise Kaggle Account!')
            raise e
        
    # Download Kaggle Dataset
    def download_dataset(self):
        if Path(self.config.download_dir).is_dir() and 'metadata.csv' in os.listdir(self.config.download_dir):
            # If download directory exists and contains metadata.csv, skip download
            logger.info(f'---> Data directory already exists. Skipping download.')
            return
        
        from kaggle.api.kaggle_api_extended import KaggleApi
        logger.info(f'---------- Downloading Kaggle Dataset: {self.config.dataset_id} ----------')
        try:
            api = KaggleApi()
            api.authenticate()
            api.dataset_download_files(
                dataset=self.config.dataset_id,
                path=self.config.download_dir,
                unzip=True,
                force=False,
                quiet=True
            )
            logger.info('---> Download Complete!')
        except Exception as e:
            logger.error('Kaggle dataset download failed!')
            raise e

#### Pipeline

In [4]:
class DataIngestionPipeline:
    def __init__(self) -> None:
        pass
    
    def main(self):
        config = ConfigurationManager().get_data_ingestion_config()
        pipeline = DataIngestionComponents(config=config)
        pipeline.initialise_kaggle()
        pipeline.download_dataset()

In [5]:
DataIngestionPipeline().main()

[2024-06-29 19:33:40,293: INFO: common: yaml file: config.yaml loaded successfully]
[2024-06-29 19:33:40,296: INFO: common: yaml file: secrets.yaml loaded successfully]
[2024-06-29 19:33:40,298: INFO: common: created directory at: data]
[2024-06-29 19:33:40,298: INFO: 688282968: ---------- Initialising Kaggle Account ----------]
[2024-06-29 19:33:40,304: INFO: 688282968: ---> data already exists. Skipping download.]


### Stage 2: Data Prepparation

- [ ] Read original `metadata.csv` file,
- [ ] Split training images into training, test, and validation sets, and
- [ ] Saving the updated metadata information inside the `data` folder


In [1]:
import os
os.chdir('../')
print(f'Current Working Directory: {os.getcwd()}')

Current Working Directory: /mnt/e/Projects/DeepGlobeRoadExtraction


In [3]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass(frozen=True)
class DataPrepConfig:
    # params.yaml
    random_state: int
    train_val_test_split_ratio: List[float]
    # config.yaml
    data_dir: Path
    metadata_path: Path
    processed_metadata_path: Path

from DeepGlobeRoadExtraction import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from DeepGlobeRoadExtraction.utils.common import read_yaml

class ConfigurationManager:
    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH) -> None:
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

    
    def get_data_prep_config(self) -> DataPrepConfig:
        config = self.config.data_preparation
        params = self.params
        cfg = DataPrepConfig(
            random_state=params.random_state,
            train_val_test_split_ratio=params.train_val_test_split_ratio,
            data_dir=config.data_dir,
            metadata_path=config.metadata_path,
            processed_metadata_path=config.processed_metadata_path
        )
        return cfg
    
cfg = ConfigurationManager().get_data_prep_config()
cfg

[2024-06-29 17:32:52,223: INFO: common: yaml file: config.yaml loaded successfully]
[2024-06-29 17:32:52,226: INFO: common: yaml file: params.yaml loaded successfully]


DataPrepConfig(random_state=26, train_val_test_split_ratio=BoxList([0.8, 0.1, 0.1]), data_dir='data', metadata_path='data/metadata.csv', processed_metadata_path='data/processed_metadata.csv')

In [11]:
# Components
from DeepGlobeRoadExtraction import logger
import pandas as pd
import os
from sklearn.model_selection import train_test_split

class DataPrepComponents:
    def __init__(self, config: DataPrepConfig) -> None:
        self.config = config
    
    def load_metadata(self) -> pd.DataFrame:
        logger.info(f'---------- Loading Metadata: {self.config.metadata_path} ----------')
        try:
            metadata_df = pd.read_csv(self.config.metadata_path)
            metadata_df = metadata_df[metadata_df['split'] == 'train']
            metadata_df = metadata_df[['image_id', 'sat_image_path', 'mask_path']] # Select Columns
            metadata_df['sat_image_path'] = metadata_df['sat_image_path'].apply(lambda img_path: os.path.join(self.config.data_dir, img_path)) # Update image paths
            metadata_df['mask_path'] = metadata_df['mask_path'].apply(lambda img_path: os.path.join(self.config.data_dir, img_path)) # Update mask paths
            self.metadata_df = metadata_df
        except Exception as e:
            logger.error(f'Failed to load Metadata: {self.config.metadata_path}')
            logger.exception(e)
            
    def split_dataset(self):
        logger.info(f'---------- Splitting Training Data into Training, Testing, and Validation Sets ----------')
        try:
            metadata_df = self.metadata_df.sample(frac=1).reset_index(drop=True) # Shuffle DataFrame
            train_df, valid_df = train_test_split(metadata_df, train_size=self.config.train_val_test_split_ratio[0], random_state=self.config.random_state) # Split into training and val + test (combined)
            valid_df, test_df = train_test_split(valid_df, train_size=self.config.train_val_test_split_ratio[1]/(self.config.train_val_test_split_ratio[1] + self.config.train_val_test_split_ratio[2]), random_state=self.config.random_state) # Split val + test combined into val and test sets
            train_df['group'] = 'train'
            test_df['group'] = 'test'
            valid_df['group'] = 'val'
            # Concatenate DataFrames
            self.metadata = pd.concat([train_df, test_df, valid_df]) # Processed Metadata
            # Export as CSV
            self.metadata.to_csv(self.config.processed_metadata_path, index=False)
            del train_df, test_df, valid_df, metadata_df 
        except Exception as e:
            logger.info("Failed to Split Training Data")
            logger.exception(e)

In [12]:
# Pipeline
config = ConfigurationManager().get_data_prep_config()
pipeline = DataPrepComponents(config=config)
pipeline.load_metadata()
pipeline.split_dataset()


[2024-06-29 17:54:22,970: INFO: common: yaml file: config.yaml loaded successfully]
[2024-06-29 17:54:22,973: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-29 17:54:22,975: INFO: 4206599812: ---------- Loading Metadata: data/metadata.csv ----------]
[2024-06-29 17:54:22,997: INFO: 4206599812: ---------- Splitting Training Data into Training, Testing, and Validation Sets ----------]
