In [3]:
import os

In [4]:
# Step up one directory so the relative paths used further down
# (e.g. configs/config.yaml, artifacts/...) resolve — presumably the notebook
# lives one level below the project root; verify before running elsewhere.
# NOTE: not idempotent — re-running this cell moves up one more level each time.
os.chdir("../")

In [13]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the data-ingestion step.

    The dataclass is frozen, so attributes cannot be reassigned after
    construction.
    """
    # URL the dataset archive is downloaded from.
    download_url: str
    # Local path the downloaded archive is written to and later opened as a zip.
    local_data_file: Path
    # Directory the archive contents are extracted into.
    unzip_dir: Path

In [14]:
from pathlib import Path
from textSummarizer.constants import CONFIG_FILE, PARAM_FILE
from textSummarizer.utils import read_yaml


class ConfigManager:
    """Load the project's YAML settings and hand out typed config objects.

    Both files are parsed with ``read_yaml``; the parsed config is expected to
    support attribute-style access (``config.artifact_root``,
    ``config.data_ingestion``).
    """

    def __init__(self, config_file: Path = CONFIG_FILE, param_file: Path = PARAM_FILE):
        """Parse both YAML files and make sure the artifact root directory exists.

        Args:
            config_file: Path of the main configuration YAML.
            param_file: Path of the parameters YAML.
        """
        self.config = read_yaml(config_file)
        self.param = read_yaml(param_file)

        # Every later stage writes below this directory, so create it up front.
        artifact_root = Path(self.config.artifact_root)
        artifact_root.mkdir(parents=True, exist_ok=True)

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build a ``DataIngestionConfig`` from the ``data_ingestion`` section.

        Returns:
            A config whose two path fields have been wrapped in ``Path``.
        """
        section = self.config.data_ingestion

        source_url = section.download_url
        archive_path = Path(section.local_data_file)
        extract_dir = Path(section.unzip_dir)

        return DataIngestionConfig(
            download_url=source_url,
            local_data_file=archive_path,
            unzip_dir=extract_dir,
        )

In [20]:
import os
import urllib.request as request
import zipfile
from textSummarizer.logging import logger

class DataIngestion:
    """Download the dataset archive described by a config and extract it.

    The config object must expose ``download_url``, ``local_data_file`` and
    ``unzip_dir``; the two path attributes are used as ``pathlib.Path``
    objects.
    """

    # The annotation is quoted so defining this class does not require the
    # config class to be in scope at definition time.
    def __init__(self, config: "DataIngestionConfig"):
        self.config = config

    def download_data(self):
        """Fetch ``download_url`` into ``local_data_file`` unless it already exists.

        The payload is written to a sibling ``<name>.part`` file first and only
        renamed to ``local_data_file`` once the transfer has finished. An
        interrupted download therefore never leaves a truncated archive at the
        final path, which the "exists already" check would otherwise accept on
        the next run.

        Raises:
            Any exception raised by ``urllib.request.urlretrieve`` (or by the
            rename); the partial file is removed before the exception
            propagates.
        """
        target = self.config.local_data_file
        target.parent.mkdir(parents=True, exist_ok=True)

        if target.exists():
            logger.info(f"{target} exists already!")
            return

        partial = target.with_name(target.name + ".part")
        try:
            request.urlretrieve(url=self.config.download_url, filename=partial)
            # Same-directory rename: the final path appears only when complete.
            partial.replace(target)
        except BaseException:
            # BaseException on purpose: interrupting the notebook cell
            # (KeyboardInterrupt) must clean up the partial file as well.
            if partial.exists():
                partial.unlink()
            raise

        logger.info(f"{target} downloaded!")

    def unzip_data(self):
        """Extract every member of ``local_data_file`` into ``unzip_dir``.

        The destination directory is created if it is missing.

        Raises:
            FileNotFoundError: If the archive has not been downloaded yet.
            zipfile.BadZipFile: If the file is not a valid zip archive.
        """
        self.config.unzip_dir.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(self.config.unzip_dir)
            logger.info(f"unzip {self.config.unzip_dir}")

In [21]:
# Run the ingestion stage end to end using the default config/param files.
config_manager = ConfigManager()
data_ingestion_config = config_manager.get_data_ingestion_config()
data_ingestion = DataIngestion(data_ingestion_config)
# Download is skipped when the archive is already present locally.
data_ingestion.download_data()
# Extraction runs on every execution, even when the download was skipped.
data_ingestion.unzip_data()

2024-02-29 16:53:26,666 - INFO - __init__ - Reading configs/config.yaml ......
2024-02-29 16:53:26,737 - INFO - __init__ - Reading params/param.yaml ......
2024-02-29 16:53:26,742 - INFO - 3826109236 - artifacts/data_ingestion/data.zip exists already!
2024-02-29 16:53:27,163 - INFO - 3826109236 - unzip artifacts/data_ingestion
