In [9]:
import os
import shutil
import tempfile
import zipfile
from dataclasses import dataclass

import cv2
import gdown
import matplotlib.pyplot as plt

from src import logger
from src.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from src.utils.common import read_yaml


In [5]:
import os
# Step the kernel's working directory up one level so that the relative paths
# used by later cells resolve from the parent folder (presumably the project
# root - confirm against the repository layout).
# NOTE: the path is relative, so every re-execution of this cell climbs one
# more directory; run it exactly once per kernel session.
os.chdir("../")

In [10]:
@dataclass
class DataIngestionConfig:
    """Settings consumed by the data-ingestion step.

    ConfigurationManager.get_data_ingestion_config fills these fields from the
    ``data`` section of the loaded configuration, and DataIngestion reads them.
    """

    # URL handed to gdown.download as the file to fetch.
    google_drive_url: str
    # Local path where the downloaded archive is written.
    output_path: str
    # Directory the archive contents are extracted into.
    extract_to: str

class ConfigurationManager:
    """Load the project's configuration and parameter files and expose typed views of them."""

    def __init__(self):
        # Both locations come from src.constants; read_yaml is the project's
        # YAML loader. The configuration file is read before the parameters file.
        self.config = read_yaml(CONFIG_FILE_PATH)
        self.params = read_yaml(PARAMS_FILE_PATH)

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data`` section of the config.

        Returns:
            DataIngestionConfig: populated with ``google_drive_url``,
            ``output_path`` and ``extract_to`` taken from ``self.config.data``.
        """
        data_section = self.config.data
        return DataIngestionConfig(
            google_drive_url=data_section.google_drive_url,
            output_path=data_section.output_path,
            extract_to=data_section.extract_to,
        )
    
class DataIngestion:
    """Download a ZIP archive from Google Drive and unpack it into a directory.

    ``config`` must expose ``google_drive_url``, ``output_path`` and
    ``extract_to`` attributes (for example a ``DataIngestionConfig``).
    """

    def __init__(self, config):
        self.config = config

    def download_and_extract_zip(self):
        """
        Download the configured ZIP file, extract it, and remove the extra folder level.

        The method takes no arguments; everything is read from ``self.config``:
            google_drive_url: link passed to ``gdown.download``.
            output_path: local path where the downloaded ZIP file is saved.
            extract_to: directory that receives the extracted contents.

        Every top-level folder found in the archive is flattened: its children
        are moved directly into ``extract_to`` and the folder itself is dropped.
        Top-level files of the archive are placed in ``extract_to`` unchanged.

        The archive is first unpacked into a temporary staging folder inside
        ``extract_to``. Only entries that came from this archive are touched, so
        running the method again on an already-populated ``extract_to`` gives
        the same layout instead of flattening earlier results a second time.

        Failures are printed rather than propagated; the method returns None.
        """
        try:
            archive_path = self.config.output_path
            extract_to = self.config.extract_to

            # Step 1: Download the ZIP file. The parent folder is created first
            # because the download cannot write into a missing directory.
            archive_dir = os.path.dirname(archive_path)
            if archive_dir:
                os.makedirs(archive_dir, exist_ok=True)
            gdown.download(self.config.google_drive_url,
                           archive_path, quiet=False, fuzzy=True)
            print(f"Downloaded file saved to: {archive_path}")

            # Step 2: Check if the file is a ZIP file
            if not zipfile.is_zipfile(archive_path):
                raise zipfile.BadZipFile("Downloaded file is not a valid ZIP file.")

            # Step 3: Extract into a staging folder that lives inside the
            # target, so the final moves stay on the same filesystem.
            os.makedirs(extract_to, exist_ok=True)
            staging_dir = tempfile.mkdtemp(prefix=".ingest-", dir=extract_to)
            try:
                with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                    zip_ref.extractall(staging_dir)

                # Step 4: Promote the staged entries, dropping the extra
                # top-level folder(s). The plan is computed completely before
                # anything is moved, so a name clash leaves extract_to untouched.
                planned = self._plan_promotions(staging_dir)
                for name, source in planned.items():
                    self._move_into_place(source, os.path.join(extract_to, name))
            finally:
                # Always remove the staging folder, even after a failure.
                shutil.rmtree(staging_dir, ignore_errors=True)
            print(f"Extracted contents to: {extract_to}")

        except zipfile.BadZipFile as e:
            print(e)
        except Exception as e:
            print(f"An error occurred: {e}")

    def _plan_promotions(self, staging_dir):
        """Map each final entry name to the staged path that should end up there."""
        planned = {}
        for entry in sorted(os.listdir(staging_dir)):
            entry_path = os.path.join(staging_dir, entry)
            if os.path.isdir(entry_path):
                # Extra folder level: its children are promoted, not the folder.
                candidates = [(child, os.path.join(entry_path, child))
                              for child in sorted(os.listdir(entry_path))]
            else:
                candidates = [(entry, entry_path)]
            for name, source in candidates:
                if name in planned:
                    raise shutil.Error(
                        f"Archive entries collide after flattening: '{name}'")
                planned[name] = source
        return planned

    def _move_into_place(self, source, destination):
        """Move ``source`` to ``destination``, merging directories and replacing files."""
        if (os.path.isdir(source) and os.path.isdir(destination)
                and not os.path.islink(destination)):
            # Both sides are real directories: merge child by child so entries
            # that exist only in the destination are kept.
            for child in os.listdir(source):
                self._move_into_place(os.path.join(source, child),
                                      os.path.join(destination, child))
            os.rmdir(source)
            return
        if os.path.lexists(destination):
            # A stale entry of a different kind (or a plain file) is replaced.
            if os.path.isdir(destination) and not os.path.islink(destination):
                shutil.rmtree(destination)
            else:
                os.remove(destination)
        shutil.move(source, destination)

  



# Run the ingestion step end to end: load the YAML files, build the
# DataIngestionConfig, then download and unpack the archive.
# The names assigned here stay in the kernel's global namespace;
# `config_manager` is inspected again by a later cell.
if __name__ == "__main__":
    config_manager = ConfigurationManager()
    config_params = config_manager.get_data_ingestion_config()
    ob = DataIngestion(config_params)
    ob.download_and_extract_zip()




2025-01-13 17:35:19,706 - common - INFO - Yaml read successfully from config/config.yaml
2025-01-13 17:35:19,707 - common - INFO - Yaml read successfully from params.yaml
2025-01-13 17:35:19,708 - common - ERROR - Exception occured while reading yaml file from                         location: params.yaml
 First argument must be mapping or iterable
2025-01-13 17:35:19,760 - connectionpool - DEBUG - Starting new HTTPS connection (1): drive.google.com:443
2025-01-13 17:35:21,852 - connectionpool - DEBUG - https://drive.google.com:443 "GET /uc?id=10ApjOcTo6tjO34q5et-ij-tbrSm0Waqc HTTP/11" 200 None
2025-01-13 17:35:22,212 - connectionpool - DEBUG - https://drive.google.com:443 "GET /uc?id=10ApjOcTo6tjO34q5et-ij-tbrSm0Waqc&confirm=t&uuid=b92d345a-2ef1-4381-919b-3bebdaa383d8 HTTP/11" 303 0
2025-01-13 17:35:25,439 - connectionpool - DEBUG - Starting new HTTPS connection (1): doc-14-bg-docs.googleusercontent.com:443
2025-01-13 17:35:27,774 - connectionpool - DEBUG - https://doc-14-bg-docs.goog

Downloaded file saved to: data/raw/data.zip
Extracted contents to: data/interim


In [4]:
# Display the configuration object that ConfigurationManager loaded from
# CONFIG_FILE_PATH. Requires the cell that creates `config_manager` to have
# been executed first in this kernel session.
config_manager.config