In [1]:
import os

# Step up to the project root, but only once: a plain relative chdir climbs one
# more level every time this cell is re-run. The root is recognised by its `src`
# folder, which a later cell appends to sys.path relative to the working directory.
if not os.path.isdir("src"):
    os.chdir("../")

%pwd

'd:\\work\\cifar100classifier-deep-learning'

In [4]:
import sys

# Register the project's `src` folder as an absolute path and only once, so that
# re-running this cell does not pile up duplicates and a later change of working
# directory cannot invalidate the entry.
_src_dir = os.path.abspath('src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

%pwd

'd:\\work\\cifar100classifier-deep-learning'

In [5]:
from pathlib import Path
from dataclasses import dataclass

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the data-ingestion stage.

    In this notebook every field is filled by
    ``ConfigurationManager.get_data_ingestion_config`` from the ``data_ingestion``
    section of the loaded YAML config.

    NOTE(review): the values are passed through from ``read_yaml`` without any
    conversion, so the runtime types may not match the ``Path`` annotations --
    confirm against ``read_yaml``.
    """
    # Working directory of the stage; it is created before this object is built
    # and the metadata CSV is written inside it.
    root_dir: Path
    # URL the dataset archive is downloaded from.
    source_url: str
    # Where the downloaded archive is stored; the download is skipped when this
    # path already exists.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path
    # Pickled training-set file, joined onto ``unzip_dir`` before loading.
    trainset_file: Path
    # Pickled file holding the fine/coarse label names, joined onto ``unzip_dir``.
    meta_file: Path
    # File name of the per-image metadata CSV, joined onto ``root_dir``.
    metadata: str

In [6]:
from CNNClassifier.constants import *
from CNNClassifier.utils.utilities import read_yaml, create_directory

In [6]:
class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: location of the main config YAML.
            params_filepath: location of the params YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        artifacts_root = self.config.artifacts_root
        create_directory([artifacts_root])

    def get_data_ingestion_config(self):
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        The stage's ``root_dir`` is created first; the remaining values are
        copied over unchanged.

        Returns:
            DataIngestionConfig: settings for the data-ingestion stage.
        """
        section = self.config.data_ingestion
        create_directory([section.root_dir])

        # Fields are copied one-to-one, in the order the dataclass declares them.
        field_names = (
            "root_dir",
            "source_url",
            "local_data_file",
            "unzip_dir",
            "trainset_file",
            "meta_file",
            "metadata",
        )
        values = {name: getattr(section, name) for name in field_names}
        return DataIngestionConfig(**values)


In [7]:
import os
import requests
import wget
import tarfile
import pickle
import pandas as pd

from collections import defaultdict
from CNNClassifier import logger
from CNNClassifier.entity import DataIngestionConfig
from CNNClassifier.utils.utilities import *


class DataIngestion:
    """Download the dataset archive, extract it and summarise the training set.

    All paths and the source URL are read from the config object given to the
    constructor (``source_url``, ``local_data_file``, ``unzip_dir``,
    ``trainset_file``, ``meta_file``, ``root_dir`` and ``metadata``).
    """

    # Bytes written per chunk while streaming the download to disk.
    _CHUNK_SIZE = 1 << 20
    # Seconds `requests` waits on the server before the attempt is abandoned.
    _REQUEST_TIMEOUT = 30
    # Layout of one flattened image row: channels first, then height and width.
    _IMAGE_SHAPE = (3, 32, 32)

    # The annotation is quoted so the class can be defined even when the entity
    # class has not been imported yet.
    def __init__(self, config: "DataIngestionConfig"):
        """Store the stage configuration.

        Args:
            config: object exposing the data-ingestion settings as attributes.
        """
        self.config = config

    def download_file(self):
        """Download the archive to ``config.local_data_file`` unless it exists.

        The download is first attempted with ``requests``; if that attempt fails
        for any reason (HTTP error status included) it is retried once with
        ``wget``. A failure of the ``wget`` retry propagates to the caller, so
        the success message is only logged when a file was actually written.
        """
        outfile_path = self.config.local_data_file
        if os.path.exists(outfile_path):
            logger.info(f"File already exists of the size: {get_size(Path(outfile_path))}")
            return

        logger.info("trying to download file")
        download_url = self.config.source_url
        try:
            self._download_with_requests(download_url, outfile_path)
        except Exception as err:
            # Deliberate best effort: any failure of the first attempt triggers
            # the fallback. `Exception` (not a bare except) keeps Ctrl-C working.
            logger.warning(f"requests download failed ({err!r}); retrying with wget")
            wget.download(download_url, out=outfile_path)
        logger.info(f"Downloaded {outfile_path} file successfully!")

    def _download_with_requests(self, url, destination):
        """Stream ``url`` into ``destination`` without loading it into memory."""
        # Write to a side file first so an interrupted download never leaves a
        # truncated archive that the next run would mistake for a finished one.
        partial_path = f"{destination}.part"
        try:
            with requests.get(url, stream=True, timeout=self._REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                with open(partial_path, 'wb') as fh:
                    for chunk in response.iter_content(chunk_size=self._CHUNK_SIZE):
                        fh.write(chunk)
            os.replace(partial_path, destination)
        finally:
            # Only present when something went wrong before the rename.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def unzip_targzfile(self):
        """Extract ``config.local_data_file`` into ``config.unzip_dir``.

        The member names are logged before extraction.

        Raises:
            tarfile.TarError: if a member would be written outside the
                destination directory.
        """
        targzfile = self.config.local_data_file
        outfolder = self.config.unzip_dir
        with tarfile.open(targzfile) as archive:
            logger.info(archive.getnames())
            # The archive comes from the network, so never extract it blindly.
            if hasattr(tarfile, "data_filter"):
                archive.extractall(outfolder, filter="data")
            else:
                # Older interpreters have no extraction filters; this fallback
                # only validates member paths (link targets are not inspected).
                self._reject_members_outside(archive, outfolder)
                archive.extractall(outfolder)

    @staticmethod
    def _reject_members_outside(archive, outfolder):
        """Raise ``tarfile.TarError`` if any member path escapes ``outfolder``."""
        root = os.path.realpath(outfolder)
        root_prefix = os.path.join(root, "")
        for member in archive.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            if not os.path.join(target, "").startswith(root_prefix):
                raise tarfile.TarError(
                    f"refusing to extract {member.name!r} outside {root!r}"
                )

    @staticmethod
    def unpickle(file):
        """Load and return the object pickled in ``file`` (``latin1`` encoding).

        NOTE(security): ``pickle.load`` can execute arbitrary code, so this must
        only be used on archives obtained from a trusted source.
        """
        with open(file, 'rb') as fo:
            payload = pickle.load(fo, encoding='latin1')
        return payload

    def get_metadata(self):
        """Write one CSV row of metadata per training image.

        Reads the pickled training set and label-name files from
        ``config.unzip_dir`` and writes the table to ``config.metadata`` inside
        ``config.root_dir``. Columns: file name, fine/coarse label ids and names,
        image height/width/channel count, and the min/max pixel value.

        Raises:
            ValueError: if the file-name or label lists do not have one entry
                per image.
        """
        train_set = self.unpickle(os.path.join(self.config.unzip_dir, self.config.trainset_file))
        meta_data = self.unpickle(os.path.join(self.config.unzip_dir, self.config.meta_file))

        fine_labels = train_set['fine_labels']
        coarse_labels = train_set['coarse_labels']
        fine_names = meta_data['fine_label_names']
        coarse_names = meta_data['coarse_label_names']

        images = train_set['data']
        n_images = len(images)
        # Each row is unflattened channel-first and then moved to
        # (image, height, width, channel) order.
        images = images.reshape(n_images, *self._IMAGE_SHAPE).transpose(0, 2, 3, 1)
        _, height, width, channels = images.shape

        # Build whole columns at once instead of appending row by row.
        per_image_axes = (1, 2, 3)
        df = pd.DataFrame({
            'file_name': list(train_set['filenames']),
            'fine_labels': list(fine_labels),
            'coarse_labels': list(coarse_labels),
            'fine_label_names': [fine_names[label] for label in fine_labels],
            'coarse_label_names': [coarse_names[label] for label in coarse_labels],
            'image_height': [height] * n_images,
            'image_width': [width] * n_images,
            'image_channel': [channels] * n_images,
            'min_pixel': images.min(axis=per_image_axes),
            'max_pixel': images.max(axis=per_image_axes),
        })
        # save the result for EDA analysis
        df.to_csv(os.path.join(self.config.root_dir, self.config.metadata), index=False)


In [8]:
# Run the ingestion stage end to end: build the config, fetch the archive,
# extract it, then write the per-image metadata CSV.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.unzip_targzfile()
    data_ingestion.get_metadata()
except Exception:
    # Record the failure in the project log, then re-raise with a bare `raise`
    # so the original traceback is kept as-is.
    logger.exception("data ingestion stage failed")
    raise
    

[2023-10-20 14:18:45,209: INFO: utilities]: yaml file: configs\config.yaml loaded successfully
[2023-10-20 14:18:45,217: INFO: utilities]: yaml file: params.yaml loaded successfully
[2023-10-20 14:18:45,220: INFO: utilities]: create directory at: artifacts
[2023-10-20 14:18:45,222: INFO: utilities]: create directory at: artifacts/data_ingestion
[2023-10-20 14:18:45,224: INFO: 1076953943]: trying to download file
[2023-10-20 14:19:47,224: INFO: 1076953943]: Downloaded artifacts/data_ingestion/cifar-100-python.tar.gz file successfully!
[2023-10-20 14:19:48,899: INFO: 1076953943]: ['cifar-100-python', 'cifar-100-python/file.txt~', 'cifar-100-python/train', 'cifar-100-python/test', 'cifar-100-python/meta']
