In [1]:
!pip install sklearn



In [2]:
from collections import namedtuple
import os
from sklearn.model_selection import StratifiedShuffleSplit
# Step up to the project root exactly once. A bare os.chdir("../") climbs one
# more level every time this cell is re-run, silently breaking relative paths;
# remembering the resolved root makes the cell safe to execute repeatedly.
if "NOTEBOOK_PROJECT_ROOT" not in globals():
    NOTEBOOK_PROJECT_ROOT = os.path.abspath("../")
os.chdir(NOTEBOOK_PROJECT_ROOT)

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Display the current working directory (sanity check after the chdir above).
os.getcwd()

'd:\\projects\\credit_default_ml_project'

In [None]:
# Immutable record of the locations used by the data-ingestion stage.
# NOTE(review): a later cell defines a frozen dataclass with the same name and
# the same fields; when the cells run in order that definition shadows this one.
DataIngestionConfig = namedtuple(
    "DataIngestionConfig",
    "root_dir source_URL raw_data_dir local_data_file "
    "kaggle_file_path unzip_dir ingested_train_dir ingested_test_dir",
)

In [None]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Settings consumed by the data-ingestion stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    The annotations are not enforced at runtime; whatever the caller passes is
    stored as-is.
    """
    # Root folder of the ingestion stage (created by ConfigurationManager).
    root_dir: Path
    # Dataset URL; DataIngestion.load_kaggle splits it on "/" and uses the
    # trailing segments as the Kaggle dataset id and file name.
    source_URL: str
    # Download target handed to the Kaggle API.
    raw_data_dir: Path
    # Archive that DataIngestion.unzip opens with ZipFile.
    local_data_file: Path
    # JSON file from which the Kaggle `username` and `key` are read.
    kaggle_file_path: Path
    # Folder into which the archive is extracted.
    unzip_dir: Path
    # Output location for the training split.
    ingested_train_dir: Path
    # Output location for the test split.
    ingested_test_dir: Path

In [None]:
from Credit_Default.constants import *
from Credit_Default.utils import *

In [None]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage configuration objects.

    The loaded documents are kept on ``self.config`` and ``self.params`` and are
    accessed by attribute (e.g. ``self.config.artifacts_root``), so ``read_yaml``
    is presumably returning an attribute-accessible mapping — TODO confirm.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion folders and return the ingestion settings.

        Returns:
            A ``DataIngestionConfig`` populated from the ``data_ingestion``
            section of the configuration file. Every filesystem location is
            wrapped in ``Path`` so the values match the types the config class
            declares; ``source_URL`` stays a plain string.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir,
                            section.raw_data_dir,
                            section.unzip_dir,
                            section.ingested_train_dir,
                            section.ingested_test_dir])

        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_URL=section.source_URL,
            raw_data_dir=Path(section.raw_data_dir),
            local_data_file=Path(section.local_data_file),
            kaggle_file_path=Path(section.kaggle_file_path),
            unzip_dir=Path(section.unzip_dir),
            ingested_train_dir=Path(section.ingested_train_dir),
            ingested_test_dir=Path(section.ingested_test_dir)
        )

In [None]:
import os
import urllib.request as request
from pathlib import Path
from zipfile import ZipFile

import numpy as np
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi


class DataIngestion:
    """Downloads the dataset from Kaggle, extracts it and writes a stratified split."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings for use by the individual steps."""
        self.config = config

    def load_kaggle(self) -> None:
        """Authenticate against Kaggle and download the configured file.

        Credentials are read from the JSON file at ``config.kaggle_file_path``
        and exported through the ``KAGGLE_USERNAME`` / ``KAGGLE_KEY``
        environment variables before authenticating. The last path segment of
        ``config.source_URL`` is used as the file name and the two segments
        before it as the dataset id; the download goes to
        ``config.raw_data_dir``.
        """
        kaggle = load_json(Path(self.config.kaggle_file_path))
        os.environ['KAGGLE_USERNAME'] = kaggle.username
        os.environ['KAGGLE_KEY'] = kaggle.key
        url = self.config.source_URL.split('/')
        api = KaggleApi()
        api.authenticate()
        api.dataset_download_file("/".join(url[-3:-1]), url[-1], self.config.raw_data_dir)

    def unzip(self) -> None:
        """Extract ``config.local_data_file`` into ``config.unzip_dir``."""
        with ZipFile(file=self.config.local_data_file, mode="r") as zf:
            zf.extractall(self.config.unzip_dir)

    def basic(self) -> None:
        """Split the extracted CSV into train/test sets stratified on ``LIMIT_BAL``.

        The first file (in sorted order) found in ``config.unzip_dir`` is read
        as CSV. Rows are bucketed into four ``LIMIT_BAL`` bands and an 80/20
        stratified shuffle split (``random_state=42``) is taken on those bands.
        The two subsets are written, without the helper band column, to
        ``config.ingested_train_dir`` and ``config.ingested_test_dir`` under
        the same file name as the source CSV.

        Raises:
            FileNotFoundError: If ``config.unzip_dir`` contains no files.
        """
        # Sort so the chosen file does not depend on os.listdir's arbitrary order.
        extracted = sorted(os.listdir(self.config.unzip_dir))
        if not extracted:
            raise FileNotFoundError(
                f"No extracted data file found in {self.config.unzip_dir}"
            )
        file_name = extracted[0]
        df = pd.read_csv(os.path.join(self.config.unzip_dir, file_name))

        # pd.cut intervals are right-closed, so using the column minimum as the
        # first edge would leave the minimum-valued rows without a band (NaN),
        # which breaks stratification. An open lower bound keeps every row.
        max1 = df['LIMIT_BAL'].max()
        limit_bal_cat = pd.cut(
            df["LIMIT_BAL"],
            bins=[-np.inf, 0.2*max1, 0.4*max1, 0.6*max1, np.inf],
            labels=[1, 2, 3, 4],
        )

        split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        train_index, test_index = next(split.split(df, limit_bal_cat))

        # The splitter yields positional indices, hence iloc rather than loc.
        strat_train_set = df.iloc[train_index]
        strat_test_set = df.iloc[test_index]

        # The configured targets are directories, so each split is written to a
        # file inside its directory; the test split gets its own file instead of
        # the training split being written twice.
        for target_dir, subset in (
            (self.config.ingested_train_dir, strat_train_set),
            (self.config.ingested_test_dir, strat_test_set),
        ):
            os.makedirs(target_dir, exist_ok=True)
            subset.to_csv(os.path.join(target_dir, file_name), index=False)


ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Run the ingestion stage end to end: build the config, then download,
# extract and split. Any failure is propagated unchanged to the notebook.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    for step in (
        data_ingestion.load_kaggle,
        data_ingestion.unzip,
        data_ingestion.basic,
    ):
        step()
except Exception:
    raise