In [1]:
import os

In [2]:
os.chdir("../")

In [None]:
%pwd

# First thing we do is setup our config.yaml which holds information about where our data will be stored

In [None]:
# Root directory for all artifacts
artifacts_root: artifacts

# Configuration related to data ingestion
data_ingestion:

  # Directory where data ingestion artifacts are stored
  root_dir: artifacts/data_ingestion

  # Path to the local file where the data is already saved
  local_data_file: /Users/macbookpro/Documents/Documents - Macbook’s MacBook Pro/thesis/thesis/data/gsearch_jobs.csv

# Define Entity for Data Types

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Configuration for data ingestion process.
    
    Attributes:
    - root_dir: Directory where data ingestion artifacts are stored.
    - local_data_file: Path to the local file where the data is already saved.
    """
    root_dir: Path
    local_data_file: Path


# Constants

In [4]:
from pathlib import Path

# Path to main configuration file
CONFIG_FILE_PATH = Path('config/config.yaml')

# Path to parameters file
PARAMS_FILE_PATH = Path('params.yaml')

# Path to scheme file
SCHEMA_FILE_PATH = Path('schema.yaml')

# Src Configuration Manager

In [5]:
from src.career_chief.constants import *
from src.career_chief.utils.common import read_yaml, create_directories
from src.career_chief import logger
from src.career_chief.entity.config_entity import (DataIngestionConfig)

class ConfigurationManager:
    """
    ConfigurationManager manages configurations needed for the data pipeline.

    The class reads configuration, parameter, and schema settings from specified files
    and provides a set of methods to access these settings. It also takes care of
    creating necessary directories defined in the configurations.

    Attributes:
    - config (dict): Configuration settings.
    - params (dict): Parameters for the pipeline.
    - schema (dict): Schema information.
    """
    
    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH, 
                 params_filepath = PARAMS_FILE_PATH, 
                 schema_filepath = SCHEMA_FILE_PATH) -> None:
        """
        Initialize ConfigurationManager with configurations, parameters, and schema.

        Args:
        - config_filepath (Path): Path to the configuration file.
        - params_filepath (Path): Path to the parameters file.
        - schema_filepath (Path): Path to the schema file.

        Creates:
        - Directories specified in the configuration.
        """
        self.config = self._read_config_file(config_filepath, "config")
        self.params = self._read_config_file(params_filepath, "params")
        self.schema = self._read_config_file(schema_filepath, "initial_schema")

        # Create the directory for storing artifacts if it doesn't exist
        create_directories([self.config.artifacts_root])

    def _read_config_file(self, filepath: str, config_name: str) -> dict:
        """
        Read a configuration file and return its content.

        Args:
        - filepath (str): Path to the configuration file.
        - config_name (str): Name of the configuration (for logging purposes).

        Returns:
        - dict: Configuration settings.

        Raises:
        - Exception: If there's an error reading the file.
        """
        try:
            return read_yaml(filepath)
        except Exception as e:
            logger.error(f"Error reading {config_name} file: {filepath}. Error: {e}")
            raise

    
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Extract and return data ingestion configurations as a DataIngestionConfig object.

        This method fetches settings related to data ingestion, like directories and file paths,
        and returns them as a DataIngestionConfig object.

        Returns:
        - DataIngestionConfig: Object containing data ingestion configuration settings.

        Raises:
        - AttributeError: If the 'data_ingestion' attribute does not exist in the config file.
        """
        try:
            config = self.config.data_ingestion
            # Create the root directory for data ingestion if it doesn't already exist
            create_directories([config.root_dir])
            
            return DataIngestionConfig(
                root_dir=Path(config.root_dir),
                local_data_file=Path(config.local_data_file),
            )

        except AttributeError as e:
            logger.error("The 'data_ingestion' attribute does not exist in the config file.")
            raise e

# Setup Components for Data Ingestion

In [6]:
import os
import shutil
import pandas as pd

from src.career_chief import logger
from src.career_chief.utils.common import get_size
from src.career_chief.entity.config_entity import DataIngestionConfig

class DataIngestion:
    """
    DataIngestion handles the process of transferring data from a local directory 
    to the project's official artifact directories.

    The class currently assumes that the data is already present locally, 
    and focuses on transferring this data to the specified directory.

    Attributes:
    - config (DataIngestionConfig): Configuration settings for data ingestion.
    """

    def __init__(self, config: DataIngestionConfig):
        """
        Initialize the DataIngestion component.

        Args:
        - config (DataIngestionConfig): Configuration settings for data ingestion.
        """
        self.config = config

    def download_data(self):
        """ 
        Placeholder for downloading data functionality. 
        Currently, data is assumed to be locally available.
        """
        pass

    def extract_zip_file(self):
        """
        Placeholder for extracting zip files. 
        If the data comes as a zip file, this method can be used to extract it.
        """
        pass

    def read_data_file(self, file_name: str = "gsearch_jobs.csv") -> pd.DataFrame:
        """
        Read the specified jobs data file into a pandas DataFrame, defaulting to 'gsearch_jobs.csv'.

        Args:
        - file_name (str, optional): The name of the file to be read. Defaults to "gsearch_jobs.csv".

        Returns:
        - df (pd.DataFrame): DataFrame containing the jobs data.

        Raises:
        - FileNotFoundError: If the specified data file does not exist in the artifact directory.
        """
        # Construct the path to the data file in the artifact directory
        artifact_data_path = Path(self.config.root_dir) / file_name
        
        # Check if the artifact data file exists
        if not artifact_data_path.exists():
            logger.error(f"Artifact data file not found at {artifact_data_path}.")
            raise FileNotFoundError(f"No file found at {artifact_data_path}")
        
        # Read the data file into a pandas DataFrame
        df = pd.read_csv(artifact_data_path)
        
        logger.info(f"Data file '{file_name}' read into DataFrame. Shape: {df.shape}.")
        return df

    def transfer_data(self) -> None:
        """
        Transfer the data from the local directory to the project's artifact directory.

        This method ensures that the artifact directory exists, and then transfers 
        the data file to this directory.

        Raises:
        - FileNotFoundError: If the local data file does not exist.
        """
        root_dir = Path(self.config.root_dir)
        local_data_path = Path(self.config.local_data_file)
        
        # Check if the local data file exists
        if not local_data_path.exists():
            logger.error(f"Local data file not found at {local_data_path}.")
            raise FileNotFoundError(f"No file found at {local_data_path}")

        # Get the file size using the utility function
        file_size = get_size(local_data_path)

        # Ensure the transfer directory exists
        os.makedirs(root_dir, exist_ok=True)

        # Transfer the file
        shutil.copy2(local_data_path, root_dir)
        logger.info(f"Data transferred from {local_data_path} to {root_dir}. File size: {file_size}.")

# Build a Pipeline

In [10]:
from IPython.display import display


In [14]:
# from src.career_chief.config.configuration import ConfigurationManager
# from src.career_chief.components.data_ingestion import DataIngestion

from src.career_chief import logger

class DataIngestionPipeline:

    STAGE_NAME = "Data Ingestion Stage"

    def __init__(self):
        self.config_manager = ConfigurationManager()

    def run_data_ingestion(self):
        """
        Main method to run the data ingestion process.
        """
        try:
            logger.info("Fetching data ingestion configuration...")
            data_ingestion_config = self.config_manager.get_data_ingestion_config()
            
            logger.info("Initializing data ingestion process...")
            data_ingestion = DataIngestion(config=data_ingestion_config)
            
            logger.info(f"Copying training data from {data_ingestion_config.local_data_file} to {data_ingestion_config.root_dir}...")
            data_ingestion.transfer_data()
            
        except Exception as e:
            logger.exception("An error occurred during the data ingestion process.")
            raise e
        
    def show_data(self):
        """
        Main method to run the data ingestion process and display the top 10 records of the dataset in a nicely formatted manner using IPython's display in Jupyter notebooks.
        """
        try:
            logger.info("Fetching data ingestion configuration...")
            data_ingestion_config = self.config_manager.get_data_ingestion_config()
            
            logger.info("Initializing data ingestion process...")
            data_ingestion = DataIngestion(config=data_ingestion_config)
            
            logger.info(f"Reading training data from {data_ingestion_config.root_dir}")
            df = data_ingestion.read_data_file()  # Capture the returned DataFrame
            
            # Use IPython's display to show the top 10 records of the DataFrame
            logger.info("Displaying the top 10 records of the dataset:")
            display(df)
            
        except Exception as e:
            logger.exception("An error occurred during the data ingestion process.")
            raise e
        
    def run_pipeline(self):
        """
        Run the data ingestion training pipeline.
        """
        try:
            # logger.info(f">>>>>> Stage: {DataIngestionPipeline.STAGE_NAME} started <<<<<<")
            # self.run_data_ingestion()
            # logger.info(f">>>>>> Stage {DataIngestionPipeline.STAGE_NAME} completed <<<<<< \n\nx==========x")
            logger.info(">>>>>> Reading & Displaying Data <<<<<< \n\nx==========x")
            self.show_data()

        except Exception as e:
            # No need to log the exception here since it's already logged in the run_data_ingestion method.
            raise e

if __name__ == '__main__':
    pipeline = DataIngestionPipeline()
    pipeline.run_pipeline()




[2024-03-04 12:12:10,315: 41: career_chief_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2024-03-04 12:12:10,316: 41: career_chief_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2024-03-04 12:12:10,318: 41: career_chief_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2024-03-04 12:12:10,319: 64: career_chief_logger: INFO: common:  Created directory at: artifacts]
[2024-03-04 12:12:10,320: 61: career_chief_logger: INFO: 222083905:  >>>>>> Reading & Displaying Data <<<<<< 

[2024-03-04 12:12:10,321: 36: career_chief_logger: INFO: 222083905:  Fetching data ingestion configuration...]
[2024-03-04 12:12:10,321: 64: career_chief_logger: INFO: common:  Created directory at: artifacts/data_ingestion]
[2024-03-04 12:12:10,322: 39: career_chief_logger: INFO: 222083905:  Initializing data ingestion process...]
[2024-03-04 12:12:10,322: 42: career_chief_logger: INFO: 222083905:  Reading training data from artifacts/data_ingestio

Unnamed: 0.1,Unnamed: 0,index,title,company_name,location,via,description,extensions,job_id,thumbnail,...,commute_time,salary_pay,salary_rate,salary_avg,salary_min,salary_max,salary_hourly,salary_yearly,salary_standardized,description_tokens
0,0,0,Data Analyst,Meta,Anywhere,via LinkedIn,In the intersection of compliance and analytic...,"['15 hours ago', '101K–143K a year', 'Work fro...",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,101K–143K,a year,122000.0,101000.0,143000.0,,122000.0,122000.0,"['sql', 'r', 'python', 'tableau']"
1,1,1,Data Analyst,ATC,United States,via LinkedIn,Job Title: Entry Level Business Analyst / Prod...,"['12 hours ago', 'Full-time', 'Health insurance']",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QiLCJodGlkb2...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,,,,,,,,,[]
2,2,2,Aeronautical Data Analyst,"Garmin International, Inc.","Olathe, KS",via Indeed,Overview:\n\nWe are seeking a full-time...\nAe...,"['18 hours ago', 'Full-time']",eyJqb2JfdGl0bGUiOiJBZXJvbmF1dGljYWwgRGF0YSBBbm...,,...,,,,,,,,,,['sql']
3,3,3,Data Analyst - Consumer Goods - Contract to Hire,Upwork,Anywhere,via Upwork,Enthusiastic Data Analyst for processing sales...,"['12 hours ago', '15–25 an hour', 'Work from h...",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QgLSBDb25zdW...,,...,,15–25,an hour,20.0,15.0,25.0,20.0,,41600.0,"['excel', 'power_bi', 'powerpoint']"
4,4,4,Data Analyst | Workforce Management,Krispy Kreme,United States,via LinkedIn,Overview of Position\n\nThis position will be ...,"['7 hours ago', '90K–110K a year', 'Contractor']",eyJqb2JfdGl0bGUiOiJEYXRhIEFuYWx5c3QgfCBXb3JrZm...,https://encrypted-tbn0.gstatic.com/images?q=tb...,...,,90K–110K,a year,100000.0,90000.0,110000.0,,100000.0,100000.0,"['word', 'excel', 'outlook', 'powerpoint']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37957,37957,600,Marketing Data & BI Analyst II,EDWARD JONES,"Houstonia, MO",via My ArkLaMiss Jobs,"At Edward Jones, we help clients achieve their...","['23 hours ago', '76,798–130,764 a year', 'Ful...",eyJqb2JfdGl0bGUiOiJNYXJrZXRpbmcgRGF0YSBcdTAwMj...,,...,,76798–130764,a year,103781.0,76798.0,130764.0,,103781.0,103781.0,"['python', 'snowflake', 'tableau', 'excel', 'p..."
37958,37958,601,Lead-Data Analyst,EDWARD JONES,"Marshfield, MO",via My ArkLaMiss Jobs,"At Edward Jones, we help clients achieve their...","['23 hours ago', '106,916–182,047 a year', 'Fu...",eyJqb2JfdGl0bGUiOiJMZWFkLURhdGEgQW5hbHlzdCIsIm...,,...,,106916–182047,a year,144481.5,106916.0,182047.0,,144481.5,144481.5,[]
37959,37959,602,Lead-Data Analyst,EDWARD JONES,"High Point, MO",via My ArkLaMiss Jobs,"At Edward Jones, we help clients achieve their...","['23 hours ago', '106,916–182,047 a year', 'Fu...",eyJqb2JfdGl0bGUiOiJMZWFkLURhdGEgQW5hbHlzdCIsIm...,,...,,106916–182047,a year,144481.5,106916.0,182047.0,,144481.5,144481.5,[]
37960,37960,603,Lead-Data Analyst,EDWARD JONES,"Calhoun, MO",via My ArkLaMiss Jobs,"At Edward Jones, we help clients achieve their...","['23 hours ago', '106,916–182,047 a year', 'Fu...",eyJqb2JfdGl0bGUiOiJMZWFkLURhdGEgQW5hbHlzdCIsIm...,,...,,106916–182047,a year,144481.5,106916.0,182047.0,,144481.5,144481.5,[]
