In [None]:
# %% [1] required libraries
import os
import sys

import numpy as np


In [None]:
# Make the local `source` package importable.
# `from source.<module> import ...` needs the directory that CONTAINS `source`
# (the project root, "..") on sys.path; "../source" alone only exposes the
# package's contents. The "../source" entry is kept for backward compatibility
# (NOTE(review): presumably some modules import siblings by bare name - confirm).
# The membership check keeps re-runs of this cell from appending duplicates.
for _rel_path in ("..", "../source"):
    _abs_path = os.path.abspath(_rel_path)
    if _abs_path not in sys.path:
        sys.path.append(_abs_path)

In [None]:
# Local project imports
from source.utils.config_loader import load_config
from source.utils.logger import setup_logger
from source.data_preprocessing import (
    load_data,
    fill_missing_values,
    scale_features,
    visualize_missing_values,
    correlation_analysis,
    detect_outliers,
    basic_info,
)


In [None]:
# Paths
# Default locations, written relative to the current working directory.
# RAW_DIR, PROCESSED_DIR and PLOTS_DIR are re-assigned from the settings file
# in a later cell; LOG_DIR is not, so the value below is the one actually used.
RAW_DIR = "../data/raw"
PROCESSED_DIR = "../data/processed"
PLOTS_DIR = "../plots/"
LOG_DIR = "../logs/"

In [None]:
# Logger setup
# Create the log directory before the logger is built: a file-based log handler
# cannot open its file inside a directory that does not exist yet.
# NOTE(review): assumes setup_logger does not create the directory itself - confirm.
os.makedirs(LOG_DIR, exist_ok=True)
logger = setup_logger(
    name="data_preprocessing",
    log_file=os.path.join(LOG_DIR, "data_preprocessing.log"),
    log_level="INFO",
)

In [None]:
# Config file path
CONFIG_PATH = "../config/settings.yml"
config = load_config(CONFIG_PATH)

# Override the default locations with the values from the "paths" section of
# the settings file. A key that is absent keeps the value assigned earlier in
# the "Paths" cell instead of raising KeyError, matching the fallback that
# "plots_dir" already had.
paths_cfg = config["paths"]
RAW_DIR = paths_cfg.get("raw_dir", RAW_DIR)
PROCESSED_DIR = paths_cfg.get("processed_dir", PROCESSED_DIR)
PLOTS_DIR = paths_cfg.get("plots_dir", "../plots")


In [None]:
# Ensure the plot and log folders are present; existing folders are left untouched.
for _out_dir in (PLOTS_DIR, LOG_DIR):
    os.makedirs(_out_dir, exist_ok=True)
logger.info("Notebook initialized.")

In [None]:
# %% Data Loading
# The input CSV is read from PROCESSED_DIR.
file_path = os.path.join(PROCESSED_DIR, "epa_long_preprocessed.csv")

try:
    df = load_data(file_path)
except Exception as load_err:
    # Record the failure in the log, then let the original exception propagate.
    logger.error(f"Error loading data: {load_err}")
    raise
else:
    logger.info("Data successfully loaded.")

# Summary of the frame as loaded
info = basic_info(df)
print(info)


In [None]:
# %%  Visualizing Missing Values
# When cells run top to bottom, `df` here is still the frame as loaded,
# i.e. this runs before fill_missing_values is applied in the next cell.
logger.info("Visualizing missing values.")
visualize_missing_values(df)


In [None]:
# %%  Filling Missing Values
# Re-binds `df` to whatever fill_missing_values returns for method="mean".
# NOTE(review): presumably missing entries are replaced by the column mean -
# confirm in source.data_preprocessing.
logger.info("Filling missing values using mean method.")
df = fill_missing_values(df, method="mean")

# Summary after filling, for comparison with the one printed right after loading.
info_after_filling = basic_info(df)
print(info_after_filling)


In [None]:
# Scaling Numeric Columns
# Collect the names of all columns with a numeric dtype; the same list is
# reused by the correlation and outlier cells further down.
numeric_cols = (
    df.select_dtypes(include="number")
    .columns
    .tolist()
)

logger.info("Scaling numeric columns using standard method.")
df = scale_features(df, numeric_cols, method="standard")


In [None]:
# Correlation Analysis
# Uses the `numeric_cols` list built in the scaling cell. When cells run top to
# bottom, `df` at this point is the frame returned by scale_features.
logger.info("Performing correlation analysis.")
correlation_analysis(df, numeric_cols)


In [None]:
# Detecting Outliers
# Operates on the same `numeric_cols` as the scaling and correlation cells.
# The return value of detect_outliers, if any, is not assigned here.
logger.info("Detecting outliers in numeric columns.")
detect_outliers(df, numeric_cols)


In [None]:
# Saving the preprocessed data
# The frame is written as CSV (without the index column) into PROCESSED_DIR.
output_file = os.path.join(PROCESSED_DIR, "epa_preprocessed.csv")

try:
    df.to_csv(output_file, index=False)
except Exception as save_err:
    # Record the failure in the log, then let the original exception propagate.
    logger.error(f"Error saving preprocessed data: {save_err}")
    raise
else:
    logger.info(f"Preprocessed data saved successfully at: {output_file}")
