In [None]:
# Import necessary libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
# Import custom modules
from source.utils.config_loader import load_config
from source.utils.logger import setup_logger
from source.missing_value_comparison import compare_missing_values


In [None]:
# Config and Logger Settings
# Loads the YAML settings, derives the data/log directories from them and
# creates the notebook-wide `logger`. Any failure aborts the cell with an
# exception that is explicitly chained to its original cause.
CONFIG_PATH = "../config/settings.yml"

try:
    # Load config file
    config = load_config(CONFIG_PATH)
    if config is None:
        # NOTE: this ValueError is caught by the generic handler below and
        # therefore surfaces to the user as a RuntimeError.
        raise ValueError("Config file could not be loaded or returned empty.")

    # Get directories from config and convert to absolute paths.
    # os.path.abspath resolves relative entries against the current working
    # directory of the kernel.
    RAW_DIR = os.path.abspath(config["paths"]["raw_dir"])
    PROCESSED_DIR = os.path.abspath(config["paths"]["processed_dir"])
    # "logs_dir" is optional; fall back to ../logs when it is absent.
    LOG_DIR = os.path.abspath(config["paths"].get("logs_dir", "../logs"))

    # Make sure the log directory exists (RAW_DIR and PROCESSED_DIR are not
    # created here).
    os.makedirs(LOG_DIR, exist_ok=True)

    # Create logger
    logger = setup_logger(
        name="missing_value_comparison",
        log_file=os.path.join(LOG_DIR, "missing_value_comparison.log"),
        log_level="INFO"
    )
    logger.info("Config file and directories loaded successfully.")
except KeyError as e:
    # A required key ("paths", "raw_dir" or "processed_dir") is missing.
    # `from e` keeps the original KeyError attached as the direct cause.
    raise ValueError(f"Missing key in config file: {e}") from e
except Exception as e:
    # Anything else (including the ValueError raised above) is reported as a
    # RuntimeError, with the original exception preserved as its cause.
    raise RuntimeError(f"Error loading config: {e}") from e


In [None]:
# CSV loading helper shared by the data-loading cells
def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame, logging the outcome.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        Exception: Whatever ``pd.read_csv`` raises is logged at ERROR level
            and then re-raised unchanged.
    """
    logger.info(f"Loading data from {file_path}")
    try:
        frame = pd.read_csv(file_path)
    except Exception as exc:
        # Record the failure, then let the caller decide how to react.
        logger.error(f"Error loading data from {file_path}: {exc}")
        raise
    logger.info("Data loaded successfully")
    return frame


In [None]:
# Resolve the locations of both input files up front
original_file_path = os.path.join(RAW_DIR, "epa_long_preprocessed.csv")  # Adjust as needed
processed_file_path = os.path.join(PROCESSED_DIR, "epa_preprocessed.csv")

# Original dataset (read from RAW_DIR): a failure is logged at CRITICAL
# level and re-raised so the notebook stops here.
try:
    original_df = load_data(original_file_path)
    logger.info("Original data loaded successfully.")
except Exception as exc:
    logger.critical(f"Failed to load original data: {exc}")
    raise

# Processed dataset (read from PROCESSED_DIR): same handling as above.
try:
    processed_df = load_data(processed_file_path)
    logger.info("Processed data loaded successfully.")
except Exception as exc:
    logger.critical(f"Failed to load processed data: {exc}")
    raise


In [None]:
# Build the original-vs-processed missing-value comparison. A failure is
# logged at CRITICAL level and re-raised; success is logged at INFO level.
try:
    comparison_df = compare_missing_values(original_df, processed_df)
except Exception as exc:
    logger.critical(f"Missing value comparison failed: {exc}")
    raise
else:
    logger.info("Missing value comparison completed successfully.")


In [None]:
# Visualize comparison results
def visualize_comparison(comparison_df, save=True, filename="missing_value_comparison.png"):
    """Draw the missing-value comparison as a bar chart, then save or show it.

    Args:
        comparison_df: DataFrame to plot; it is drawn with
            ``DataFrame.plot(kind='bar')``.
        save: When True, the figure is written to ``PROCESSED_DIR/filename``
            and closed. When False, it is displayed with ``plt.show()``
            and left open.
        filename: File name of the saved image, joined onto ``PROCESSED_DIR``.
            Only used when ``save`` is True.

    Raises:
        Exception: Any error raised while plotting or saving is logged at
            ERROR level and re-raised unchanged.
    """
    logger.info("Visualizing missing value comparison.")
    # Create the figure explicitly so we hold a handle that can always be
    # released, instead of relying on pyplot's implicit "current figure".
    fig, ax = plt.subplots(figsize=(12, 8))
    handed_to_show = False
    try:
        comparison_df.plot(kind='bar', ax=ax)
        ax.set_title("Missing Values Comparison: Original vs Processed")
        ax.set_xlabel("Columns")
        ax.set_ylabel("Number of Missing Values")
        # Tilt the tick labels and right-align them so they stay under
        # their bars.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        if save:
            filepath = os.path.join(PROCESSED_DIR, filename)
            fig.savefig(filepath)
            logger.info(f"Missing value comparison plot saved to {filepath}")
        else:
            plt.show()
            # The displayed figure stays open, as before.
            handed_to_show = True
    except Exception as e:
        logger.error(f"Error in visualization of missing value comparison: {e}")
        raise
    finally:
        # Close the figure after saving AND on any failure, so an error in
        # plotting/layout/savefig no longer leaves a figure open in pyplot.
        if not handed_to_show:
            plt.close(fig)


# Plot the comparison and save the chart as a PNG
visualize_comparison(
    comparison_df,
    save=True,
    filename="missing_value_comparison.png",
)


In [None]:
# Display the comparison DataFrame
print("Missing Value Comparison:")
# Leave the frame as the cell's last expression so Jupyter renders it with
# its rich table repr instead of the plain-text dump produced by print().
comparison_df
