In [4]:
import os
import glob
import logging
import pandas as pd

# Define ANSI escape codes for green color
GREEN = "\033[92m"
RESET = "\033[0m"

# Custom logging formatter to include green color
class CustomFormatter(logging.Formatter):
    def format(self, record):
        log_msg = super().format(record)
        return f"{GREEN}{log_msg}{RESET}"

# Configure logging with the custom formatter
handler = logging.StreamHandler()
handler.setFormatter(CustomFormatter("%(asctime)s [%(levelname)s] %(message)s"))
logging.basicConfig(level=logging.INFO, handlers=[handler])

In [5]:
def check_raw_data(raw_dir: str = "../data/raw") -> None:
    logging.info(f"Starting data check in: {raw_dir}")

    csv_paths = glob.glob(os.path.join(raw_dir, "**/*.csv"), recursive=True)
    logging.info(f"Found {len(csv_paths)} CSV files.")

    for path in csv_paths[:5]:
        logging.info(f"Checking file: {path}")
        try:
            df = pd.read_csv(path, nrows=5000)  # read partial for speed
            shape_str = f"Shape={df.shape}"
            cols_str = ", ".join(df.columns)
            missing_str = df.isnull().sum().to_dict()

            logging.info(f" {shape_str}; Columns=({cols_str})")
            logging.info(f" Missing counts: {missing_str}")
        except Exception as e:
            logging.warning(f"  Could not read {path}. Error: {e}")

    logging.info("Data check completed!")

def main():
    check_raw_data()

if __name__ == "__main__":
    main()

[92m2024-12-31 20:10:17,038 [INFO] Starting data check in: ../data/raw[0m
[92m2024-12-31 20:10:17,045 [INFO] Found 515 CSV files.[0m
[92m2024-12-31 20:10:17,045 [INFO] Checking file: ../data/raw/epa-so2-2023/epa_so2_virginia_2023.csv[0m
[92m2024-12-31 20:10:17,075 [INFO]  Shape=(2877, 21); Columns=(Date, Source, Site ID, POC, Daily Max 1-hour SO2 Concentration, Units, Daily AQI Value, Local Site Name, Daily Obs Count, Percent Complete, AQS Parameter Code, AQS Parameter Description, Method Code, CBSA Code, CBSA Name, State FIPS Code, State, County FIPS Code, County, Site Latitude, Site Longitude)[0m
[92m2024-12-31 20:10:17,075 [INFO]  Missing counts: {'Date': 0, 'Source': 0, 'Site ID': 0, 'POC': 0, 'Daily Max 1-hour SO2 Concentration': 0, 'Units': 0, 'Daily AQI Value': 0, 'Local Site Name': 352, 'Daily Obs Count': 0, 'Percent Complete': 0, 'AQS Parameter Code': 0, 'AQS Parameter Description': 0, 'Method Code': 0, 'CBSA Code': 0, 'CBSA Name': 0, 'State FIPS Code': 0, 'State': 0,