In [1]:
import os
import shutil
import time
import json
import zipfile
import gc
import tracemalloc
import logging
from pathlib import Path
from typing import Optional, List, Dict, Any, Union, Callable

import pandas as pd
import numpy as np
import requests
from tqdm.notebook import tqdm  # Professional progress bars

# Configure Logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)

In [2]:
# Configuration
YEARS = list(range(2015, 2025))

# Use pathlib for robust path handling
BASE_DIR = Path.cwd().parent 
DATA_DIR = BASE_DIR / "data_raw"
PROCESSED_DATA_DIR = BASE_DIR / "data_processed"
CONFIG_DIR = BASE_DIR / "config"

# Ensure directories exist
DATA_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# URL Pattern: Note that CDC naming conventions may vary by year.
# This pattern assumes a consistent structure.
BASE_URL = "https://www.cdc.gov/brfss/annual_data/{year}/files/LLCP{year}XPT.zip"

In [3]:
def load_json_config(path: Path) -> Dict[str, Any]:
    """
    Loads a JSON configuration file from the specified path.

    Args:
        path (Path): The file system path to the JSON configuration file.

    Returns:
        Dict[str, Any]: The parsed JSON data as a dictionary.

    Raises:
        FileNotFoundError: If the file does not exist at the specified path.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    if not path.exists():
        raise FileNotFoundError(f"Configuration file not found: {path}")
    
    logger.info(f"Loading configuration from: {path}")
    with open(path, 'r') as f:
        return json.load(f)

def parse_mapping_config(data: Dict, key_type: type = str, inner_key_type: type = str) -> Dict:
    """
    Parses a raw mapping dictionary, casting keys to specific types (e.g., converting year strings to ints).

    This helper function is designed to normalize the keys of nested or flat dictionaries
    loaded from JSON, ensuring consistent types (like `int` for years or codes) across the application.

    Args:
        data (Dict): The raw dictionary loaded from JSON.
        key_type (type, optional): The target type for the top-level keys. Defaults to str.
        inner_key_type (type, optional): The target type for inner dictionary keys 
            (used for nested structures like `VALUE_MAP`). Defaults to str.

    Returns:
        Dict: A new dictionary with keys cast to the specified types.
    """
    parsed = {}
    for canonical, mappings in data.items():
        if isinstance(mappings, dict):
             # Handle nested dictionaries (e.g., VAR_MAP, VALUE_MAP)
             # Logic: Cast top-level key -> Check if value is dict -> Cast inner keys if so
            parsed[canonical] = {
                (int(k) if key_type == int else k): (
                    {int(ik): iv for ik, iv in v.items()} if isinstance(v, dict) and inner_key_type == int else v
                )
                for k, v in mappings.items()
            }
        else:
            # Handle flat dictionaries (e.g., VALUE_TEXT_MAP)
            parsed[canonical] = {int(k): v for k, v in mappings.items()}
    return parsed

# Load and Parse Configurations
try:
    _var_data = load_json_config(CONFIG_DIR / "VAR_MAP.json")
    # VAR_MAP structure: {canonical: {year (int): col_name (str)}}
    VAR_MAP = parse_mapping_config(_var_data, key_type=int) 
    
    _val_data = load_json_config(CONFIG_DIR / "VALUE_MAP.json")
    # VALUE_MAP structure: {canonical: {year (int): {code (int): label (str)}}}
    VALUE_MAP = parse_mapping_config(_val_data, key_type=int, inner_key_type=int) 
    
    _text_data = load_json_config(CONFIG_DIR / "VALUE_TEXT_MAP.json")
    # VALUE_TEXT_MAP structure: {canonical: {code (int): label (str)}}
    VALUE_TEXT_MAP = parse_mapping_config(_text_data, key_type=int) 

    logger.info(f"Mappings loaded successfully.")
    logger.info(f"- VAR_MAP: {len(VAR_MAP)} fields")
    logger.info(f"- VALUE_MAP: {len(VALUE_MAP)} fields")
    logger.info(f"- VALUE_TEXT_MAP: {len(VALUE_TEXT_MAP)} fields")

except Exception as e:
    logger.error(f"Failed to load configurations: {e}")
    raise

20:13:20 - INFO - Loading configuration from: c:\github\brfss-diabetes-trends\config\VAR_MAP.json
20:13:20 - INFO - Loading configuration from: c:\github\brfss-diabetes-trends\config\VALUE_MAP.json
20:13:20 - INFO - Loading configuration from: c:\github\brfss-diabetes-trends\config\VALUE_TEXT_MAP.json
20:13:20 - INFO - Mappings loaded successfully.
20:13:20 - INFO - - VAR_MAP: 20 fields
20:13:20 - INFO - - VALUE_MAP: 20 fields
20:13:20 - INFO - - VALUE_TEXT_MAP: 20 fields


In [4]:
def ensure_xpt(year, retries: int = 3, timeout: int = 30):
    """
    Ensures that the LLCP{year}.XPT file exists locally, downloading and extracting it if necessary.

    This function performs the following steps:
    1.  **Download:** If the source ZIP file is missing, it is downloaded from `BASE_URL` 
        using exponential backoff for retries.
    2.  **Extraction:** If the XPT file is missing, the function locates the first `.xpt` 
        file within the ZIP archive (case-insensitive).
    3.  **Stabilization:** It extracts the file, handles potential whitespace in the 
        filename, and waits for the file size to stabilize (ensuring extraction is 
        complete) before moving it to the final destination.

    Args:
        year (int): The target year to fetch (e.g., 2019).
        retries (int, optional): The maximum number of download attempts. Defaults to 3.
        timeout (int, optional): The request timeout in seconds for each attempt. Defaults to 30.

    Returns:
        str: The full filesystem path to the local .XPT file.

    Raises:
        RuntimeError: If the ZIP archive does not contain any .xpt files.
        zipfile.BadZipFile: If the local ZIP file is corrupt or invalid.
        requests.exceptions.RequestException: If the download fails after all retries are exhausted.
    """
    DATA_PARENT_DIR = os.path.join(DATA_DIR, f"{year}")
    if not os.path.exists(DATA_PARENT_DIR):
        os.makedirs(DATA_PARENT_DIR, exist_ok=True)
    zip_path = os.path.join(DATA_PARENT_DIR, f"LLCP{year}.zip")
    xpt_path = os.path.join(DATA_PARENT_DIR, f"LLCP{year}.XPT")

    if not os.path.exists(zip_path):
        print(f"[DOWNLOAD] {year}")
        url = BASE_URL.format(year=year)
        for attempt in range(1, retries + 1):
            try:
                with requests.get(url, stream=True, timeout=timeout) as r:
                    r.raise_for_status()
                    with open(zip_path, "wb") as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
                break
            except Exception as e:
                print(f"Download attempt {attempt} failed for {year}: {e}")
                if attempt < retries:
                    time.sleep(2 ** attempt)
                else:
                    raise
    else:
        print(f"[CACHED] {year} zip")

    if not os.path.exists(xpt_path):
        print(f"[UNZIP] {year}")
        try:
            with zipfile.ZipFile(zip_path, "r") as z:
                # Find the .xpt file (case insensitive, strip whitespace)
                all_files = z.namelist()
                members = [m for m in all_files if m.lower().strip().endswith(".xpt")]
                if not members:
                    print(f"DEBUG: Contents of zip {year}: {all_files}")
                    raise RuntimeError(f"No .xpt found in zip for {year}")
                
                # extract first .xpt and move it to expected path
                member = members[0]
                member_stripped = member.strip()  # Remove leading/trailing whitespace
                print(f"Extracting {member_stripped}...")
                z.extract(member, DATA_PARENT_DIR)

                # If the extracted filename has whitespace, rename it
                extracted_raw = os.path.join(DATA_PARENT_DIR, member)
                extracted = os.path.join(DATA_PARENT_DIR, member_stripped)
                
                if extracted_raw != extracted and os.path.exists(extracted_raw):
                    print(f"Renaming extracted file (removing whitespace)...")
                    os.rename(extracted_raw, extracted)

                # Wait loop: verify file exists and size is stable (not growing)
                print(f"Waiting for extraction to complete...")
                max_wait = 120  # seconds
                last_size = 0
                stable_count = 0
                
                for i in range(max_wait):
                    if os.path.exists(extracted):
                        try:
                            current_size = os.path.getsize(extracted)
                            if current_size > 0:
                                if current_size == last_size:
                                    stable_count += 1
                                    if stable_count >= 3:  # File size unchanged for 3 seconds = stable
                                        print(f"File extraction complete. Size: {current_size} bytes")
                                        break
                                else:
                                    stable_count = 0
                                last_size = current_size
                        except OSError:
                            pass  # File may be locked during extraction
                    time.sleep(1)
                
                # Additional wait to ensure all buffers are flushed to disk
                print(f"Flushing disk buffers...")
                time.sleep(2)

                # Move to final destination if needed
                os.makedirs(os.path.dirname(xpt_path), exist_ok=True)
                
                if os.path.abspath(extracted) != os.path.abspath(xpt_path):
                    print(f"Moving extracted file to {xpt_path}")
                    try:
                        shutil.move(extracted, xpt_path)
                    except Exception as move_err:
                        print(f"shutil.move failed, using os.replace: {move_err}")
                        if os.path.exists(xpt_path):
                            os.remove(xpt_path)
                        os.replace(extracted, xpt_path)
        except zipfile.BadZipFile as e:
            raise RuntimeError(f"Bad zip file for {year}: {e}")
    else:
        print(f"[CACHED] {year} xpt")

    return xpt_path

In [5]:
def decode_value(canonical: str, year: int, val) -> Optional[str]:
    """
    Decode a single scalar value to its human-readable label.

    Parameters
    ----------
    canonical : str
        Canonical field name (key in VALUE_MAP / VALUE_TEXT_MAP).
    year : int
        Year to use for per-year mapping.
    val : scalar
        Value to decode (int, str convertible to int, or missing).

    Returns
    -------
    str or pd.NA
        Mapped label string, or pd.NA if the value is missing/unmapped.

    Notes
    -----
    - Missing values (pd.isna) or non-integer-convertible values return pd.NA.
    - First attempts to use the per-year mapping in VALUE_MAP; if not found,
      falls back to the constant mapping in VALUE_TEXT_MAP.
    """
    if pd.isna(val):
        return pd.NA
    try:
        key = int(val)
    except (ValueError, TypeError):
        return pd.NA

    # Try per-year mapping first
    per_year_map = VALUE_MAP.get(canonical, {}).get(year, {})
    label = per_year_map.get(key)
    if label is not None:
        return label

    # Fallback to constant mapping
    label = VALUE_TEXT_MAP.get(canonical, {}).get(key)
    return label if label is not None else pd.NA

In [6]:
def decode_series(canonical: str, year: int, series: pd.Series) -> pd.Series:
    """
    Decodes a single scalar value into its human-readable label using a hierarchical lookup.

    This function first attempts to convert the input `val` to an integer. It then resolves 
    the label by checking sources in the following order:
    1.  **Year-Specific:** Checks `VALUE_MAP` for a mapping specific to the provided `year`.
    2.  **Global Fallback:** If no year-specific match is found, checks `VALUE_TEXT_MAP` 
        for a constant/global mapping.

    Args:
        canonical (str): The canonical field name (the key used in the configuration maps).
        year (int): The survey year associated with the value (used for versioned lookups).
        val (Any): The scalar value to decode. This handles integers, strings that can be 
            converted to integers, and missing values.

    Returns:
        Optional[str]: The decoded text label. Returns `pd.NA` in the following cases:
            - The input `val` is missing (`pd.isna`).
            - The input `val` cannot be converted to an integer.
            - No mapping exists for the code in either the year-specific or global maps.
    """
    if series is None:
        return pd.Series(dtype="object")
    if series.empty:
        return pd.Series(index=series.index, dtype="object")

    # coerce to integer codes where possible (nullable Int64)
    codes = pd.to_numeric(series, errors="coerce").astype("Int64")

    per_year_map = VALUE_MAP.get(canonical, {}).get(year, {})
    fallback_map = VALUE_TEXT_MAP.get(canonical, {})
    # fallback_map provides defaults, per_year_map overrides them
    combined_map = {**fallback_map, **per_year_map}

    mapped = codes.map(combined_map)

    # preserve explicit missing codes and ensure unmapped numeric codes become pd.NA
    mapped = mapped.where(~codes.isna(), pd.NA)
    mapped = mapped.where(mapped.notna(), pd.NA)

    return mapped.astype("object")

In [7]:
def normalize_days(series: pd.Series) -> pd.Series:
    """
    Normalizes a Series of day-count values to the 2016 categorical coding standard.

    This transformation simplifies continuous day-count fields (and their specific 
    "None" codes like 88) into a discrete 4-level scale. Input values are first 
    coerced to numeric types.

    Args:
        series (pd.Series): The input Series containing raw day-count data. 
            Handles numeric types or strings convertible to numbers.

    Returns:
        pd.Series: A Series of 'Int64' dtype containing the normalized codes:
            * **1**: Zero days (mapped from raw values `0` or `88`).
            * **2**: 1-13 days (mapped from range `1-13`).
            * **3**: 14+ days (mapped from range `14-96`).
            * **9**: Refused/Unknown (mapped from `9` or `99`).
            * **<NA>**: Any other value, parsing error, or empty input.
    """
    if series is None or series.empty:
        return pd.Series(dtype="Int64")
    nums = pd.to_numeric(series, errors="coerce")

    def _norm(v):
        if pd.isna(v):
            return pd.NA
        vi = int(v)
        if vi in (9, 99):
            return 9
        if vi in (88, 0):
            return 1
        if 1 <= vi <= 13:
            return 2
        if vi >= 14 and vi < 97:
            return 3
        return pd.NA

    normalized = nums.apply(lambda x: _norm(x)).astype("Int64")
    return normalized

In [8]:
def validate_year_mappings(df_raw: pd.DataFrame, year: int) -> list:
    """
    Identifies canonical fields defined in VAR_MAP that are missing from the raw DataFrame.

    This function iterates through the global `VAR_MAP` configuration to find expected 
    column names for the specified `year`. It supports mapping definitions where the 
    target can be a single string or a list of candidate strings. A field is considered 
    "found" if at least one of its candidate columns exists in `df_raw`.

    Args:
        df_raw (pd.DataFrame): The raw DataFrame loaded from the BRFSS XPT file for the given year.
        year (int): The specific year being validated (used to look up expected columns in VAR_MAP).

    Returns:
        list: A list of canonical field names (strings) that could not be resolved to any 
        column in `df_raw`.

    Notes:
        - Prints a summary log to stdout indicating how many fields are missing.
        - If the mapping for a year is `None` or invalid, the field is treated as missing.
    """
    missing = []
    for canonical, year_fields in VAR_MAP.items():
        candidates = None
        if isinstance(year_fields, dict):
            candidates = year_fields.get(year)
        if candidates is None:
            candidates = []
        elif isinstance(candidates, str):
            candidates = [candidates]
        elif not isinstance(candidates, (list, tuple, set)):
            try:
                candidates = list(candidates)
            except Exception:
                candidates = []

        found = any(c in df_raw.columns for c in candidates)
        if not found:
            missing.append(canonical)

    print(f"Year {year}: {len(missing)} of {len(VAR_MAP)} canonical fields missing")
    if missing:
        print(f"Missing fields for {year}: {missing}")
    return missing

In [9]:
# Map canonical field names to specific normalization functions
SPECIAL_TRANSFORMS: Dict[str, Callable[[pd.Series], pd.Series]] = {
    "PHYSICAL_HEALTH_STATUS": normalize_days,
    "MENTAL_HEALTH_STATUS": normalize_days
}

def load_year(year: int) -> pd.DataFrame:
    """
    Loads, maps, and normalizes a single year of BRFSS survey data.

    This function orchestrates the data loading pipeline for a specific year:
    1.  **Fetch:** Ensures the local SAS XPT file exists (via `ensure_xpt`).
    2.  **Load:** Reads the XPT file into a pandas DataFrame using 'latin1' encoding.
    3.  **Map:** Renames raw columns to canonical names using `VAR_MAP`. It handles
        cases where multiple candidate columns exist by picking the first match.
    4.  **Normalize:** Applies special transformation logic (defined in `SPECIAL_TRANSFORMS`)
        to specific fields (e.g., 'PHYSICAL_HEALTH_STATUS').

    Args:
        year (int): The 4-digit year to load (e.g., 2019).

    Returns:
        pd.DataFrame: A normalized DataFrame containing only the canonical columns defined 
        in `VAR_MAP`, plus a 'YEAR' column. Returns an empty DataFrame if the XPT file cannot be read.

    Notes:
        - Logs warnings if canonical columns are missing from the raw data.
        - Fills missing columns with `np.nan`.
    """
    xpt_path = ensure_xpt(year)
    try:
        # 'latin1' is required for SAS XPT files
        df_raw = pd.read_sas(xpt_path, format="xport", encoding="latin1")
    except Exception as e:
        logger.error(f"Failed to read SAS XPT for year {year}: {e}")
        return pd.DataFrame()

    validate_year_mappings(df_raw, year)
    
    out = pd.DataFrame()
    out["YEAR"] = year

    for canonical, year_fields in VAR_MAP.items():
        # Resolve candidates (handle string vs list vs dict lookup)
        candidates = year_fields.get(year, [])
        if isinstance(candidates, str):
            candidates = [candidates]
        
        # Find the first matching column in the raw data
        raw_col = next((df_raw[c] for c in candidates if c in df_raw.columns), None)

        if raw_col is None:
            logger.warning(f"Year {year}: Missing column for '{canonical}' (checked: {candidates})")
            out[canonical] = np.nan
            continue

        # Apply transformations if defined, otherwise pass through
        if canonical in SPECIAL_TRANSFORMS:
            try:
                out[canonical] = SPECIAL_TRANSFORMS[canonical](raw_col)
            except Exception as e:
                logger.error(f"Normalization failed for {canonical} in {year}: {e}")
                out[canonical] = raw_col
        else:
            out[canonical] = raw_col

    return out

In [10]:
def _get_memory_bytes() -> int:
    """
    Returns the current memory usage of the process in bytes.

    This function prioritizes using `psutil` to retrieve the Resident Set Size (RSS), 
    which represents the portion of memory occupied by the process that is held in 
    main memory (RAM). If `psutil` is unavailable or fails, it falls back to 
    `tracemalloc` to report the current size of memory blocks traced by Python.

    Returns:
        int: The memory usage in bytes.
    """
    try:
        import psutil
        proc = psutil.Process(os.getpid())
        return int(proc.memory_info().rss)
    except Exception:
        # Ensure tracemalloc is started
        if not tracemalloc.is_tracing():
            tracemalloc.start()
        current, peak = tracemalloc.get_traced_memory()
        return int(current)

In [11]:
def _format_bytes(b: int) -> str:
    """
    Formats a byte count into a human-readable string with appropriate units (B, KB, MB, etc.).

    This function calculates the most suitable unit based on a binary prefix (1024 base).
    It handles negative values by preserving the sign and treats `None` as 0 bytes.
    The result is formatted to two decimal places.

    Args:
        b (int): The number of bytes to format. Can be negative or None.

    Returns:
        str: A string representing the size (e.g., "1.23 MB", "-500.00 B").
             Returns "0 B" if the input is None.
    """
    if b is None:
        return "0 B"
    sign = "" if b >= 0 else "-"
    b = abs(int(b))
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if b < 1024:
            return f"{sign}{b:.2f} {unit}"
        b /= 1024
    return f"{sign}{b:.2f} PB"

In [12]:
def load_multi_year(years: list, output_dir: Path = PROCESSED_DATA_DIR) -> None:
    """
    Iteratively loads, normalizes, and exports individual yearly BRFSS datasets.

    This function processes a list of years sequentially. Its sole responsibility is 
    generating the standardized `brfss_{year}.csv` files. It does *not* perform 
    any merging.

    For each year, it:
    1.  **Loads** the raw SAS data using `load_year`.
    2.  **Aligns** columns to a strict canonical order defined by `VAR_MAP`.
    3.  **Exports** the normalized data to a CSV file (e.g., `brfss_2019.csv`).
    4.  **Cleans up** memory immediately after writing.

    Args:
        years (list): A list of integer years to process (e.g., `[2019, 2020]`).
        output_dir (Path, optional): The directory path where output files will be saved. 
            Defaults to `PROCESSED_DATA_DIR`.

    Returns:
        None: The output is side-effect based (CSV files written to disk).
    """
    # Ensure consistent column order
    canonical_cols = ["YEAR"] + list(VAR_MAP.keys())
    
    # Use tqdm for a professional progress bar
    pbar = tqdm(years, desc="Generating Yearly CSVs")
    
    for year in pbar:
        mem_before = _get_memory_bytes()
        pbar.set_postfix_str(f"Mem: {_format_bytes(mem_before)}")
        
        df_year = load_year(year)
        
        if df_year.empty:
            logger.warning(f"Year {year}: No data loaded, skipping export.")
            continue

        # Align columns to ensure schema consistency
        df_year = df_year.reindex(columns=canonical_cols)
        
        # Write individual year CSV
        year_csv_path = output_dir / f"brfss_{year}.csv"
        df_year.to_csv(year_csv_path, index=False)
        logger.info(f"Saved {year} data to {year_csv_path.name} ({len(df_year)} rows)")

        # Explicit Memory Cleanup
        del df_year
        gc.collect()
        
        mem_after = _get_memory_bytes()
        logger.debug(f"Year {year} cleanup. Delta: {_format_bytes(mem_after - mem_before)}")

    logger.info("Batch processing complete. Individual CSVs generated.")

In [13]:
def merge_yearly_datasets(data_dir: Path, start_year: int, end_year: int, output_filename: str = "BRFSS_merged.csv") -> str:
    """
    Consolidates existing yearly CSV files into a single merged dataset.

    This method scans the specified directory for files matching the pattern 
    `brfss_{year}.csv` within the inclusive range [`start_year`, `end_year`].
    It reads each file and appends it to a unified CSV, ensuring the header is 
    written only once.

    Args:
        data_dir (Path): The directory containing the yearly CSV files.
        start_year (int): The starting year of the range (inclusive).
        end_year (int): The ending year of the range (inclusive).
        output_filename (str, optional): The name of the final merged CSV file. 
            Defaults to "BRFSS_merged.csv".

    Returns:
        str: The full string path to the merged CSV file.

    Raises:
        FileNotFoundError: If a specific year's CSV file (e.g., `brfss_2015.csv`) 
            is missing from `data_dir`.
    """
    merged_path = data_dir / output_filename
    years_to_merge = list(range(start_year, end_year + 1))
    
    total_rows = 0
    first_batch = True
    
    logger.info(f"Starting merge for years {start_year}-{end_year} into {output_filename}")
    
    # Progress bar for the merging phase
    pbar = tqdm(years_to_merge, desc="Merging Datasets")
    
    for year in pbar:
        source_csv = data_dir / f"brfss_{year}.csv"
        
        if not source_csv.exists():
            logger.warning(f"Source file missing for year {year}: {source_csv}. Skipping.")
            continue
            
        # Read the previously generated CSV
        # We assume these files are already normalized and clean from `load_multi_year`
        try:
            df_chunk = pd.read_csv(source_csv)
        except Exception as e:
            logger.error(f"Failed to read {source_csv}: {e}")
            continue

        row_count = len(df_chunk)
        pbar.set_postfix_str(f"Appending {year} ({row_count} rows)")

        # Append to the consolidated file
        # 'w' mode for the first file (overwrites existing), 'a' mode for subsequent
        write_mode = "w" if first_batch else "a"
        write_header = first_batch
        
        df_chunk.to_csv(merged_path, mode=write_mode, header=write_header, index=False)
        
        total_rows += row_count
        first_batch = False
        
        # Immediate cleanup to keep memory footprint low during merge
        del df_chunk
        gc.collect()

    logger.info(f"Merge Complete. Written {total_rows} rows to {merged_path}")
    return str(merged_path)

In [14]:
start_year = 2015
end_year = 2024
merged_output_file_name = f"BRFSS_{start_year}_{end_year}.csv"
years = list(range(start_year, end_year+1))
output_dir = PROCESSED_DATA_DIR
outputfile = os.path.join(output_dir, merged_output_file_name)

load_multi_year(years, output_dir)

Generating Yearly CSVs:   0%|          | 0/10 [00:00<?, ?it/s]

[DOWNLOAD] 2015
[UNZIP] 2015
Extracting LLCP2015.XPT...
Renaming extracted file (removing whitespace)...
Waiting for extraction to complete...
File extraction complete. Size: 1165490800 bytes
Flushing disk buffers...
Year 2015: 0 of 20 canonical fields missing


20:13:58 - INFO - Saved 2015 data to brfss_2015.csv (441456 rows)


[DOWNLOAD] 2016
[UNZIP] 2016
Extracting LLCP2016.XPT...
Renaming extracted file (removing whitespace)...
Waiting for extraction to complete...
File extraction complete. Size: 1067474400 bytes
Flushing disk buffers...
Year 2016: 0 of 20 canonical fields missing


20:14:53 - INFO - Saved 2016 data to brfss_2016.csv (486303 rows)


[DOWNLOAD] 2017
[UNZIP] 2017
Extracting LLCP2017.XPT...
Renaming extracted file (removing whitespace)...
Waiting for extraction to complete...
File extraction complete. Size: 1288446720 bytes
Flushing disk buffers...
Year 2017: 0 of 20 canonical fields missing


20:16:10 - INFO - Saved 2017 data to brfss_2017.csv (450016 rows)


[DOWNLOAD] 2018
[UNZIP] 2018
Extracting LLCP2018.XPT...
Renaming extracted file (removing whitespace)...
Waiting for extraction to complete...
File extraction complete. Size: 961961120 bytes
Flushing disk buffers...
Year 2018: 0 of 20 canonical fields missing


20:16:58 - INFO - Saved 2018 data to brfss_2018.csv (437436 rows)


[DOWNLOAD] 2019
[UNZIP] 2019
Extracting LLCP2019.XPT...
Renaming extracted file (removing whitespace)...
Waiting for extraction to complete...
File extraction complete. Size: 1136901120 bytes
Flushing disk buffers...
Year 2019: 0 of 20 canonical fields missing


20:17:43 - INFO - Saved 2019 data to brfss_2019.csv (418268 rows)


[DOWNLOAD] 2020
[UNZIP] 2020
Extracting LLCP2020.XPT...
Renaming extracted file (removing whitespace)...
Waiting for extraction to complete...
File extraction complete. Size: 889974880 bytes
Flushing disk buffers...
Year 2020: 0 of 20 canonical fields missing


20:18:19 - INFO - Saved 2020 data to brfss_2020.csv (401958 rows)


[DOWNLOAD] 2021
[UNZIP] 2021
Extracting LLCP2021.XPT...
Renaming extracted file (removing whitespace)...
Waiting for extraction to complete...
File extraction complete. Size: 1055538560 bytes
Flushing disk buffers...
Year 2021: 0 of 20 canonical fields missing


20:19:08 - INFO - Saved 2021 data to brfss_2021.csv (438693 rows)


[DOWNLOAD] 2022
[UNZIP] 2022
Extracting LLCP2022.XPT...
Renaming extracted file (removing whitespace)...
Waiting for extraction to complete...
File extraction complete. Size: 1160060640 bytes
Flushing disk buffers...
Year 2022: 0 of 20 canonical fields missing


20:19:59 - INFO - Saved 2022 data to brfss_2022.csv (445132 rows)


[DOWNLOAD] 2023
[UNZIP] 2023
Extracting LLCP2023.XPT...
Renaming extracted file (removing whitespace)...
Waiting for extraction to complete...
File extraction complete. Size: 1205554400 bytes
Flushing disk buffers...
Year 2023: 0 of 20 canonical fields missing


20:20:47 - INFO - Saved 2023 data to brfss_2023.csv (433323 rows)


[DOWNLOAD] 2024
[UNZIP] 2024
Extracting LLCP2024.XPT...
Renaming extracted file (removing whitespace)...
Waiting for extraction to complete...
File extraction complete. Size: 1093874240 bytes
Flushing disk buffers...
Year 2024: 0 of 20 canonical fields missing


20:21:35 - INFO - Saved 2024 data to brfss_2024.csv (457670 rows)
20:21:35 - INFO - Batch processing complete. Individual CSVs generated.


In [17]:
merge_yearly_datasets(output_dir, start_year, end_year, merged_output_file_name)

# us os pth instead of raw string for better compatibility
csv_path = os.path.join(output_dir, merged_output_file_name)
zip_path = os.path.join(output_dir, f"{merged_output_file_name[:-4]}.zip")

print(f"Zipping {csv_path}...")
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(csv_path, arcname=merged_output_file_name)
print(f"Created {zip_path}")

20:40:30 - INFO - Starting merge for years 2015-2024 into BRFSS_2015_2024.csv


Merging Datasets:   0%|          | 0/10 [00:00<?, ?it/s]

20:41:04 - INFO - Merge Complete. Written 4410255 rows to c:\github\brfss-diabetes-trends\data_processed\BRFSS_2015_2024.csv


Zipping c:\github\brfss-diabetes-trends\data_processed\BRFSS_2015_2024.csv...
Created c:\github\brfss-diabetes-trends\data_processed\BRFSS_2015_2024.zip
