# Diabetes Risk Prediction
This project uses the Behavioral Risk Factor Surveillance System (BRFSS) survey data from [this link](https://www.cdc.gov/brfss/annual_data/annual_2024.html) to predict the probability of developing different types of diabetes. Features about U.S. residents include demographic data (e.g., income level, education, race) as well as data on health-related risk behaviors, chronic health conditions, and use of preventive services.

This is the first notebook for the project. It parses the raw ASCII data file, available at the link above, to extract the relevant target and feature variables for subsequent EDA and modeling.

## Setup
### Define parameters

In [4]:
# Zip archive of the 2024 BRFSS raw data (fixed-width ASCII file)
raw_data_url = "https://www.cdc.gov/brfss/annual_data/2024/files/LLCP2024ASC.zip"
# Zip archive of the 2024 codebook; it unpacks to the HTML data dictionary parsed below
data_dict_url = "https://www.cdc.gov/brfss/annual_data/2024/zip/codebook24_llcp-v2-508.zip"

### Import packages

In [5]:
import pandas as pd
from bs4 import BeautifulSoup
import re

### Define functions

In [None]:
# Patterns for the metadata header cell. The HTML separates words with
# non-breaking spaces (\xa0), so every gap accepts both \s and \xa0.
_LABEL_RE = re.compile(r'Label:[\s\xa0]+(.+?)Section[\s\xa0]+Name:')
_COLUMN_RE = re.compile(r'Column:[\s\xa0]+(\d+(?:-\d+)?)')
_TYPE_RE = re.compile(r'Type[\s\xa0]+of[\s\xa0]+Variable:[\s\xa0]+(Num|Char)')
_VARNAME_RE = re.compile(r'SAS[\s\xa0]+Variable[\s\xa0]+Name:[\s\xa0]+(\w+?)(?=Question)')

# Codes made up only of 7s and 9s (7/9/77/99/777/999, ...) are treated as special codes.
_SPECIAL_CODE_RE = re.compile(r'^[79]+$')
# Skip-logic instructions appended to a value label.
_SKIP_LOGIC_RE = re.compile(r'→Go to.*$')


def _read_dictionary_html(html_file):
    """Return the text of the HTML data dictionary, decoded with the right codec."""
    with open(html_file, 'rb') as f:
        raw = f.read()
    # Decoding a UTF-8 file as windows-1252 turns every non-ASCII character into
    # multi-character mojibake (e.g. "ï¿½"). Try UTF-8 first; a file that is not
    # valid UTF-8 is decoded as windows-1252, exactly as before.
    try:
        return raw.decode('utf-8-sig')
    except UnicodeDecodeError:
        return raw.decode('windows-1252')


def _column_width(column_range):
    """Return the number of characters covered by a "N" or "N-M" column range."""
    if '-' in column_range:
        start, end = map(int, column_range.split('-'))
        return end - start + 1
    return 1


def _extract_value_labels(tbody, column_width):
    """Collect {value: label} from a variable's table body.

    Returns the mapping together with a flag telling whether at least one value
    is a real category (i.e. not HIDDEN, BLANK or a 7/9-style special code).
    """
    value_labels = {}
    has_categorical_values = False

    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            continue

        # First column holds the coded value
        value = cells[0].get_text(strip=True)

        # Range rows (e.g. "1 - 97", "50 - 0776") describe continuous variables
        if ' - ' in value:
            continue

        # Second column holds the label; notes follow after a line break, so keep
        # only the first text fragment
        value_label = cells[1].get_text(separator='|').split('|')[0].strip()

        # Remove "→Go to..." skip-logic instructions
        value_label = _SKIP_LOGIC_RE.sub('', value_label).strip()

        is_special = value in ('HIDDEN', 'BLANK') or _SPECIAL_CODE_RE.match(value)
        if not is_special:
            has_categorical_values = True

        # Zero-pad numeric codes to the field width so they match the
        # fixed-width representation in the ASCII file
        if value.isdigit() and len(value) < column_width:
            value = value.zfill(column_width)

        value_labels[value] = value_label

    return value_labels, has_categorical_values


def parse_brfss_dictionary(html_file):
    """
    Parse HTML data dictionary to extract both column definitions and value-to-label mappings
    in a single pass through the file.
    
    The file is decoded as UTF-8 when possible and as windows-1252 otherwise.
    Variables whose header cell lacks a label, column range or SAS name are skipped.
    If two variables share the same label, the later one replaces the earlier one
    in column_lookup.
    
    Parameters:
    -----------
    html_file : str
        Path to the HTML data dictionary file
    
    Returns:
    --------
    tuple : (column_lookup, codebook)
        - column_lookup: dict mapping variable labels to metadata
          Format: {label: {'column_range': str, 'type': str, 'sas_name': str}}
          ('type' is None when the variable type could not be read)
        - codebook: dict mapping SAS variable names to value-label mappings
          Format: {sas_variable_name: {value: label}}
          Numeric values are zero-padded to the variable's column width.
          Only variables with at least one non-special value are included.
    """
    soup = BeautifulSoup(_read_dictionary_html(html_file), 'html.parser')
    
    column_lookup = {}
    codebook = {}
    
    # Find all variable tables (one pass through HTML)
    for table in soup.find_all('table', {'class': 'table'}):
        # Extract metadata from header cell
        metadata_cell = table.find('td', {'class': 'l m linecontent'})
        if not metadata_cell:
            continue
        
        metadata_text = metadata_cell.get_text()
        
        # Only process cells that contain variable definitions
        if 'Label:' not in metadata_text or 'Column:' not in metadata_text:
            continue
        
        label_match = _LABEL_RE.search(metadata_text)
        column_match = _COLUMN_RE.search(metadata_text)
        type_match = _TYPE_RE.search(metadata_text)
        varname_match = _VARNAME_RE.search(metadata_text)
        
        # The variable type is optional; everything else is required
        if not (label_match and column_match and varname_match):
            continue
        
        # Store column metadata
        label = label_match.group(1).strip().replace('\xa0', ' ')
        column_range = column_match.group(1)
        var_name = varname_match.group(1)
        
        column_lookup[label] = {
            'column_range': column_range,
            'type': type_match.group(1) if type_match else None,
            'sas_name': var_name
        }
        
        # Extract value-label mappings from table body
        tbody = table.find('tbody')
        if not tbody:
            continue
        
        value_labels, has_categorical_values = _extract_value_labels(
            tbody, _column_width(column_range)
        )
        
        # Only add to codebook if there are meaningful categorical values
        # Skip variables that only have HIDDEN, BLANK, or special codes
        if value_labels and has_categorical_values:
            codebook[var_name] = value_labels
    
    print(f"Successfully parsed {len(column_lookup)} variable definitions and {len(codebook)} value label mappings from HTML dictionary")
    return column_lookup, codebook


def _add_unpadded_aliases(value_map):
    """Return a copy of value_map that also accepts numeric codes without leading zeros."""
    lookup = dict(value_map)
    for code, label in value_map.items():
        if isinstance(code, str) and code.isdigit():
            # setdefault: an explicit codebook entry always wins over an alias
            lookup.setdefault(code.lstrip('0') or '0', label)
    return lookup


def _map_codes(series, lookup):
    """Map raw codes to labels, ignoring surrounding whitespace and zero-padding."""
    codes = series.astype(str).str.strip()
    labels = codes.map(lookup)
    # Second chance for codes that carry more leading zeros than the codebook key
    unpadded = codes.str.replace(r'^0+(?=\d)', '', regex=True)
    return labels.where(labels.notna(), unpadded.map(lookup))


def apply_value_labels(df, codebook, columns_to_label=None, *, keep_unmapped=True):
    """
    Apply value-to-label mappings to DataFrame columns.
    
    Codes are compared as stripped strings and differences in zero-padding are
    ignored, so '1', '01' and the integer 1 all match a codebook key '01'.
    A column is left untouched when fewer than 50% of the first 1000 non-null
    values can be mapped (it is likely a continuous variable or identifier).
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with numeric/coded values
    codebook : dict
        Value label mappings from parse_brfss_dictionary()
    columns_to_label : list, optional
        Specific columns to label. If None, attempts to label all columns.
        Columns missing from either the codebook or the DataFrame are ignored.
    keep_unmapped : bool, optional (keyword-only)
        If True (default), values without a label keep their original value
        instead of being replaced by NaN. Pass False to get NaN for them.
    
    Returns:
    --------
    pd.DataFrame : copy of df with values replaced by labels; df is not modified
    """
    df_labeled = df.copy()
    
    if columns_to_label is None:
        columns_to_label = df.columns
    
    labeled_count = 0
    skipped_vars = []
    
    for col in columns_to_label:
        # Nothing to do without a mapping, or without the column itself
        if col not in codebook or col not in df.columns:
            continue
        
        lookup = _add_unpadded_aliases(codebook[col])
        
        # Test mapping on a sample to see if it's appropriate
        sample = df[col].dropna().head(1000)
        if len(sample) > 0:
            mapping_success_rate = _map_codes(sample, lookup).notna().sum() / len(sample)
            
            if mapping_success_rate < 0.5:
                skipped_vars.append(f"{col} ({mapping_success_rate:.1%} mappable)")
                continue
        
        # Apply mapping
        labels = _map_codes(df[col], lookup)
        mapped = int(labels.notna().sum())
        if keep_unmapped:
            # Do not discard data the codebook has no label for
            labels = labels.where(labels.notna(), df[col])
        df_labeled[col] = labels
        
        # Report how many values were successfully mapped
        if mapped > 0:
            labeled_count += 1
            non_null = int(df[col].notna().sum())
            share = mapped / non_null * 100 if non_null else 0.0
            print(f"  Labeled {col}: {mapped:,} / {non_null:,} values ({share:.1f}%)")
    
    if skipped_vars:
        print(f"\n  Skipped (continuous/identifier): {', '.join(skipped_vars)}")
    
    print(f"\nSuccessfully labeled {labeled_count} columns")
    return df_labeled

## Download data
### Raw data ASCII file

In [5]:
!wget {raw_data_url}

--2025-11-04 23:15:44--  https://www.cdc.gov/brfss/annual_data/2024/files/LLCP2024ASC.zip
Resolving www.cdc.gov (www.cdc.gov)... 23.6.96.221, 2600:1409:9800:1a88::2461, 2600:1409:9800:1a82::2461
Connecting to www.cdc.gov (www.cdc.gov)|23.6.96.221|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53411907 (51M) [application/x-zip-compressed]
Saving to: ‘LLCP2024ASC.zip.1’


2025-11-04 23:15:45 (42.9 MB/s) - ‘LLCP2024ASC.zip.1’ saved [53411907/53411907]



In [6]:
# Capture the shell listing of the downloaded ASCII zip archive(s).
# NOTE(review): asc_zip is not referenced by any cell visible here, and no visible
# cell unzips this archive — confirm the .ASC file is extracted before the next cell.
asc_zip = !ls LLCP2024ASC*.zip

In [7]:
# List files matching the ASCII data file pattern and keep the first entry;
# assumes the .ASC file has already been extracted from the zip — TODO confirm
asc_file = !ls LLCP2024*.ASC*
asc_file = asc_file[0]

### Data dictionary file (HTML)

In [23]:
!wget {data_dict_url}

--2025-11-04 23:39:22--  https://www.cdc.gov/brfss/annual_data/2024/zip/codebook24_llcp-v2-508.zip
Resolving www.cdc.gov (www.cdc.gov)... 23.6.96.221, 2600:1409:9800:1a82::2461, 2600:1409:9800:1a88::2461
Connecting to www.cdc.gov (www.cdc.gov)|23.6.96.221|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71439 (70K) [application/x-zip-compressed]
Saving to: ‘codebook24_llcp-v2-508.zip’


2025-11-04 23:39:27 (3.56 MB/s) - ‘codebook24_llcp-v2-508.zip’ saved [71439/71439]



In [24]:
# Capture the shell listing of the downloaded codebook zip archive(s);
# the first entry is the one unzipped in the next cell
dict_zip = !ls codebook24_llcp*.zip

In [26]:
!unzip {dict_zip[0]}

Archive:  codebook24_llcp-v2-508.zip
  inflating: USCODE24_LLCP_082125.HTML  


In [8]:
# List the extracted HTML codebook file(s) and keep the first entry of the listing
html_file = !ls USCODE24_LLCP*.HTML
html_file = html_file[0]

## Parse data

In [None]:
# Step 1: Define columns you want to extract (using labels from HTML)
# The labels must match the data dictionary's "Label:" text exactly; labels that
# are not found are reported with a warning in Step 3 and left out.
# To add more columns, simply add their labels to this list
columns_to_extract = [
    "State FIPS Code",
    "Annual Sequence Number",
    "(Ever told) you had diabetes",
    "Ever been told by a doctor or other health professional that you have pre-diabetes or borderline diabetes?",
    "What type of diabetes do you have?",
    "Urban/Rural Status",
    "Reported age in five-year age categories calculated variable",
    "Sex of Respondent",
    "Computed Race-Ethnicity grouping",
    "Education Level",
    "Income Level",
    "Have Personal Health Care Provider?",
    "Could Not Afford To See Doctor",
    "Computed Weight in Kilograms",
    "Computed Height in Meters",
    "Computed body mass index",
    "Exercise in Past 30 Days",
    "How often did you drink regular soda or pop that contains sugar?",
    "How often did you drink sugar-sweetened drinks?",
    "Computed Smoking Status",
    "Computed number of drinks of alcohol beverages per week",
    "General Health",
    "Ever Diagnosed with Heart Attack",
    "Ever Diagnosed with Angina or Coronary Heart Disease",
    "Ever Diagnosed with a Stroke",
    "Ever told you have kidney disease?",
    "Ever Told Had Asthma",
    "(Ever told) you had a depressive disorder",
    "Told Had Arthritis"
]

# Step 2: Parse HTML data dictionary (single pass for both metadata and value labels)
print("Parsing HTML data dictionary...")
column_lookup, codebook = parse_brfss_dictionary(html_file)

# Step 3: Convert labels to colspecs for pd.read_fwf()
colspecs = []
column_names = []
dtypes = {}

print("\nMapping columns:")
for label in columns_to_extract:
    if label in column_lookup:
        col_info = column_lookup[label]
        col_range = col_info['column_range']
        
        # Parse "1-2" or "149" format
        if '-' in col_range:
            start, end = map(int, col_range.split('-'))
        else:
            start = end = int(col_range)
        
        # The dictionary uses 1-based inclusive positions; read_fwf expects
        # 0-based half-open intervals, hence (start - 1, end)
        colspecs.append((start - 1, end))
        
        # Use SAS variable name for column name (to match codebook keys)
        col_name = col_info['sas_name']
        column_names.append(col_name)
        
        # Read everything as string so zero-padded codes are preserved
        # (can convert later)
        dtypes[col_name] = str
        
        print(f"  {label} -> {col_name} (columns {col_range})")
    else:
        print(f"  WARNING: '{label}' not found in data dictionary")

print(f"\nPrepared to extract {len(colspecs)} columns from ASCII file")

# Step 4: Read the ASCII file using pd.read_fwf()
print(f"\nReading ASCII file: {asc_file}")
df = pd.read_fwf(
    asc_file, 
    colspecs=colspecs,
    names=column_names,
    dtype=dtypes,
    encoding='ascii'
)

print(f"Successfully loaded {len(df):,} rows and {len(df.columns)} columns")

# Step 5: Apply value labels to DataFrame
print("\nApplying value labels to DataFrame...")
df = apply_value_labels(df, codebook)

# Step 6: Display results
print("\nFirst 5 rows:")
display(df.head())

print("\nDataset info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output
df.info()

print("\nColumn value counts:")
for col in df.columns:
    print(f"\n{col}:")
    print(df[col].value_counts().head(10))

In [37]:
# True when no (_STATE, SEQNO) combination appears on more than one row
not df.duplicated(subset=['_STATE', 'SEQNO']).any()

True

In [None]:
# Target column for diabetes risk, given as the SAS variable name that is used for
# the DataFrame columns (presumably the "(Ever told) you had diabetes" item — TODO confirm)
target_col = "DIABETE4"

In [None]:
# Data-dictionary labels of the candidate feature variables: the same labels as in
# columns_to_extract, without the state/sequence identifiers and the diabetes items
feature_labels = [
    "Urban/Rural Status",
    "Reported age in five-year age categories calculated variable",
    "Sex of Respondent",
    "Computed Race-Ethnicity grouping",
    "Education Level",
    "Income Level",
    "Have Personal Health Care Provider?",
    "Could Not Afford To See Doctor",
    "Computed Weight in Kilograms",
    "Computed Height in Meters",
    "Computed body mass index",
    "Exercise in Past 30 Days",
    "How often did you drink regular soda or pop that contains sugar?",
    "How often did you drink sugar-sweetened drinks?",
    "Computed Smoking Status",
    "Computed number of drinks of alcohol beverages per week",
    "General Health",
    "Ever Diagnosed with Heart Attack",
    "Ever Diagnosed with Angina or Coronary Heart Disease",
    "Ever Diagnosed with a Stroke",
    "Ever told you have kidney disease?",
    "Ever Told Had Asthma",
    "(Ever told) you had a depressive disorder",
    "Told Had Arthritis"
]

In [11]:
# Inspect the value-to-label mapping parsed for the _AGEG5YR variable
codebook['_AGEG5YR']

{'1': 'Age 18 to 24',
 '2': 'Age 25 to 29',
 '3': 'Age 30 to 34',
 '4': 'Age 35 to 39',
 '5': 'Age 40 to 44',
 '6': 'Age 45 to 49',
 '7': 'Age 50 to 54',
 '8': 'Age 55 to 59',
 '9': 'Age 60 to 64',
 '10': 'Age 65 to 69',
 '11': 'Age 70 to 74',
 '12': 'Age 75 to 79',
 '13': 'Age 80 or older',
 '14': 'Donï¿½t know/Refused/Missing'}