# Diabetes Risk Prediction
This project uses survey data from the Behavioral Risk Factor Surveillance System (BRFSS), available on the [CDC BRFSS 2024 annual data page](https://www.cdc.gov/brfss/annual_data/annual_2024.html), to predict the probability of developing different types of diabetes. Features about U.S. residents include demographic data (e.g. income level, education, race) as well as data regarding health-related risk behaviors, chronic health conditions, and use of preventive services.

This is the first notebook for the project, which parses the raw ASCII data file, available in the link above, to extract the relevant target and feature variables for subsequent EDA and modeling.

## Setup
### Define parameters

In [8]:
# Zip archive holding the raw BRFSS 2024 ASCII data file.
raw_data_url = "https://www.cdc.gov/brfss/annual_data/2024/files/LLCP2024ASC.zip"
# Zip archive holding the HTML codebook (data dictionary) for the same survey year.
data_dict_url = "https://www.cdc.gov/brfss/annual_data/2024/zip/codebook24_llcp-v2-508.zip"

### Import packages

In [9]:
import pandas as pd
from bs4 import BeautifulSoup
import re

### Define functions

In [None]:
def build_column_lookup(html_file):
    """
    Index the variables described in the HTML data dictionary by their label.

    Parameters:
    -----------
    html_file : str
        Path to the HTML data dictionary file

    Returns:
    --------
    dict : One entry per variable label found in the dictionary
           Format: {label: {'column_range': str, 'type': str, 'sas_name': str}}
           'type' and 'sas_name' are None when the cell text does not match
           the corresponding pattern.
    """
    # Every pattern accepts \xa0 (non-breaking space) wherever whitespace may
    # appear, because the HTML uses it between words.
    label_re = re.compile(r'Label:[\s\xa0]+(.+?)Section[\s\xa0]+Name:')
    # Column position is either a single number "N" or a range "N-M".
    column_re = re.compile(r'Column:[\s\xa0]+(\d+(?:-\d+)?)')
    type_re = re.compile(r'Type[\s\xa0]+of[\s\xa0]+Variable:[\s\xa0]+(Num|Char)')
    # The name is matched lazily and must be directly followed by "Question".
    name_re = re.compile(r'SAS[\s\xa0]+Variable[\s\xa0]+Name:[\s\xa0]+(\w+?)(?=Question)')

    def _first_group(found):
        # Captured text of an optional match, or None when nothing matched.
        return found.group(1) if found else None

    with open(html_file, 'r', encoding='windows-1252') as handle:
        soup = BeautifulSoup(handle, 'html.parser')

    column_lookup = {}

    for cell in soup.find_all('td', {'class': 'l m linecontent'}):
        text = cell.get_text()

        # Cells without both markers are not variable definitions.
        if 'Label:' not in text or 'Column:' not in text:
            continue

        label_found = label_re.search(text)
        column_found = column_re.search(text)
        if not (label_found and column_found):
            continue

        # Normalise non-breaking spaces so labels can be looked up with plain text.
        label = label_found.group(1).strip().replace('\xa0', ' ')
        column_lookup[label] = {
            'column_range': column_found.group(1),
            'type': _first_group(type_re.search(text)),
            'sas_name': _first_group(name_re.search(text)),
        }

    print(f"Successfully parsed {len(column_lookup)} variable definitions from HTML dictionary")
    return column_lookup


def parse_value_labels(html_file):
    """
    Parse HTML data dictionary to extract value-to-label mappings.

    Parameters:
    -----------
    html_file : str
        Path to the HTML data dictionary file

    Returns:
    --------
    dict : Dictionary mapping SAS variable names to their value-label mappings
           Format: {sas_variable_name: {value: label}}
           Values are kept as the strings shown in the dictionary. Range rows
           (e.g. "1 - 97") are omitted, and variables left with no mappings
           are not included.
    """
    # build_column_lookup()'s patterns indicate that get_text() fuses the
    # variable name with the following "Question ..." field (e.g.
    # "_STATEQuestion"). A greedy \w+ would therefore produce keys that never
    # match the DataFrame column names. Match lazily and stop at "Question",
    # falling back to a word boundary in case the fields are separated.
    sas_name_re = re.compile(
        r'SAS[\s\xa0]+Variable[\s\xa0]+Name:[\s\xa0]+(\w+?)(?=Question|\b)'
    )
    # Skip-logic instructions appended to a label ("→Go to ...").
    skip_logic_re = re.compile(r'→Go to.*$')

    with open(html_file, 'r', encoding='windows-1252') as f:
        soup = BeautifulSoup(f, 'html.parser')

    codebook = {}

    # Each variable is described by one table with class "table"
    for table in soup.find_all('table', {'class': 'table'}):
        # Extract variable name from metadata cell
        metadata_cell = table.find('td', {'class': 'l m linecontent'})
        if not metadata_cell:
            continue

        var_match = sas_name_re.search(metadata_cell.get_text())
        if not var_match:
            continue

        var_name = var_match.group(1)

        # Extract value-label mappings from table body
        tbody = table.find('tbody')
        if not tbody:
            continue

        value_labels = {}
        for row in tbody.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 2:
                continue

            # Extract value (first column)
            value = cells[0].get_text(strip=True)

            # Skip range values (e.g., "1 - 97", "50 - 0776")
            # These are for continuous variables and should stay numeric
            if ' - ' in value:
                continue

            # Second column holds the label; text nodes are joined with '|'
            # so that notes following the label can be cut off.
            label_text = cells[1].get_text(separator='|')
            label = label_text.split('|')[0].strip()

            # Clean skip logic (remove "→Go to..." instructions)
            label = skip_logic_re.sub('', label).strip()

            value_labels[value] = label

        # Only add to codebook if there are mappings
        if value_labels:
            codebook[var_name] = value_labels

    print(f"Successfully parsed value labels for {len(codebook)} variables")
    return codebook


def apply_value_labels(df, codebook, columns_to_label=None):
    """
    Apply value-to-label mappings to DataFrame columns.

    Values that have an entry in the codebook are replaced by their label.
    Values without an entry are kept exactly as they were, so measurements in
    continuous columns (whose codebook entries only cover special codes) are
    not discarded.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with numeric/coded values
    codebook : dict
        Value label mappings from parse_value_labels()
        Format: {column_name: {value_as_string: label}}
    columns_to_label : list, optional
        Specific columns to label. If None, attempts to label all columns.
        Columns missing from the codebook or from df are skipped.

    Returns:
    --------
    pd.DataFrame : Copy of df with coded values replaced by labels
                   (df itself is not modified)
    """
    df_labeled = df.copy()

    if columns_to_label is None:
        columns_to_label = df.columns

    labeled_count = 0

    for col in columns_to_label:
        # Nothing to do without both a mapping and a column to apply it to
        if col not in codebook or col not in df.columns:
            continue

        value_map = codebook[col]

        # Codebook keys are strings, so compare on the stripped string form
        codes = df[col].astype(str).str.strip()
        labels = codes.map(value_map)
        has_label = labels.notna()

        # Fall back to the original value where the codebook has no label.
        # Object dtype lets labels and untouched values share one column.
        df_labeled[col] = labels.astype(object).where(has_label, df[col])

        # Count how many values were successfully mapped
        mapped = int(has_label.sum())
        if mapped > 0:
            labeled_count += 1
            print(f"  Labeled {col}: {mapped:,} values mapped")

    print(f"\nSuccessfully labeled {labeled_count} columns")
    return df_labeled

## Download data
### Raw data ASCII file

In [5]:
# Download the raw-data archive from raw_data_url with the wget shell command.
!wget {raw_data_url}

--2025-11-04 23:15:44--  https://www.cdc.gov/brfss/annual_data/2024/files/LLCP2024ASC.zip
Resolving www.cdc.gov (www.cdc.gov)... 23.6.96.221, 2600:1409:9800:1a88::2461, 2600:1409:9800:1a82::2461
Connecting to www.cdc.gov (www.cdc.gov)|23.6.96.221|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53411907 (51M) [application/x-zip-compressed]
Saving to: ‘LLCP2024ASC.zip.1’


2025-11-04 23:15:45 (42.9 MB/s) - ‘LLCP2024ASC.zip.1’ saved [53411907/53411907]



In [6]:
# Capture the names of the downloaded raw-data archives that match the glob.
# NOTE(review): asc_zip is not used by any visible cell, and no visible cell
# extracts this archive; the .ASC file listed in the next cell presumably has
# to be unzipped first (e.g. `!unzip {asc_zip[0]}`). Confirm and add the step.
# NOTE(review): the recorded download was saved as 'LLCP2024ASC.zip.1', a name
# this glob does not match because it does not end in ".zip".
asc_zip = !ls LLCP2024ASC*.zip

In [12]:
# List files matching the raw-data glob and keep the first entry as the path
# that is later passed to pd.read_fwf().
# NOTE(review): assumes at least one file matches; confirm what happens otherwise.
asc_file = !ls LLCP2024*.ASC*
asc_file = asc_file[0]

### Data dictionary file (HTML)

In [23]:
# Download the codebook archive from data_dict_url with the wget shell command.
!wget {data_dict_url}

--2025-11-04 23:39:22--  https://www.cdc.gov/brfss/annual_data/2024/zip/codebook24_llcp-v2-508.zip
Resolving www.cdc.gov (www.cdc.gov)... 23.6.96.221, 2600:1409:9800:1a82::2461, 2600:1409:9800:1a88::2461
Connecting to www.cdc.gov (www.cdc.gov)|23.6.96.221|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71439 (70K) [application/x-zip-compressed]
Saving to: ‘codebook24_llcp-v2-508.zip’


2025-11-04 23:39:27 (3.56 MB/s) - ‘codebook24_llcp-v2-508.zip’ saved [71439/71439]



In [24]:
# Capture the names of the downloaded codebook archives that match the glob.
dict_zip = !ls codebook24_llcp*.zip

In [26]:
# Extract the first matching codebook archive into the working directory.
!unzip {dict_zip[0]}

Archive:  codebook24_llcp-v2-508.zip
  inflating: USCODE24_LLCP_082125.HTML  


In [13]:
# List extracted codebook files and keep the first entry as the path that is
# later passed to build_column_lookup() and parse_value_labels().
# NOTE(review): assumes at least one file matches; confirm what happens otherwise.
html_file = !ls USCODE24_LLCP*.HTML
html_file = html_file[0]

## Parse data

In [None]:
# Step 1: Define columns you want to extract (using labels from HTML)
# To add more columns, simply add their labels to this list
columns_to_extract = [
    "State FIPS Code",
    "Annual Sequence Number",
    "(Ever told) you had diabetes",
    "Ever been told by a doctor or other health professional that you have pre-diabetes or borderline diabetes?",
    "What type of diabetes do you have?",
    "Urban/Rural Status",
    "Reported age in five-year age categories calculated variable",
    "Sex of Respondent",
    "Computed Race-Ethnicity grouping",
    "Education Level",
    "Income Level",
    "Have Personal Health Care Provider?",
    "Could Not Afford To See Doctor",
    "Computed Weight in Kilograms",
    "Computed Height in Meters",
    "Computed body mass index",
    "Exercise in Past 30 Days",
    "How often did you drink regular soda or pop that contains sugar?",
    "How often did you drink sugar-sweetened drinks?",
    "Computed Smoking Status",
    "Computed number of drinks of alcohol beverages per week",
    "General Health",
    "Ever Diagnosed with Heart Attack",
    "Ever Diagnosed with Angina or Coronary Heart Disease",
    "Ever Diagnosed with a Stroke",
    "Ever told you have kidney disease?",
    "Ever Told Had Asthma",
    "(Ever told) you had a depressive disorder",
    "Told Had Arthritis"
]

# Step 2: Build lookup dictionary from HTML data dictionary
print("Building column lookup...")
column_lookup = build_column_lookup(html_file)

# Step 2b: Parse value labels from HTML
print("\nParsing value labels...")
codebook = parse_value_labels(html_file)

# Step 3: Convert labels to colspecs for pd.read_fwf()
colspecs = []
column_names = []
dtypes = {}

print("\nMapping columns:")
for label in columns_to_extract:
    col_info = column_lookup.get(label)
    if col_info is None:
        print(f"  WARNING: '{label}' not found in data dictionary")
        continue

    # Use SAS variable name for column name (to match codebook keys).
    # build_column_lookup() stores None when it could not parse the name;
    # such a column could not be addressed afterwards, so skip it.
    col_name = col_info['sas_name']
    if col_name is None:
        print(f"  WARNING: no SAS variable name found for '{label}', skipping")
        continue

    col_range = col_info['column_range']

    # Parse "1-2" or "149" format
    if '-' in col_range:
        start, end = map(int, col_range.split('-'))
    else:
        start = end = int(col_range)

    # The dictionary's positions are 1-based and inclusive; read_fwf expects
    # 0-based half-open intervals, so only the start needs shifting.
    colspecs.append((start - 1, end))
    column_names.append(col_name)

    # Set dtype (start with string for safety, can convert later)
    dtypes[col_name] = str

    print(f"  {label} -> {col_name} (columns {col_range})")

print(f"\nPrepared to extract {len(colspecs)} columns from ASCII file")

# Step 4: Read the ASCII file using pd.read_fwf()
print(f"\nReading ASCII file: {asc_file}")
df = pd.read_fwf(
    asc_file,
    colspecs=colspecs,
    names=column_names,
    dtype=dtypes,
    encoding='ascii'
)

print(f"Successfully loaded {len(df):,} rows and {len(df.columns)} columns")

# Step 5: Apply value labels to DataFrame
print("\nApplying value labels to DataFrame...")
df = apply_value_labels(df, codebook)

# Step 6: Display results
print("\nFirst 5 rows:")
display(df.head())

print("\nDataset info:")
# df.info() prints its report itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line).
df.info()

print("\nColumn value counts:")
for col in df.columns:
    print(f"\n{col}:")
    print(df[col].value_counts().head(10))

In [37]:
# Sanity check: True when no two rows share the same (_STATE, SEQNO) pair,
# i.e. the pair identifies each row uniquely.
len(df) == len(df.drop_duplicates(subset=['_STATE', 'SEQNO']))

True

In [None]:
# Target variable for diabetes risk prediction.
# NOTE(review): this looks like a SAS variable name (the form used for df's
# column names) rather than a codebook label; presumably it corresponds to
# "(Ever told) you had diabetes". Confirm against column_lookup.
target_col = "DIABETE4"

In [None]:
# Candidate feature variables, given as codebook labels (the keys of
# column_lookup). df's columns are named after SAS variable names instead, so
# each label has to be translated via column_lookup[label]['sas_name'] before
# it can be used to select a column.
feature_labels = [
    "Urban/Rural Status",
    "Reported age in five-year age categories calculated variable",
    "Sex of Respondent",
    "Computed Race-Ethnicity grouping",
    "Education Level",
    "Income Level",
    "Have Personal Health Care Provider?",
    "Could Not Afford To See Doctor",
    "Computed Weight in Kilograms",
    "Computed Height in Meters",
    "Computed body mass index",
    "Exercise in Past 30 Days",
    "How often did you drink regular soda or pop that contains sugar?",
    "How often did you drink sugar-sweetened drinks?",
    "Computed Smoking Status",
    "Computed number of drinks of alcohol beverages per week",
    "General Health",
    "Ever Diagnosed with Heart Attack",
    "Ever Diagnosed with Angina or Coronary Heart Disease",
    "Ever Diagnosed with a Stroke",
    "Ever told you have kidney disease?",
    "Ever Told Had Asthma",
    "(Ever told) you had a depressive disorder",
    "Told Had Arthritis"
]

In [23]:
# Labels in the data dictionary that contain the search text
[label for label in column_lookup if "Ever Told Had Asthma" in label]

['Ever Told Had Asthma']