Imports

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import plotly.graph_objects as go
import numpy as np

Read Codebook

In [2]:
# Read the codebook HTML (decoded as latin1) and parse it with BeautifulSoup.
with open("USCODE22_LLCP_102523.HTML", "r", encoding='latin1') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, "html.parser")

# Only tables with class "table" and this exact summary attribute are considered.
tables = soup.find_all("table", class_="table", attrs={"summary": "Procedure Report: Report"})

# Maps each "SAS Variable Name" to its header fields plus a 'Valid Values' dict.
data_element_details = {}
for table in tables:
    data_element_detail = {}
    values = {}
    for i, row in enumerate(table.find_all("tr")):
        cells = [
            cell.get_text(strip=True, separator='\n').replace('\xa0', ' ')
            for cell in row.find_all(["td"])
        ]
        if i == 0:
            # Row 0 holds "Key: value" pairs; later duplicates of a key win.
            data_element_detail.update(re.findall(r'(.*?):\s*([^\n]*)', cells[0]))
            if "SAS Variable Name" not in data_element_detail:
                # Not a variable table: stop reading its rows.
                break
        elif i >= 2:
            # Row 1 is ignored (presumably the column headings - TODO confirm);
            # rows 2+ are keyed by their first cell.
            value_code = cells[0]
            values[value_code] = {
                "Value Label": cells[1],
                "Frequency": cells[2],
                "Percentage": cells[3],
                "Weighted Percentage": cells[4],
            }
    sas_name = data_element_detail.get("SAS Variable Name")
    if sas_name is not None:
        data_element_detail['Valid Values'] = values
        data_element_details[sas_name] = data_element_detail

Output as CSV for manual labelling

In [8]:
# One row per SAS variable name (dict keys become the index), written to fields.csv.
(
    pd.DataFrame
    .from_dict(data_element_details, orient="index")
    .to_csv("fields.csv")
)

Read fixed-width file field positions from HTML

In [None]:
# Parse every table on the saved layout page; only the first one is used.
url = 'CDC - BRFSS 2022 Combined Landline and Telephone Multiple Data.html'
tables = pd.read_html(url)
df_cols = tables[0]

column_names = df_cols['Variable Name']
start_positions = df_cols['Starting Column']
widths = df_cols['Field Length']

# Shift each starting column down by one and pair it with start + length,
# giving the (start, end) tuples passed to read_fwf as colspecs.
colspecs = [
    (start - 1, start - 1 + width)
    for start, width in zip(start_positions, widths)
]

Load the file using the fixed-width positions extracted above

In [None]:
# Read the fixed-width file with the extracted column specs, then replace
# every missing value with the literal string 'BLANK'.
df = (
    pd.read_fwf('LLCP2022.ASC', colspecs=colspecs, header=None, names=column_names)
    .fillna(value='BLANK')
)

In [None]:
# Show the loaded frame as this cell's output.
df

In [None]:
def process_variable(var):
    """Normalise a cell value into the string form used as a lookup key.

    Parameters
    ----------
    var : object
        A single cell value (int, float, str, or anything else).

    Returns
    -------
    object
        * integers (Python or NumPy) -> their decimal string, e.g. ``3 -> '3'``
        * floats holding a whole number -> that number as a string without
          the fractional part, e.g. ``2.0 -> '2'``
        * everything else (strings, NaN, +/-inf, fractional floats) -> the
          value itself, unchanged.

    Notes
    -----
    Fractional floats are deliberately *not* truncated: turning ``1.5`` into
    ``'1'`` would make it match the entry for code 1. NaN and infinities are
    passed through instead of raising from ``int()``.
    """
    # NumPy integer scalars are not instances of the built-in int.
    if isinstance(var, (int, np.integer)):
        return str(var)

    if isinstance(var, (float, np.floating)):
        # is_integer() is False for NaN, +/-inf and fractional values, so
        # int() below can neither raise nor silently truncate.
        if float(var).is_integer():
            return str(int(var))
        return var

    return var

In [None]:
# Swap SAS variable names for their codebook labels and, for those columns,
# translate cell values to their 'Value Label' where a matching key exists.
new_column_names = []
for column_name in df.columns:
    has_label = (
        column_name in data_element_details
        and 'Label' in data_element_details[column_name]
    )
    if not has_label:
        # No label for this column: keep its name and values untouched.
        new_column_names.append(column_name)
        continue

    detail = data_element_details[column_name]
    new_column_names.append(detail['Label'])

    values_map = {}
    for key, value in detail['Valid Values'].items():
        values_map[key] = value['Value Label']

    # Values with no entry in values_map are left as they are.
    df[column_name] = df[column_name].map(
        lambda raw: values_map.get(process_variable(raw), raw)
    )

df.columns = new_column_names

In [None]:
# Count how often each distinct 'Final Disposition' value occurs.
value_counts = df['Final Disposition'].value_counts()

# One bar per distinct value, bar height = number of rows with that value.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=value_counts.index,
        y=value_counts.values,
    )
)

fig.update_layout(
    title='Histogram of Final Disposition',
    xaxis_title='Final Disposition',
    yaxis_title='Count',
)

fig.show()

In [None]:
# Tabulate how often each value occurs in the 'Ever tested H.I.V.' column.
df['Ever tested H.I.V.'].value_counts()