Imports

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re

Read Codebook

In [2]:
with open("../data/USCODE22_LLCP_102523.HTML", "r", encoding='latin1') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, "html.parser")

tables = soup.find_all("table", class_="table", attrs={"summary": "Procedure Report: Report"})

data_element_details = {}
for table in tables:
    data_element_detail = {}
    values = {}
    for i, row in enumerate(table.find_all("tr")):
        cells = [cell.get_text(strip=True, separator='\n').replace('\xa0', ' ') for cell in row.find_all(["td"])]
        if i == 0:
            matches = re.findall(r'(.*?):\s*([^\n]*)', cells[0])
            for match in matches:
                data_element_detail[match[0]] = match[1]
            if "SAS Variable Name" not in data_element_detail:
                break
        if i >= 2:
            notes_index = cells[1].find("\nNotes:")
            if notes_index != -1:
                value_label = cells[1][:notes_index]
            else:
                value_label = cells[1]
            values[cells[0]] = {"Value Label": value_label, "Frequency": cells[2], "Percentage": cells[3], "Weighted Percentage": cells[4]}
    if "SAS Variable Name" in data_element_detail:
        data_element_detail['Valid Values'] = values
        data_element_details[data_element_detail["SAS Variable Name"]] = data_element_detail

Output as csv for manual labelling.  NOTE manual labelling step: labelling each field as either a health outcome or not.

In [3]:
pd.DataFrame.from_dict(data_element_details, orient="index").to_csv("../data/fields.csv")

Read fixed width file data positions from html

In [4]:
url = '../data/CDC - BRFSS 2022 Combined Landline and Telephone Multiple Data.html'
tables = pd.read_html(url)
df_cols = tables[0]
column_names = df_cols['Variable Name']
start_positions = df_cols['Starting Column']
widths = df_cols['Field Length']
colspecs = list(zip(start_positions - 1, start_positions - 1 + widths))

Load file using fixed width positions extracted above

In [5]:
df = pd.read_fwf('../data/LLCP2022.ASC', colspecs=colspecs, header=None, names=column_names)

MISSING CODE TO TRANSLATE NUMERIC VALUES AND SET COLUMN TYPES

Fill NAs

In [7]:
df = df.fillna(value='BLANK')
record_sum = len(df)

Map columns and values to friendly values.

In [10]:
def process_variable(var, values_map):
    if isinstance(var, int):
        result = str(var)
    elif isinstance(var, float):
        result = str(int(var))
    else:
        result = var

    if result in values_map and values_map[result].strip() != '':
        result = f"{result} - {values_map[result]}"
    
    return result.strip()

In [11]:
new_column_names = []
for column_name in df.columns:
    if column_name in data_element_details and 'Label' in data_element_details[column_name]:
        new_column_names.append(data_element_details[column_name]['Label'])
        values_map = {key: value['Value Label'] for key, value in data_element_details[column_name]['Valid Values'].items()}
        df[column_name] = df[column_name].map(lambda x: process_variable(x, values_map))
    else:
        new_column_names.append(column_name)
    
df.columns = new_column_names

In [12]:
df

Unnamed: 0,State FIPS Code,File Month,Interview Date,Interview Month,Interview Day,Interview Year,Final Disposition,Annual Sequence Number,Primary Sampling Unit,Correct telephone number?,...,Drink any alcoholic beverages in past 30 days,Computed drink-occasions-per-day,Binge Drinking Calculated Variable,Computed number of drinks of alcohol beverages per week,Heavy Alcohol Consumption Calculated Variable,Flu Shot Calculated Variable,Pneumonia Vaccination Calculated Variable,Ever been tested for HIV calculated variable,non_blank_count,non_blank_perc
0,1 - Alabama,1 - January,2032022,2 - February,3,2022,1100 - Completed Interview,2022000001,2022000001,"1 - YesGo to LL.02, PVTRESD1",...,2 - No,0 - No Drink-Occasions per day,1 - No,0 - Did not drink,1 - No,1 - Yes,2 - No,2 - No,177,0.558360
1,1 - Alabama,1 - January,2042022,2 - February,4,2022,1100 - Completed Interview,2022000002,2022000002,"1 - YesGo to LL.02, PVTRESD1",...,2 - No,0 - No Drink-Occasions per day,1 - No,0 - Did not drink,1 - No,2 - No,2 - No,2 - No,175,0.552050
2,1 - Alabama,1 - January,2022022,2 - February,2,2022,1100 - Completed Interview,2022000003,2022000003,"1 - YesGo to LL.02, PVTRESD1",...,2 - No,0 - No Drink-Occasions per day,1 - No,0 - Did not drink,1 - No,BLANK - Age < 65,BLANK - Age < 65,2 - No,171,0.539432
3,1 - Alabama,1 - January,2032022,2 - February,3,2022,1100 - Completed Interview,2022000004,2022000004,"1 - YesGo to LL.02, PVTRESD1",...,2 - No,0 - No Drink-Occasions per day,1 - No,0 - Did not drink,1 - No,9 - Dont know/Not Sure Or Refused/Missing,9 - Dont know/Not Sure Or Refused/Missing,2 - No,162,0.511041
4,1 - Alabama,1 - January,2022022,2 - February,2,2022,1100 - Completed Interview,2022000005,2022000005,"1 - YesGo to LL.02, PVTRESD1",...,1 - Yes,10,1 - No,140,1 - No,BLANK - Age < 65,BLANK - Age < 65,2 - No,176,0.555205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445127,78 - Virgin Islands,11 - November,12192022,12 - December,19,2022,1100 - Completed Interview,2022001527,2022001527,BLANK - Missing,...,7 - Dont know/Not Sure,900 - Dont know/Not Sure Or Refused/Missing,9 - Dont know/Refused/Missing,99900 - Dont know/Not sure/Refused/Missing,9 - Dont know/Refused/Missing,BLANK - Age < 65,BLANK - Age < 65,1 - Yes,177,0.558360
445128,78 - Virgin Islands,11 - November,12212022,12 - December,21,2022,1100 - Completed Interview,2022001528,2022001528,BLANK - Missing,...,2 - No,0 - No Drink-Occasions per day,1 - No,0 - Did not drink,1 - No,BLANK - Age < 65,BLANK - Age < 65,1 - Yes,160,0.504732
445129,78 - Virgin Islands,11 - November,11292022,11 - November,29,2022,1100 - Completed Interview,2022001529,2022001529,BLANK - Missing,...,7 - Dont know/Not Sure,900 - Dont know/Not Sure Or Refused/Missing,9 - Dont know/Refused/Missing,99900 - Dont know/Not sure/Refused/Missing,9 - Dont know/Refused/Missing,2 - No,2 - No,2 - No,169,0.533123
445130,78 - Virgin Islands,11 - November,12082022,12 - December,8,2022,1100 - Completed Interview,2022001530,2022001530,BLANK - Missing,...,2 - No,0 - No Drink-Occasions per day,1 - No,0 - Did not drink,1 - No,1 - Yes,1 - Yes,1 - Yes,165,0.520505


Export to CSV

MISSING CODE TO CREATE PICKLE OF DF INSTEAD OF CSV

In [13]:
df.to_csv('../data/processed.csv')