### Import Libraries

In [1]:
import requests
import pandas as pd
import numpy as np
import subprocess
import sys
import os
from urllib.request import urlretrieve
from zipfile import ZipFile

### Define our functions to handle the downloading of our BFRSS datafile

In [4]:
def download_file(download_f, filename, force=False):
    """
    Downloads a file using a dynamic function.

    :param download_f: Function to be used for the actual download operation
    :param filename: File name for saving
    :param force: Whether to overwrite an existing file
    """
    if force or not os.path.exists(filename):
        download_f(filename)

In [5]:
def download_cdc_survey(filename):
    """
    Downloads the cdc survey dataset.

    :param filename: File name for saving
    """
    def download(filename):
        """
        Actual download logic for the personal cdc survey dataset

        :param filename: File name for saving
        """
        url = 'https://www.cdc.gov/brfss/annual_data/2022/files/LLCP2022ASC.zip'
        urlretrieve(url, filename)
    download_file(download, filename)
    with ZipFile(filename, 'r') as zObject:
        zObject.extractall(path='./cdc_survey_unzipped/')


In [6]:
cdc_survey_csv_filename = 'cdc_survey.csv'
download_cdc_survey(cdc_survey_csv_filename)


### Parse the BFRSS data

In [7]:
state_code_to_abbr = {
    1: 'AL', 2: 'AK', 4: 'AZ', 5: 'AR', 6: 'CA', 8: 'CO', 9: 'CT', 10: 'DE', 11: 'DC', 12: 'FL', 13: 'GA', 15: 'HI', 16: 'ID', 17: 'IL', 18: 'IN', 19: 'IA', 20: 'KS',
    21: 'KY', 22: 'LA', 23: 'ME', 24: 'MD', 25: 'MA', 26: 'MI', 27: 'MN', 28: 'MS', 29: 'MO', 30: 'MT', 31: 'NE', 32: 'NV', 33: 'NH', 34: 'NJ', 35: 'NM', 36: 'NY',
    37: 'NC', 38: 'ND', 39: 'OH', 40: 'OK', 41: 'OR', 42: 'PA', 44: 'RI', 45: 'SC', 46: 'SD', 47: 'TN', 48: 'TX', 49: 'UT', 50: 'VT', 51: 'VA', 53: 'WA', 54: 'WV', 55: 'WI', 56: 'WY'
    }

def parse_cdc_survey():
  """
  Reads the CDC survey input file and parses the contents using ASCII format.

  :return: CDC survey dataframe.
  """
  rows = []
  with open('./cdc_survey_unzipped/LLCP2022.ASC ', 'r') as f:
    count = 0
    for line in f:
      row = {}
      state_code = int(line[:2])
      interview_completed = line[31:35] == '1100'
      if interview_completed and state_code in state_code_to_abbr:
        row['State Abbr.'] = state_code_to_abbr[state_code]
        row['Interview Year'] = line[22:26]

        age_older_18_landline = line[67]
        if age_older_18_landline == '1':
          row['Is Adult Landline'] = True
        elif age_older_18_landline == '2':
          row['Is Adult Landline'] = False

        gender = line[1979]
        if gender == '1':
          row['Gender'] = 'Male'
        elif gender == '2':
          row['Gender'] = 'Female'

        num_of_adults_landline = line[69:71].strip()
        if num_of_adults_landline:
          row['Num of Adults Landline'] = int(num_of_adults_landline)


        age_older_18_cell = line[80]
        if age_older_18_cell == '1':
          row['Is Adult Cell'] = True

        num_of_adults_cell = line[88:90].strip()
        if num_of_adults_cell and int(num_of_adults_cell) <= 76:
          row['Num of Adults Cell'] = int(num_of_adults_cell)

        general_health = line[100].strip()
        if general_health == '1':
          row['General Health'] = 'Excellent'
        elif general_health == '2':
          row['General Health'] = 'Very good'
        elif general_health == '3':
          row['General Health'] = 'Good'
        elif general_health == '4':
          row['General Health'] = 'Fair'
        elif general_health == '5':
          row['General Health'] = 'Poor'

        health_care_access = line[107:109].strip()
        if health_care_access:
          health_care_access_code = int(health_care_access)
          if health_care_access_code == 1:
            row['Health Care Access'] = 'from_employer'
          elif health_care_access_code == 2:
            row['Health Care Access'] = 'from_own_pocket'
          elif health_care_access_code == 3:
            row['Health Care Access'] = 'medicare'
          elif health_care_access_code == 4:
            row['Health Care Access'] = 'medigap'
          elif health_care_access_code == 5:
            row['Health Care Access'] = 'medicaid'
          elif health_care_access_code == 6:
            row['Health Care Access'] = 'children_health_insurance_program'
          elif health_care_access_code == 7:
            row['Health Care Access'] = 'military'
          elif health_care_access_code == 8:
            row['Health Care Access'] = 'indian_health'
          elif health_care_access_code == 9:
            row['Health Care Access'] = 'state_sponsored'
          elif health_care_access_code == 10:
            row['Health Care Access'] = 'other_government'
          elif health_care_access_code == 88:
            row['Health Care Access'] = 'no_coverage'

        could_afford_doctor = line[110].strip()
        if could_afford_doctor:
          could_afford_doctor_code = int(could_afford_doctor)
          if could_afford_doctor_code == 1:
            row['Could Afford Doctor'] = True
          elif could_afford_doctor_code == 2:
            row['Could Afford Doctor'] = False

        how_many_years_since_last_checkup = line[111].strip()
        if how_many_years_since_last_checkup:
          how_many_years_since_last_checkup_code = int(how_many_years_since_last_checkup)
          if how_many_years_since_last_checkup_code == 1:
            row['Years Since Last Checkup'] = 'within_past_year'
          elif how_many_years_since_last_checkup_code == 2:
            row['Years Since Last Checkup'] = 'within_past_two_years'
          elif how_many_years_since_last_checkup_code == 3:
            row['Years Since Last Checkup'] = 'within_past_five_years'
          elif how_many_years_since_last_checkup_code == 4:
            row['Years Since Last Checkup'] = 'five_or_more_years'

        exercise_in_past_30_days = line[112].strip()
        if exercise_in_past_30_days:
          exercise_in_past_30_days_code = int(exercise_in_past_30_days)
          if exercise_in_past_30_days_code == 1:
            row['Exercise in Past 30 Days'] = True
          elif exercise_in_past_30_days_code == 2:
            row['Exercise in Past 30 Days'] = False

        hours_of_sleeping = line[113:115].strip()
        if hours_of_sleeping:
          hours_of_sleeping_code = int(hours_of_sleeping)
          if hours_of_sleeping_code <= 24:
            row['Hours of Sleeping'] = hours_of_sleeping_code

        how_many_years_since_last_dentist_visit = line[115].strip()
        if how_many_years_since_last_dentist_visit:
          how_many_years_since_last_dentist_visit_code = int(how_many_years_since_last_dentist_visit)
          if how_many_years_since_last_dentist_visit_code == 1:
            row['Years Since Last Dentist Visit'] = 'within_past_year'
          elif how_many_years_since_last_dentist_visit_code == 2:
            row['Years Since Last Dentist Visit'] = 'within_past_two_years'
          elif how_many_years_since_last_dentist_visit_code == 3:
            row['Years Since Last Dentist Visit'] = 'within_past_five_years'
          elif how_many_years_since_last_dentist_visit_code == 4:
            row['Years Since Last Dentist Visit'] = 'five_or_more_years'

        heart_attack = line[117].strip()
        if heart_attack:
          if heart_attack == '1':
            row['Heart Attack'] = True
          elif heart_attack == '2':
            row['Heart Attack'] = False

        heart_disease = line[118].strip()
        if heart_disease:
          if heart_disease == '1':
            row['Heart Disease'] = True
          elif heart_disease == '2':
            row['Heart Disease'] = False

        stroke = line[119].strip()
        if stroke:
          if stroke == '1':
            row['Stroke'] = True
          elif stroke == '2':
            row['Stroke'] = False

        asthma = line[1908].strip()
        if asthma:
          if asthma == '1':
            row['Asthma'] = False
          elif asthma == '2':
            row['Asthma'] = True

        asthma_in_effect = line[1909].strip()
        if asthma_in_effect:
          if asthma_in_effect == '2':
            row['Asthma in Effect'] = True
          elif asthma_in_effect == '1':
            row['Asthma in Effect'] = False

        depression = line[125].strip()
        if depression:
          if depression == '1':
            row['Depression'] = True
          elif depression == '2':
            row['Depression'] = False

        kidney_disease = line[126].strip()
        if kidney_disease:
          if kidney_disease == '1':
            row['Kidney Disease'] = True
          elif kidney_disease == '2':
            row['Kidney Disease'] = False

        arthritis = line[1911].strip()
        if arthritis:
          if arthritis == '1':
            row['Arthritis'] = True
          elif arthritis == '2':
            row['Arthritis'] = False

        diabetes = line[128].strip()
        if diabetes:
          if diabetes == '1':
            row['Diabetes'] = True
          elif diabetes in ['2', '3', '4']:
            row['Diabetes'] = False

        weight_in_pounds = line[188:192].strip()
        if weight_in_pounds:
          weight_in_pounds_code = int(weight_in_pounds)
          if weight_in_pounds_code >= 50 and weight_in_pounds_code <= 776:
            row['Weight in Pounds'] = weight_in_pounds_code

        height_in_inches = line[1986:1989].strip()
        if height_in_inches:
          height_in_inches_code = int(height_in_inches)
          row['Height in Inches'] = height_in_inches_code


        smoking = line[223].strip()
        if smoking:
          if smoking == '1':
            row['Smoking'] = 'every_day'
          elif smoking == '2':
            row['Smoking'] = 'some_days'
          elif smoking == '3':
            row['Smoking'] = 'never'

        cigarettes_per_day = line[232:235].strip()
        if cigarettes_per_day:
          cigarettes_per_day_code = int(cigarettes_per_day)
          if cigarettes_per_day_code <= 300:
            row['Cigarettes per Day'] = cigarettes_per_day_code

        drinks_last_30_day = line[241:243].strip()
        if drinks_last_30_day:
          drinks_last_30_day_code = int(drinks_last_30_day)
          if drinks_last_30_day_code <= 76:
            row['Drinks in Last 30 Days'] = drinks_last_30_day_code

        pre_diabetes = line[269].strip()
        if pre_diabetes:
          if pre_diabetes == '1':
            row['Prediabetes'] = True
          elif pre_diabetes in ['2', '3']:
            row['Prediabetes'] = False

        diabetes = line[270].strip()
        if diabetes:
          if diabetes in ['1', '2']:
            row['Diabetes'] = True

        shortness_of_breath = line[305].strip()
        if shortness_of_breath:
          if shortness_of_breath == '1':
            row['Shortness of Breath'] = True
          elif shortness_of_breath == '2':
            row['Shortness of Breath'] = False

        life_satisfaction = line[360].strip()
        if life_satisfaction:
          if life_satisfaction == '1':
            row['Life Satisfaction'] = 'very_satisfied'
          elif life_satisfaction == '2':
            row['Life Satisfaction'] = 'satisfied'
          elif life_satisfaction == '3':
            row['Life Satisfaction'] = 'dissatisfied'
          elif life_satisfaction == '4':
            row['Life Satisfaction'] = 'very_dissatisfied'

        ethnicity = line[1975].strip()
        if ethnicity:
          if ethnicity == '1':
            row['Ethnicity'] = 'White'
          elif ethnicity == '2':
            row['Ethnicity'] = 'Black'
          elif ethnicity == '3':
            row['Ethnicity'] = 'American Indian and Alaskan Native'
          elif ethnicity in ['4', '5']:
            row['Ethnicity'] = 'Asian and Pacific Islander'
          elif ethnicity == '8':
            row['Ethnicity'] = 'Hispanic'
          elif ethnicity == '7':
            row['Ethnicity'] = 'Multiracial, non-Hispanic'

        metropolitan = line[1401].strip()
        if metropolitan:
          if metropolitan == '1':
            row['Metropolitan'] = True
          elif metropolitan == '2':
            row['Metropolitan'] = False

        age_group = line[1985].strip()
        if age_group:
          if age_group == '1':
            row['Age Group'] = 'between_18_and_24'
          elif age_group == '2':
            row['Age Group'] = 'between_25_and_34'
          elif age_group == '3':
            row['Age Group'] = 'between_35_and_44'
          elif age_group == '4':
            row['Age Group'] = 'between_45_and_54'
          elif age_group == '5':
            row['Age Group'] = 'between_55_and_64'
          elif age_group == '6':
            row['Age Group'] = 'older_than_65'

        bmi_category = line[2001].strip()
        if bmi_category:
          if bmi_category == '1':
            row['BMI Category'] = 'underweight'
          elif bmi_category == '2':
            row['BMI Category'] = 'normal_weight'
          elif bmi_category == '3':
            row['BMI Category'] = 'over_weight'
          elif bmi_category == '4':
            row['BMI Category'] = 'obese'

        education = line[2004].strip()
        if education:
          if education == '1':
            row['Education'] = 'did_not_graduate_high_school'
          elif education == '2':
            row['Education'] = 'graduated_high_school'
          elif education == '3':
            row['Education'] = 'attended_college'
          elif education == '4':
            row['Education'] = 'graduated_college'

        income_level = line[2005].strip()
        if income_level:
          if income_level == '1':
            row['Income Level'] = 'less_than_15K'
          elif income_level == '2':
            row['Income Level'] = 'between_15K_and_25K'
          elif income_level == '3':
            row['Income Level'] = 'between_25K_and_35K'
          elif income_level == '4':
            row['Income Level'] = 'between_35K_and_50K'
          elif income_level == '5':
            row['Income Level'] = 'between_50K_and_100K'
          elif income_level == '6':
            row['Income Level'] = 'between_100K_and_200K'
          elif income_level == '7':
            row['Income Level'] = 'more_than_200K'

        heavy_alcohol_consumption = line[2047].strip()
        if heavy_alcohol_consumption:
          if heavy_alcohol_consumption == '1':
            row['Heavy Alcohol Consumption'] = False
          elif heavy_alcohol_consumption == '2':
            row['Heavy Alcohol Consumption'] = True

        rows.append(row)

    df = pd.DataFrame.from_dict(rows).astype({'Is Adult Landline': bool, 'Could Afford Doctor': bool, 'Exercise in Past 30 Days': bool, 'Heart Attack': bool, 'Heart Disease': bool,
                                              'Stroke': bool, 'Asthma': bool, 'Asthma in Effect': bool, 'Depression': bool, 'Kidney Disease': bool, 'Arthritis': bool,
                                              'Diabetes': bool, 'Metropolitan': bool, 'Heavy Alcohol Consumption': bool, 'Prediabetes': bool, 'Shortness of Breath': bool
                                              }

                                             )

    return df

In [8]:
df_cdc_survey = parse_cdc_survey()
df_cdc_survey.head(100)

Unnamed: 0,State Abbr.,Interview Year,Is Adult Landline,Gender,Num of Adults Landline,General Health,Could Afford Doctor,Years Since Last Checkup,Exercise in Past 30 Days,Hours of Sleeping,...,BMI Category,Income Level,Smoking,Cigarettes per Day,Drinks in Last 30 Days,Prediabetes,Years Since Last Dentist Visit,Is Adult Cell,Num of Adults Cell,Shortness of Breath
0,AL,2022,True,Female,2.0,Very good,False,within_past_year,False,8.0,...,,,,,,True,,,,True
1,AL,2022,True,Female,2.0,Excellent,False,,False,6.0,...,over_weight,between_25K_and_35K,,,,True,,,,True
2,AL,2022,True,Female,1.0,Very good,False,within_past_year,True,5.0,...,over_weight,between_100K_and_200K,,,,True,,,,True
3,AL,2022,True,Female,3.0,Excellent,False,within_past_year,True,7.0,...,normal_weight,,some_days,2.0,,True,,,,True
4,AL,2022,True,Female,2.0,Fair,False,within_past_year,True,9.0,...,normal_weight,between_25K_and_35K,,,2.0,True,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,AL,2022,True,Male,2.0,Good,False,within_past_year,False,7.0,...,obese,,never,10.0,,True,,,,True
96,AL,2022,True,Female,1.0,Very good,False,within_past_year,True,8.0,...,over_weight,between_25K_and_35K,never,10.0,,True,,,,True
97,AL,2022,True,Female,1.0,Good,False,within_past_year,True,8.0,...,obese,less_than_15K,never,5.0,,True,,,,True
98,AL,2022,True,Male,2.0,Excellent,False,within_past_year,True,6.0,...,obese,between_50K_and_100K,,,,True,,,,True


### Export the parsed dataframe as a csv file

In [9]:
# Step to export the dataframe to a CSV file
def export_to_csv(df, output_filename):
    """
    Exports a dataframe to a CSV file.

    :param df: The dataframe to export.
    :param output_filename: The name of the output CSV file.
    """
    df.to_csv(output_filename, index=False)

# Call the function to export the DataFrame
export_to_csv(df_cdc_survey, 'cdc_survey_cleaned.csv')