In [2]:
import pandas as pd
import json

In [9]:
#Dataset catalog from https://source.coop/harvard-lil/gov-data/web/data
datasets = pd.read_parquet('~/Downloads/datasets.parquet', engine='fastparquet')

In [16]:
datasets.loc[datasets.title == 'Behavioral Risk Factor Surveillance System (BRFSS) Prevalence Data (2010 and prior)'].name.iloc[0]

'behavioral-risk-factor-surveillance-system-brfss-prevalence-data-2010-and-prior'

In [3]:
v1 = pd.read_csv('~/Downloads/v1/data/files/rows.csv')

In [None]:
# https://catalog.data.gov/dataset/behavioral-risk-factor-surveillance-system-brfss-prevalence-data-2010-and-prior
# Metadata: https://catalog.data.gov/harvest/object/4151bcb5-de5b-4097-b69d-33b25bda0fd5

In [4]:
v_latest = pd.read_csv('~/Downloads/Behavioral_Risk_Factor_Surveillance_System__BRFSS__Prevalence_Data__2010_and_prior_.csv')

In [96]:
set(v_latest.columns) ^ set(v1.columns)

set()

In [97]:
categorical = ['Class', 'Topic', 'Question', 'Response', 'Break_Out_Category', 'Break_Out']

In [98]:
for cat in categorical:
    v_latest_vals = set(v_latest[cat].unique())
    v_1_vals = set(v1[cat].unique())
    non_overlap =  v_latest_vals ^ v_1_vals 
    if non_overlap != {}:
        print(f'Category: {cat}')
        print(f'Latest version: {[v for v in v_latest_vals if v not in v_1_vals]}')
        print(f'Version 1: {[v for v in v_1_vals if v not in v_latest_vals]}')
        print('----------------')
        

Category: Class
Latest version: []
Version 1: []
----------------
Category: Topic
Latest version: []
Version 1: []
----------------
Category: Question
Latest version: ['Sex of respondent']
Version 1: ['Gender of respondent']
----------------
Category: Response
Latest version: []
Version 1: []
----------------
Category: Break_Out_Category
Latest version: ['Sex']
Version 1: ['Gender']
----------------
Category: Break_Out
Latest version: []
Version 1: []
----------------


In [7]:
v1.groupby('Break_Out_Category').Year.count().sort_values()

Break_Out_Category
Overall                86487
Gender                160196
Education Attained    323122
Race/Ethnicity        364552
Household Income      399078
Age Group             439995
Name: Year, dtype: int64

In [8]:
v_latest.groupby('Break_Out_Category').Year.count().sort_values()

Break_Out_Category
Overall                86487
Sex                   160196
Education Attained    323122
Race/Ethnicity        364552
Household Income      399078
Age Group             439995
Name: Year, dtype: int64

In [12]:
for c in sorted(v1.columns):
    print(c)

BreakOutCategoryID
Break_Out
Break_Out_Category
BreakoutID
Class
ClassId
Confidence_limit_High
Confidence_limit_Low
DataSource
Data_Value_Footnote
Data_Value_Footnote_Symbol
Data_value
Data_value_type
Data_value_unit
Display_order
GeoLocation
LocationID
Locationabbr
Locationdesc
Question
QuestionID
Response
ResponseID
Sample_Size
Topic
TopicId
Year


In [16]:
v1.loc[v1.Break_Out_Category == 'Gender'].Break_Out.unique()

<StringArray>
['Female', 'Male']
Length: 2, dtype: str

In [17]:
v1.loc[v1.Break_Out_Category == 'Gender'].Class.unique()

<StringArray>
[                'Demographics', 'Overweight and Obesity (BMI)',
                  'Oral Health',        'Cholesterol Awareness',
          'Alcohol Consumption',    'Chronic Health Indicators',
                  'Tobacco Use',                'Health Status',
  'Health Care Access/Coverage',            'Physical Activity',
  'Colorectal Cancer Screening',       'Hypertension Awareness',
                 'Immunization',              'Prostate Cancer',
               'Women's Health',                       'Injury',
        'Fruits and Vegetables']
Length: 17, dtype: str

In [101]:
len(v1) == len(v_latest)

True

In [102]:
import hashlib

In [103]:
import numpy as np

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)

In [108]:
columns = [c for c in v1.columns if c not in ['Break_Out_Category', 'Question']]
def hashing_fn(row):
    # If a row
    tail_values = []
    if row.Break_Out_Category == 'Sex':
        tail_values.append('Gender')
    else:
        tail_values.append(row.Break_Out_Category)
    if row.Question == 'Sex of respondent':
        tail_values.append('Gender of respondent')
    else:
        tail_values.append(row.Question)
    values = [row[c] for c in columns] + tail_values
    return hashlib.sha256(json.dumps(values, cls=NpEncoder).encode('utf-8')).hexdigest()
    

In [109]:
v1['id'] = v1.apply(hashing_fn, axis=1)

In [110]:
len(v1.id.unique()) == len(v1)
    

True

In [111]:
v_latest['id'] = v_latest.apply(hashing_fn, axis=1)

In [112]:
len(v_latest.id.unique()) == len(v_latest)

True

In [114]:
len(v1.loc[v1.id.isin(v_latest.id)]) == len(v1)

True

In [116]:
gender_set = v1.loc[(v1.Break_Out_Category == 'Gender') | (v1.Question == 'Gender of respondent')]

In [117]:
len(gender_set) / len(v1)

0.11003704685270915

In [118]:
sample_ids = v1.loc[~v1.id.isin(gender_set.id)].id.sample(n=1000)

In [119]:
sample_gender_ids = gender_set.id.sample(n=110)

In [124]:
v1_sample = v1.loc[v1.id.isin(list(sample_ids.values) + list(sample_gender_ids.values))]

In [126]:
len(v1_sample)

1110

In [127]:
v_latest_sample = v_latest.loc[v_latest.id.isin(v1_sample.id)]

In [128]:
len(v_latest_sample)

1110

In [130]:
v1_sample.sort_values(by='id').drop('id', axis=1).to_csv('~/Downloads/brfss_prevalence_data_2010_and_prior_v2024.csv', index=False)
v_latest_sample.sort_values(by='id').drop('id', axis=1).to_csv('~/Downloads/brfss_prevalence_data_2010_and_prior_v2025.csv', index=False)