In [40]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

# Codebook metadata, accessed in this notebook as headers_df[<column>][<field>];
# the fields read below are 'possible_answers' and 'label'.
headers_df = pd.read_json('../resources/generated/headers.json')
# Survey responses, one column per question.
dataset = pd.read_parquet('../resources/generated/raw_dataset.parquet')
# NOTE: answer codes are not decoded into their text labels at load time; the
# helper functions further down map codes to labels on demand when counting.

# Column treated as the target variable in the comparisons below.
target_name: str = 'CVDINFR4'

Let's start with the shape of our dataset:

In [41]:
# (rows, columns) of the raw survey frame.
dataset.shape

(445132, 322)

In [42]:
# A question is treated as numerical when at least one of its codebook answers
# is written as a range (contains ' - '). `any` stops at the first match, so
# each column name is recorded once instead of once per range-style answer.
numerical = [
    k
    for k, v in headers_df.to_dict().items()
    if any(' - ' in a for a in v['possible_answers'].keys())
]

# Set lookup avoids rescanning the list for every one of the dataset's columns.
numerical_lookup = set(numerical)

# Every remaining column is cast to the category dtype. The cast is done in
# place on `dataset` because the cells below keep using that same frame.
for col in dataset.columns:
    if col not in numerical_lookup:
        dataset[col] = dataset[col].astype('category')

In [43]:
# Names of the columns that now have the category dtype.
dataset.select_dtypes(include='category').columns

Index(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE',
       'SEQNO', '_PSU', 'CTELENM1',
       ...
       '_RFSMOK3', '_CURECI2', '_SMOKGRP', '_LCSREC', 'DRNKANY6', '_RFBING6',
       '_RFDRHV8', '_FLSHOT7', '_PNEUMO3', '_AIDTST4'],
      dtype='object', length=284)

In [44]:
# Names of the columns that still have a numeric dtype.
dataset.select_dtypes(include='number').columns

Index(['NUMADULT', 'NUMMEN', 'NUMWOMEN', 'HHADULT', 'PHYSHLTH', 'MENTHLTH',
       'POORHLTH', 'SLEPTIM1', 'CHILDREN', 'WEIGHT2', 'HEIGHT3', 'LCSFIRST',
       'LCSLAST', 'LCSNUMCG', 'ALCDAY4', 'AVEDRNK3', 'DRNK3GE5', 'MAXDRNKS',
       'FLSHTMY3', 'HIVTSTD3', 'CHKHEMO3', 'HPVADSHT', 'COVIDFS1', 'COVIDSE1',
       'COPDSMOK', 'CNCRAGE', 'MARIJAN1', '_AGE80', 'HTIN4', 'HTM4', 'WTKG3',
       '_BMI5', '_YRSSMOK', '_PACKDAY', '_PACKYRS', '_YRSQUIT', 'DROCDY4_',
       '_DRNKWK2'],
      dtype='object')

In [45]:
def possible_answers(question_key: str) -> dict:
    """Return the coded answers of a question as an ``{int code: label}`` dict.

    Reads ``headers_df[question_key]['possible_answers']`` and keeps only the
    entries whose key consists solely of digits, converting those keys to
    ``int``. Entries with any other kind of key are left out.

    Args:
        question_key: Column name of the question in ``headers_df``.

    Returns:
        Mapping from integer answer code to its label, in codebook order.
    """
    answers_by_code = {}
    for raw_code, label in headers_df[question_key]['possible_answers'].items():
        if not raw_code.isdigit():
            # Non-numeric keys cannot be matched against the coded data.
            continue
        answers_by_code[int(raw_code)] = label
    return answers_by_code

def count_of(question_key: str, dataset: pd.DataFrame = dataset) -> pd.Series:
    """Count how often each labelled answer to a question occurs.

    The answer codes in ``dataset[question_key]`` are translated to their
    labels with ``possible_answers`` and then tallied. The code-to-label
    mapping is also printed.

    Values that have no entry in the mapping become NaN during the
    translation and are therefore absent from the returned counts, because
    ``value_counts`` drops NaN by default. For a question whose real answers
    are plain numbers, only the special codes that carry a label are counted.

    Args:
        question_key: Column name of the question.
        dataset: Frame to count in; defaults to the full dataset.

    Returns:
        Series indexed by answer label holding the number of occurrences,
        sorted by count in descending order.
    """
    # Look the mapping up once and reuse it for both the printout and the map.
    answers = possible_answers(question_key)
    print(f'possible answers: {answers}')
    return dataset[question_key].map(answers).value_counts()

def plot_bar_of_count(question_key: str, dataset: pd.DataFrame = dataset):
    """Show a bar chart of the labelled answer counts of one question.

    Args:
        question_key: Column name of the question to chart.
        dataset: Frame to count in; defaults to the full dataset.
    """
    answer_counts = count_of(question_key, dataset)

    bars = go.Bar(x=answer_counts.index, y=answer_counts.values)
    fig = go.Figure(data=[bars])

    # The chart title is the question's label taken from the codebook.
    layout_options = {
        'title': headers_df[question_key]['label'],
        'xaxis_title': 'Categories',
        'yaxis_title': 'Count',
        'width': 1100,
        'height': 600,
    }
    fig.update_layout(**layout_options)

    fig.show()


def correlation_plot(df: pd.DataFrame = dataset):
    """Show a heatmap of the pairwise Spearman correlations of ``df``.

    Args:
        df: Frame whose columns are correlated; defaults to the full dataset.
    """
    spearman = df.corr(method='spearman')
    feature_names = spearman.columns

    heatmap_options = {
        'text_auto': True,
        'aspect': "auto",
        # Reversed red-blue colour scale.
        'color_continuous_scale': 'RdBu_r',
        'labels': {'x': "Feature", 'y': "Feature", 'color': "Correlation Coefficient"},
        'x': feature_names,
        'y': feature_names,
    }
    fig = px.imshow(spearman, **heatmap_options)

    fig.update_layout(
        title="BRFSS Correlation Matrix",
        xaxis_title="Features",
        yaxis_title="Features",
    )

    fig.show()

def pie_plot(question_key: str, dataset: pd.DataFrame = dataset):
    """Show a pie chart of the labelled answer counts of one question.

    Args:
        question_key: Column name of the question to chart.
        dataset: Frame to count in; defaults to the full dataset. Accepting a
            frame here mirrors ``plot_bar_of_count`` so that subsets can be
            charted as well.
    """
    value_counts = count_of(question_key, dataset)

    fig = go.Figure(
        data=[go.Pie(labels=value_counts.index, values=value_counts.values)]
    )

    fig.update_layout(
        # The chart title is the question's label taken from the codebook.
        title=headers_df[question_key]['label'],
        width=1100,
        height=600,
        # Horizontal legend centred underneath the pie.
        legend={
            'orientation': 'h',
            'xanchor': 'center',
            'x': 0.5,
            'y': -0.1
        }
    )

    fig.show()

In [49]:
# Raw value distribution of SLEPTIM1. Per the codebook mapping printed further
# down, 77 and 99 are the "Don't know/Not Sure" and "Refused" codes rather than
# real answers.
dataset['SLEPTIM1'].value_counts()

SLEPTIM1
7.0     132927
8.0     125442
6.0      95880
5.0      30122
9.0      21210
4.0      12433
10.0     10459
77.0      4792
3.0       3260
12.0      3004
2.0       1549
1.0       1154
11.0       686
99.0       658
16.0       329
15.0       317
14.0       295
18.0       168
13.0       165
20.0       143
24.0        52
17.0        27
22.0        19
23.0        18
19.0        16
21.0         4
Name: count, dtype: int64

In [46]:
# Questions whose overall answer distribution is charted, one bar chart each.
columns = ['CVDINFR4', '_AGE_G', 'BIRTHSEX', 'DIABETE4', '_DENVST3']

for column in columns:
    plot_bar_of_count(column)


possible answers: {1: 'Yes', 2: 'No', 7: 'Don’t know/Not sure', 9: 'Refused'}


possible answers: {1: 'Age 18 to 24Notes: 18 <= _IMPAGE <= 24', 2: 'Age 25 to 34Notes: 25 <= _IMPAGE <= 34', 3: 'Age 35 to 44Notes: 35 <= _IMPAGE <= 44', 4: 'Age 45 to 54Notes: 45 <= _IMPAGE <= 54', 5: 'Age 55 to 64Notes: 55 <= _IMPAGE <= 64', 6: 'Age 65 or olderNotes: _IMPAGE => 65'}


possible answers: {1: 'Male', 2: 'Female', 7: 'Don’t know/Not Sure', 9: 'Refused'}


possible answers: {1: 'Yes', 2: 'Yes, but female told only during pregnancy—Go to Section 08.01 AGE', 3: 'No—Go to Section 08.01 AGE', 4: 'No, pre-diabetes or borderline diabetes—Go to Section 08.01 AGE', 7: 'Don’t know/Not Sure—Go to Section 08.01 AGE', 9: 'Refused—Go to Section 08.01 AGE'}


possible answers: {1: 'YesNotes: LASTDEN4=1', 2: 'NoNotes: LASTDEN4=2 or 3 or 4', 9: 'Don’t know/Not Sure Or Refused/MissingNotes: LASTDEN4=7 or 9 or Missing'}


In [47]:
# For every feature, chart its answer distribution separately for the rows
# whose target answer code is 1 and for the rows whose target answer code is 2.
# The charts are produced in the same order as before: per feature, code 1
# first and code 2 second.
target_codes = (1, 2)
compared_features = ['DIABETE4', 'BIRTHSEX', '_AGE_G', 'SLEPTIM1']

for feature in compared_features:
    # `cols` is not read in this cell; it is still assigned so that any later
    # cell relying on it sees the same final value as before.
    cols = [target_name, feature]

    # NOTE: for SLEPTIM1 only the codes 77 and 99 have labels, so these two
    # charts show just those categories and not the reported hours.
    for target_code in target_codes:
        plot_bar_of_count(feature, dataset=dataset[dataset[target_name] == target_code])


possible answers: {1: 'Yes', 2: 'Yes, but female told only during pregnancy—Go to Section 08.01 AGE', 3: 'No—Go to Section 08.01 AGE', 4: 'No, pre-diabetes or borderline diabetes—Go to Section 08.01 AGE', 7: 'Don’t know/Not Sure—Go to Section 08.01 AGE', 9: 'Refused—Go to Section 08.01 AGE'}


possible answers: {1: 'Yes', 2: 'Yes, but female told only during pregnancy—Go to Section 08.01 AGE', 3: 'No—Go to Section 08.01 AGE', 4: 'No, pre-diabetes or borderline diabetes—Go to Section 08.01 AGE', 7: 'Don’t know/Not Sure—Go to Section 08.01 AGE', 9: 'Refused—Go to Section 08.01 AGE'}


possible answers: {1: 'Male', 2: 'Female', 7: 'Don’t know/Not Sure', 9: 'Refused'}


possible answers: {1: 'Male', 2: 'Female', 7: 'Don’t know/Not Sure', 9: 'Refused'}


possible answers: {1: 'Age 18 to 24Notes: 18 <= _IMPAGE <= 24', 2: 'Age 25 to 34Notes: 25 <= _IMPAGE <= 34', 3: 'Age 35 to 44Notes: 35 <= _IMPAGE <= 44', 4: 'Age 45 to 54Notes: 45 <= _IMPAGE <= 54', 5: 'Age 55 to 64Notes: 55 <= _IMPAGE <= 64', 6: 'Age 65 or olderNotes: _IMPAGE => 65'}


possible answers: {1: 'Age 18 to 24Notes: 18 <= _IMPAGE <= 24', 2: 'Age 25 to 34Notes: 25 <= _IMPAGE <= 34', 3: 'Age 35 to 44Notes: 35 <= _IMPAGE <= 44', 4: 'Age 45 to 54Notes: 45 <= _IMPAGE <= 54', 5: 'Age 55 to 64Notes: 55 <= _IMPAGE <= 64', 6: 'Age 65 or olderNotes: _IMPAGE => 65'}


possible answers: {77: 'Don’t know/Not Sure', 99: 'Refused'}


possible answers: {77: 'Don’t know/Not Sure', 99: 'Refused'}


In [48]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from statsmodels.stats.contingency_tables import Table2x2

# Work on the full survey frame under a shorter alias.
df = dataset

# Cross-tabulate the target answer codes (rows) against the _AGE_G codes (columns).
contingency_table = pd.crosstab(index=df[target_name], columns=df['_AGE_G'])

# Heatmap of the cell counts, with each count written inside its cell.
fig = px.imshow(
    contingency_table,
    x=contingency_table.columns,
    y=contingency_table.index,
    labels={'x': '_AGE_G', 'y': target_name, 'color': "Count"},
    title="Heatmap of Contingency Table",
    text_auto=True,
)
# Put the column labels above the heatmap.
fig.update_xaxes(side="top")
fig.show()

