In [177]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

import plotly.io as pio
# Render Plotly figures with the "notebook_connected" renderer (plotly.js is
# loaded from a CDN instead of being embedded in the notebook file).
pio.renderers.default = "notebook_connected"

# Question metadata: one column per question key; the rows used in this notebook
# are 'possible_answers' (answer code -> label) and 'label' (question title).
headers_df = pd.read_json('../resources/generated/headers.json')
# Survey responses: one column per question key, values are the raw answer codes.
dataset = pd.read_parquet('../resources/generated/raw_dataset.parquet')
# Disabled: would replace every coded answer with its label right after loading.
# for key in dataset.columns:
#     dict = {int(k): v for k, v in headers_df[key]['possible_answers'].items() if k.isdigit()}
#     dataset[key] = dataset[key].map(dict)

# Question key of the target variable; later cells split the data on its value.
target_name: str = 'CVDINFR4'

Let's start with the shape of our dataset:

In [178]:
# (rows, columns) of the loaded dataset.
dataset.shape

In [179]:
# A question is treated as numerical when at least one of its answer keys
# contains ' - ' (presumably a value range such as "1 - 30" -- confirm against
# headers.json). `any()` stops at the first match, so each question key is
# listed exactly once instead of once per matching answer key.
numerical = [
    question_key
    for question_key, header in headers_df.to_dict().items()
    if any(' - ' in answer_key for answer_key in header['possible_answers'])
]

# Every remaining column holds coded answers and is stored as a categorical.
# A set gives constant-time membership checks while `numerical` keeps its order.
numerical_lookup = set(numerical)
for col in dataset.columns:
    if col not in numerical_lookup:
        dataset[col] = dataset[col].astype('category')

In [180]:
# Columns that currently have the pandas 'category' dtype.
dataset.select_dtypes(include='category').columns

In [181]:
# Per-column answer codes that are replaced with missing values below.
# Presumably these are the BRFSS "Don't know" / "Refused" / "None" sentinel
# codes -- confirm each entry against the codebook.
# NOTE(review): in BRFSS codebooks 88 / 888 commonly encode "None" (i.e. zero
# days / zero drinks); confirm those should become missing rather than 0.
invalid_values = {
    'HHADULT': [88, 77, 99],
    'PHYSHLTH': [88, 77, 99],
    'MENTHLTH': [88, 77, 99],
    'POORHLTH': [88, 77, 99],
    'SLEPTIM1': [77, 99],
    'CHILDREN': [88, 99],
    'WEIGHT2': [7777, 9999],
    'HEIGHT3': [7777, 9999],
    'LCSFIRST': [777, 888, 999],
    'LCSLAST': [777, 999],
    'LCSNUMCG': [777, 999],
    'ALCDAY4': [777, 888, 999],
    'AVEDRNK3': [88, 77, 99],
    'DRNK3GE5': [88, 77, 99],
    'MAXDRNKS': [77, 99],
    'FLSHTMY3': [777777, 999999],
    'HIVTSTD3': [777777, 999999],
    'CHKHEMO3': [88, 98, 77, 99],
    'HPVADSHT': [77, 99],
    'COVIDFS1': [777777, 999999],
    'COVIDSE1': [777777, 999999],
    'COPDSMOK': [88, 77, 99],
    'CNCRAGE': [98, 99],
    'MARIJAN1': [88, 77, 99],
    'DROCDY4_': [900],
    '_DRNKWK2': [99900]
}

def replace_with_none(df, column, invalid_list):
    """Mark every occurrence of the given codes in ``df[column]`` as missing.

    ``Series.replace(invalid_list, None)`` is avoided on purpose: depending on
    the pandas version it either forward-fills the matched entries or converts
    a numeric column to ``object`` dtype holding ``None``. Masking always
    yields a real missing value and keeps the column's dtype family.

    Args:
        df: DataFrame to clean; it is modified in place.
        column: Name of the column to clean.
        invalid_list: Answer codes that must be treated as missing.

    Returns:
        The same ``df`` object, so the call can be used in an assignment.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    series = df[column]

    if isinstance(series.dtype, pd.CategoricalDtype):
        # Dropping a category turns its rows into NaN and also keeps the
        # invalid code out of later value_counts() output.
        categories = series.cat.categories
        stale = categories[categories.isin(invalid_list)]
        if len(stale):
            series = series.cat.remove_categories(stale)
        df[column] = series
    else:
        # mask() writes NaN wherever the condition holds.
        df[column] = series.mask(series.isin(invalid_list))

    return df

# Apply the clean-up to every column listed in invalid_values. `dataset` is
# rebound to the frame returned by replace_with_none on each iteration.
for column, invalids in invalid_values.items():
    dataset = replace_with_none(dataset, column, invalids)

# Spot-check one cleaned column; dropna=False keeps the missing-value count visible.
dataset['MENTHLTH'].value_counts(dropna=False)

In [182]:
def possible_answers(question_key: str) -> dict:
    """Return the labelled answer codes of one question.

    Reads ``headers_df[question_key]['possible_answers']`` and keeps only the
    entries whose key consists solely of digits, converting those keys to
    ``int``. Any other key is left out.

    Args:
        question_key: Column name of the question in ``headers_df``.

    Returns:
        Mapping of integer answer code to the value stored for it.
    """
    raw_answers = headers_df[question_key]['possible_answers']

    labels_by_code = {}
    for code, label in raw_answers.items():
        if code.isdigit():
            labels_by_code[int(code)] = label
    return labels_by_code

def count_of(question_key: str, dataset: pd.DataFrame = dataset) -> pd.Series:
    """Count the answers to one question, labelled where a label exists.

    Answer codes listed by ``possible_answers`` are replaced with their labels.
    Responses that have no label keep their raw value instead of turning into
    NaN, so they are still counted. (Assumption: numeric questions such as
    hours of sleep have only range-style answer keys and therefore no
    per-value labels -- confirm against headers.json.)

    Note: the default for ``dataset`` is bound once, when this function is
    defined. Pass the frame explicitly to count over a different one.

    Args:
        question_key: Column to count; must exist in ``dataset`` and ``headers_df``.
        dataset: Frame holding the responses.

    Returns:
        Series of counts indexed by answer label (or raw value), as produced
        by ``value_counts()``; missing responses are not counted.
    """
    # Build the mapping once and reuse it for both the log line and the lookup.
    answers = possible_answers(question_key)
    print(f'possible answers: {answers}')

    responses = dataset[question_key]
    labelled = responses.map(answers)

    # Present in the data but absent from the mapping: fall back to the raw value.
    unlabelled = responses.notna() & labelled.isna()
    if unlabelled.any():
        # object dtype lets labels and raw codes share one Series.
        labelled = labelled.astype(object).where(~unlabelled, responses.astype(object))

    return labelled.value_counts()

def plot_bar_of_count(question_key: str, dataset: pd.DataFrame = dataset):
    """Show a bar chart of the answer counts for one question.

    The counts come from ``count_of``; the chart title is the question's
    'label' entry in ``headers_df``.

    Args:
        question_key: Column whose answers are counted and plotted.
        dataset: Frame holding the responses (default bound at definition time).
    """
    counts = count_of(question_key, dataset)
    bars = go.Bar(x=counts.index, y=counts.values)

    layout_options = dict(
        title=headers_df[question_key]['label'],
        xaxis_title='Categories',
        yaxis_title='Count',
        width=1100,
        height=600,
    )

    fig = go.Figure(data=[bars])
    fig.update_layout(**layout_options)
    fig.show()


def correlation_plot(df: pd.DataFrame = dataset):
    """Show a heatmap of the pairwise Spearman correlations of ``df``.

    Args:
        df: Frame whose columns are correlated (default bound at definition time).
    """
    spearman = df.corr(method='spearman')
    feature_names = spearman.columns
    axis_labels = {'x': "Feature", 'y': "Feature", 'color': "Correlation Coefficient"}

    heatmap = px.imshow(
        spearman,
        text_auto=True,
        aspect="auto",
        color_continuous_scale='RdBu_r',  # reversed Red-Blue colour scale
        labels=axis_labels,
        x=feature_names,
        y=feature_names,
    )
    heatmap.update_layout(
        title="BRFSS Correlation Matrix",
        xaxis_title="Features",
        yaxis_title="Features",
    )
    heatmap.show()

def pie_plot(question_key: str):
    """Show a pie chart of the answer counts for one question.

    The counts come from ``count_of`` called with its default dataset; the
    chart title is the question's 'label' entry in ``headers_df``.

    Args:
        question_key: Column whose answers are counted and plotted.
    """
    counts = count_of(question_key)
    slices = go.Pie(labels=counts.index, values=counts.values)

    # Horizontal legend, centred and placed just below the plot area.
    legend_options = {
        'orientation': 'h',
        'xanchor': 'center',
        'x': 0.5,
        'y': -0.1
    }

    fig = go.Figure(data=[slices])
    fig.update_layout(
        title=headers_df[question_key]['label'],
        width=1100,
        height=600,
        legend=legend_options,
    )
    fig.show()

In [183]:
# Raw value counts of SLEPTIM1 (missing values are excluded by default).
dataset['SLEPTIM1'].value_counts()

In [184]:
# Questions to plot as bar charts; the commented-out entries are currently skipped.
columns = [
    'CVDINFR4',
    # '_AGE_G',
    # 'BIRTHSEX',
    # 'DIABETE4',
    # '_DENVST3',
]

# One bar chart per selected question, counted over the full dataset.
for column in columns:
    plot_bar_of_count(column)


In [185]:
# For each feature, draw its answer distribution twice: once for the rows where
# the target equals 1 and once for the rows where it equals 2.
for feature in ('DIABETE4', 'BIRTHSEX', '_AGE_G', 'SLEPTIM1'):
    # Not read in this cell; kept so the name holds the same final value as before.
    cols = [target_name, feature]

    for target_value in (1, 2):
        plot_bar_of_count(feature, dataset=dataset[dataset[target_name] == target_value])


In [186]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from statsmodels.stats.contingency_tables import Table2x2

# Work on the full dataset (an alias, not a copy).
df = dataset

# Counts of every (target value, age group) combination.
contingency_table = pd.crosstab(df[target_name], df['_AGE_G'])

heatmap_labels = {'x': '_AGE_G', 'y': target_name, 'color': "Count"}
fig = px.imshow(
    contingency_table,
    text_auto=True,
    labels=heatmap_labels,
    x=contingency_table.columns,
    y=contingency_table.index,
    title="Heatmap of Contingency Table",
)
# Put the age-group axis above the heatmap.
fig.update_xaxes(side="top")
fig.show()

