Shape do dataset original:

In [1]:
import pandas as pd

# Load the unprocessed survey export and display its (rows, columns) shape.
raw_dataset_path = '../resources/processed/raw_dataset.parquet'
raw = pd.read_parquet(raw_dataset_path)
raw.shape

(445132, 322)

Informação sobre a coluna target:

In [2]:
# Frequency of each answer code in the target column.
# value_counts() leaves missing values out by default.
target_counts = raw['CVDINFR4'].value_counts()
target_counts

CVDINFR4
2.0    416959
1.0     25108
7.0      2731
9.0       330
Name: count, dtype: int64

Shape do dataset de treinamento

In [3]:
# Only the dimensions of the training split are needed here, so the frame itself is not kept.
train_shape = pd.read_parquet('../resources/processed/train.parquet').shape
train_shape

(353653, 41)

Shape do dataset de teste

In [4]:
# Only the dimensions of the test split are needed here, so the frame itself is not kept.
test_shape = pd.read_parquet('../resources/processed/test.parquet').shape
test_shape

(88414, 41)

Shape do dataset de análise

In [5]:
# Load the analysis subset used by every plot below and display its shape.
analysis_path = '../resources/processed/analysis.parquet'
df = pd.read_parquet(analysis_path)
df.shape

(35366, 41)

Mapeando os códigos das respostas para seus rótulos descritivos nas colunas categóricas, para facilitar a análise

In [6]:
# Per-column question metadata; each column's 'possible_answers' entry is
# looked up by the stringified cell value to obtain a readable label.
headers = pd.read_json('../resources/processed/selected_headers.json')

categorical_cols = df.select_dtypes(include=['object', 'category']).columns

for col, mapping in headers.items():
    if col in categorical_cols:
        # Fetch the answer table once per column instead of once per row.
        answers = mapping['possible_answers']
        # Values with no label (missing ones included) fall back to their own
        # string form, e.g. '<NA>', which later cells filter on.
        df[col] = df[col].map(lambda x: answers.get(str(x), str(x)))

# `replace` leaves values outside the dict untouched, so re-running this cell
# keeps the already translated 'Yes'/'No' labels; `map` would turn them into NaN.
df['target'] = df['target'].replace({'True': 'Yes', 'False': 'No'})

df

Unnamed: 0,LSATISFY,MAXDRNKS,CDHELP,LCSFIRST,LCSLAST,_IMPRACE,_RFHLTH,_PHYS14D,_MENT14D,_HLTHPLN,...,DIABETE4,DECIDE,DIFFWALK,DIFFDRES,DIFFALON,CNCRAGE,CNCRTYP2,CHCOCNC1,CVDCRHD4,target
0,Satisfied,3.0,,18.0,,"White, Non-Hispanic",Good,Zero days when physical health not good,1-13 days when mental health not good,Have some form of insurance,...,No,No,No,No,No,,,No,No,No
1,Satisfied,,,,,Hispanic,Fair or Poor,14+ days when physical health not good,1-13 days when mental health not good,Have some form of insurance,...,No,Yes,Yes,Don't know/Not Sure,Yes,,,No,No,No
2,,,,,,"White, Non-Hispanic",Good,Zero days when physical health not good,Zero days when mental health not good,Have some form of insurance,...,No,No,No,No,No,,,No,No,No
3,,,,18.0,28.0,"White, Non-Hispanic",Good,Don't know / Refused / Missing,Zero days when mental health not good,Have some form of insurance,...,No,No,No,No,No,,,No,No,No
4,Satisfied,1.0,,17.0,,"White, Non-Hispanic",Good,1-13 days when physical health not good,14+ days when mental health not good,Have some form of insurance,...,No,No,No,No,No,,,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35361,,2.0,,17.0,59.0,"White, Non-Hispanic",Good,Zero days when physical health not good,1-13 days when mental health not good,Have some form of insurance,...,No,No,No,No,No,,,No,No,No
35362,,,,,,"White, Non-Hispanic",Fair or Poor,Zero days when physical health not good,Zero days when mental health not good,"Don't know, refused or missing insurance response",...,No,No,No,No,No,,,No,No,No
35363,Very satisfied,2.0,,,,"White, Non-Hispanic",Good,14+ days when physical health not good,14+ days when mental health not good,Have some form of insurance,...,No,No,No,No,No,,,No,No,No
35364,Very satisfied,,,,,"Asian, Non-Hispanic",Good,Zero days when physical health not good,Zero days when mental health not good,Do not have some form of health insurance,...,No,No,No,No,No,,,No,No,No


In [7]:
# Inspect the labelled BMI category column after the mapping step.
df.loc[:, '_BMI5CAT']

0        Normal Weight
1        Normal Weight
2          Underweight
3                Obese
4           Overweight
             ...      
35361            Obese
35362            Obese
35363    Normal Weight
35364            Obese
35365             <NA>
Name: _BMI5CAT, Length: 35366, dtype: object

In [11]:
import plotly.express as px

from util.plots import *

# (column, sort, orientation) triples; each one is unpacked and forwarded to
# `barplot` by the loop at the end of this cell. `sort` is either a direction
# string or an explicit list giving the category order.
columns = [
    ('_AGE_G', 'asc', 'v'),
    ('_BMI5CAT', [
        'Underweight', 'Normal Weight', 'Overweight', 'Obese'
    ], 'v'),
    ('_RFHLTH', 'desc', 'v'),
    ('DIABETE4', 'desc', 'v'),
    ('CVDCRHD4', 'desc', 'v'),
    ('CHCKDNY2', 'desc', 'v')
]

# Count respondents per (target, smoking group), skipping rows whose
# _SMOKGRP is the '<NA>' placeholder string.
smokgrp = df[df['_SMOKGRP'] != '<NA>']
smokgrp = smokgrp.groupby(['target', '_SMOKGRP']).size().reset_index(name='count')

# Percentages are relative to all counted rows, not to each target group.
total_count = smokgrp['count'].sum()
smokgrp['percentage'] = (smokgrp['count'] / total_count) * 100

fig = px.bar(
    smokgrp.sort_values(by=['target', 'percentage']),
    y='_SMOKGRP',
    x='percentage',
    color='target',
    # Let plotly express split the labels per trace so every bar is annotated
    # with its own value. The previous `update_traces(text=smokgrp.values)`
    # pushed the same unsorted 2-D array of whole rows onto every trace.
    text='percentage',
    orientation='h',
    color_discrete_sequence=[strongcyan, gold],
    labels={
        "target": "Have you ever had a heart attack?",
        'percentage': f'Percentage by _SMOKGRP - Total: {total_count}'
    },
)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_traces(
    # Render the per-bar percentage with one decimal place.
    texttemplate='%{text:.1f}%',
    textposition='inside',
    marker=dict(
        line=dict(color='#000000', width=2)
    ),
)

fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="middle",
        y=-0.3,
        xanchor="center",
        x=0.2
    )
)

fig.show()
# Persist a high-resolution static copy of the chart.
fig.write_image(
    '../resources/generated/bar__SMOKGRP.png',
    format='png',
    width=900,
    height=450,
    scale=4
)

# Draw one bar chart per configured column with the shared plotting helper.
for column, sort, orientation in columns:
    barplot(df, column, sort, orientation)


In [9]:
# Delegate to the `boxplot` plotting helper (presumably provided by the util.plots star import; confirm).
boxplot(
    df,
    x='_AGE80',
    y='_BMI5CAT',
    caption='Age Distribution by BMI Category and Heart Attack History',
)

In [10]:
from pandas import Series
import plotly.graph_objects as go

# Palette handed to the pie's marker colours.
colors = ['mediumturquoise', 'gold', '#12AACC', 'orange', '#007EB0']

# Respondents per BMI category, with the '<NA>' placeholder bucket removed.
df_bmi: Series = df['_BMI5CAT'].value_counts().drop(labels='<NA>')

# Configure the single pie trace up front instead of patching it afterwards.
bmi_pie = go.Pie(
    labels=df_bmi.index,
    values=df_bmi.to_numpy(),
    hoverinfo='label+percent',
    textinfo='percent',
    textfont_size=20,
    textposition='outside',
    marker={
        'colors': colors,
        'line': {'color': '#000000', 'width': 2},
    },
)

fig = go.Figure(data=[bmi_pie])

# Horizontal legend placed below the plot area.
fig.update_layout(
    legend={
        'orientation': "h",
        'yanchor': "middle",
        'y': -0.3,
        'xanchor': "center",
        'x': 0.5,
    }
)

fig.show()
