# Descarga, carga, análisis y limpieza de datos

## Importar librerías

In [1]:
import janitor  #limpieza de datos
import nhanes.load  #importa base de datos encuesta de salud
import numpy as np  #calculo numerico con df
import pandas as pd #manejo de df
import missingno    #exploracion de missings

  @_expand_grid.register(pd.arrays.PandasArray)


## Importar funciones personalizadas

In [2]:
%run pandas-missing-extension.ipynb

## Cargar los datos de NHANES

In [24]:
# Load the 2017-2018 NHANES survey and keep it, untouched apart from the
# column names, as `nhanes_raw_df`.
nhanes_raw_df = (
    nhanes.load
    .load_NHANES_data(year="2017-2018")
    # pyjanitor: rewrite every column name in lowercase snake_case
    .clean_names(case_type="snake")
)

nhanes_raw_df

Unnamed: 0_level_0,general_health_condition,ever_breastfed_or_fed_breastmilk,age_stopped_breastfeedingdays,age_first_fed_formuladays,age_stopped_receiving_formuladays,age_started_other_foodbeverage,age_first_fed_milkdays,type_of_milk_first_fed_whole_milk,type_of_milk_first_fed2_milk,type_of_milk_first_fed1_milk,...,days_smoked_cigs_during_past30_days,avg_cigarettesday_during_past30_days,tried_to_quit_smoking,times_stopped_smoking_cigarettes,how_long_were_you_able_to_stop_smoking,unit_of_measure_dayweekmonthyear_2_smq,current_selfreported_height_inches,current_selfreported_weight_pounds,tried_to_lose_weight_in_past_year,times_lost10_lbs_or_more_to_lose_weight
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93703.0,,1.0,273.0,1.0,365.0,152.0,365.0,Whole or regular milk,,,...,,,,,,,,,,
93704.0,,1.0,60.0,3.0,365.0,126.0,365.0,Whole or regular milk,,,...,,,,,,,,,,
93705.0,Good,,,,,,,,,,...,,,,,,,63.0,165.0,0.0,11 times or more
93706.0,Very good,,,,,,,,,,...,,,,,,,68.0,145.0,0.0,Never
93707.0,Good,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102952.0,Very good,,,,,,,,,,...,,,,,,,,117.0,0.0,Never
102953.0,Fair or,,,,,,,,,,...,,,,,,,65.0,218.0,0.0,Never
102954.0,Good,,,,,,,,,,...,,,,,,,66.0,150.0,0.0,Never
102955.0,Very good,,,,,,,,,,...,,,,,,,,,,


## Procesar los datos de NHANES

In [28]:
# Build the analysis frame from the raw NHANES data: keep eight variables,
# shorten their names, turn sentinel codes into real missing values, drop
# rows without a diabetes answer and cast diabetes to integer.
nhanes_df = (
    nhanes_raw_df
    # Keep only the eight columns used in the analysis.
    .select_columns(
        "general_health_condition",
        "age_in_years_at_screening",
        "gender",
        "current_selfreported_height_inches",
        "current_selfreported_weight_pounds",
        "doctor_told_you_have_diabetes",
        "60_sec_pulse30_sec_pulse2",
        "total_cholesterol_mgdl"
    )
    # Rename five of them to short names.
    .rename_columns(
        {
            "age_in_years_at_screening": "age",
            "current_selfreported_height_inches": "height",
            "current_selfreported_weight_pounds": "pounds",
            "doctor_told_you_have_diabetes": "diabetes",
            "60_sec_pulse30_sec_pulse2": "pulse",
        }
    )
    .replace(
        {
            # In height, the values 9999 and 7777 are recoded as NaN (missing).
            "height": {
                9999: np.nan,
                7777: np.nan
            },
            # Same recoding for self-reported weight. The column is called
            # "pounds" after the rename above; a "weight" key matches no
            # column and pandas silently skips it, leaving the codes in place.
            "pounds": {
                9999: np.nan,
                7777: np.nan
            },
            # In diabetes, the answer "Borderline" is treated as missing.
            "diabetes": {
                "Borderline": np.nan
            }
        }
    )
    # Custom `.missing` accessor (presumably defined by
    # pandas-missing-extension.ipynb -- confirm): reorders the columns by
    # how many missing values they hold.
    .missing.sort_variables_by_missingness()
    # Drop the rows whose diabetes value is missing.
    .dropna(
        subset=["diabetes"],
        how="any"
    )
    # diabetes is stored with object dtype; cast the whole column to int.
    .transform_column(
        column_name="diabetes",
        function=lambda s: s.astype(int),
        elementwise=False  # apply the function to the Series, not per element
    )
)

nhanes_df

Unnamed: 0_level_0,height,pounds,general_health_condition,total_cholesterol_mgdl,pulse,diabetes,age,gender
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
93703.0,,,,,,0,2.0,Female
93704.0,,,,,,0,2.0,Male
93705.0,63.0,165.0,Good,157.0,52.0,0,66.0,Female
93706.0,68.0,145.0,Very good,148.0,82.0,0,18.0,Male
93707.0,,,Good,189.0,100.0,0,13.0,Male
...,...,...,...,...,...,...,...,...
102951.0,,,,,,0,4.0,Male
102953.0,65.0,218.0,Fair or,182.0,78.0,0,42.0,Male
102954.0,66.0,150.0,Good,172.0,78.0,0,41.0,Female
102955.0,,,Very good,150.0,74.0,0,14.0,Female
