In [6]:
import os
import pandas as pd
import numpy as np
import dask.dataframe as dd
from tabulate import tabulate

In [25]:
# Load the raw dataset from the relative "raw_data" parquet path into memory.
df = pd.read_parquet(path="raw_data")

In [26]:
# Show the (row count, column count) of the loaded frame.
df.shape

(326910, 113)

#### Colunas com valores constantes em todos os registros.

In [67]:
# Columns holding at most one distinct value across all records.
# Missing values count as a distinct value here (dropna=False).
same_values = [
    column
    for column in df.columns
    if df[column].nunique(dropna=False) <= 1
]

In [None]:
# One row per constant column: its name and the list of distinct values it holds.
constant_rows = [
    [column, list(df[column].unique())]
    for column in same_values
]
print(tabulate(
    constant_rows,
    headers=["Campo", "Valor Constante"],
    tablefmt="pretty",
))

+------------+-------------------+
|   Campo    |  Valor Constante  |
+------------+-------------------+
| UTI_MES_IN |      [' 0']       |
| UTI_MES_AN |      [' 0']       |
| UTI_MES_AL |      [' 0']       |
| UTI_INT_IN |      [' 0']       |
| UTI_INT_AN |      [' 0']       |
| UTI_INT_AL |      [' 0']       |
|  VAL_SADT  | ['         0.00'] |
|   VAL_RN   | ['         0.00'] |
| VAL_ACOMP  | ['         0.00'] |
|  VAL_ORTP  | ['         0.00'] |
| VAL_SANGUE | ['         0.00'] |
| VAL_SADTSR |  ['       0.00']  |
| VAL_TRANSP | ['         0.00'] |
| VAL_OBSANG |  ['       0.00']  |
| VAL_PED1AC |  ['       0.00']  |
| DIAG_SECUN |     ['0000']      |
|  NATUREZA  |      ['00']       |
|  RUBRICA   |     ['    0']     |
|  NUM_PROC  |       ['']        |
| TOT_PT_SP  |    ['     0']     |
|  CPF_AUT   |       ['']        |
|  SEQ_AIH5  |      ['000']      |
|    CBOR    |    ['000000']     |
|   CNAER    |      ['000']      |
|  VINCPREV  |       ['0']       |
| GESTOR_DT  |      

#### Campos com prevalência de valores superior a 95%.


In [100]:
# A column is "prevalent" when its most frequent value accounts for more than
# this percentage of the counted values (value_counts ignores missing values
# by default).
prevalence_threshold_pct = 95

# Columns that are never evaluated: two explicitly excluded fields plus the
# columns already identified as constant.
skipped_columns = {'COD_IDADE', "NACIONAL", *same_values}

prevalent_values = [
    column
    for column in df.columns
    # Membership is tested first so skipped columns never trigger value_counts.
    if column not in skipped_columns
    and df[column].value_counts(normalize=True).iloc[0] * 100 > prevalence_threshold_pct
]

In [101]:
# Compute the normalized value frequencies once per column and reuse them for
# both the percentage and the prevalent value, instead of running value_counts
# several times per column over the whole frame.
value_shares = [
    (column, df[column].value_counts(normalize=True))
    for column in prevalent_values
]

# One row per prevalent column: name, share of the top value (in %, rounded to
# 4 decimals) and the top value itself.
print(tabulate(
    [
        [column, round(shares.iloc[0] * 100, 4), shares.idxmax()]
        for column, shares in value_shares
    ],
    headers=["Campo", "% Prevalente", "Valor Prevalente"],
    tablefmt="pretty"
    ))

+------------+--------------+------------------+
|   Campo    | % Prevalente | Valor Prevalente |
+------------+--------------+------------------+
| UTI_MES_TO |   99.9043    |        0         |
| MARCA_UTI  |   99.9043    |        00        |
| UTI_INT_TO |   99.9994    |        0         |
| DIAR_ACOM  |   96.1405    |        0         |
|  VAL_UTI   |   99.9043    |       0.00       |
|  IND_VDRL  |   99.9419    |        0         |
|   MORTE    |   99.8498    |        0         |
|  HOMONIMO  |   99.9893    |        0         |
| NUM_FILHOS |   99.9997    |        0         |
|   INSTRU   |   99.9997    |        0         |
| CID_NOTIF  |   99.9997    |                  |
| CONTRACEP1 |   99.9997    |        00        |
| CONTRACEP2 |   99.9997    |        00        |
| GESTRISCO  |   99.9997    |        1         |
|  INSC_PN   |   99.9985    |   000000000000   |
| GESTOR_COD |   99.2392    |      00000       |
|  COMPLEX   |   99.9979    |        02        |
|   FINANC   |   99.

In [107]:
# Display only the columns that are neither constant nor dominated by a single value.
df.loc[:, ~df.columns.isin(same_values + prevalent_values)]

Unnamed: 0,UF_ZI,ANO_CMPT,MES_CMPT,ESPEC,CGC_HOSP,N_AIH,IDENT,CEP,MUNIC_RES,NASC,SEXO,QT_DIARIAS,PROC_SOLIC,PROC_REA,VAL_SH,VAL_SP,VAL_TOT,US_TOT,DT_INTER,DT_SAIDA,DIAG_PRINC,COBRANCA,NAT_JUR,GESTAO,MUNIC_MOV,COD_IDADE,IDADE,DIAS_PERM,NACIONAL,CAR_INT,GESTOR_TP,GESTOR_CPF,CNES,CNPJ_MANT,REGCT,RACA_COR,SEQUENCIA,REMESSA,DIAGSEC1,TPDISEC1
4,350000,2019,01,05,46374500013504,3519101604414,1,08570390,352310,19971208,1,2,0303170140,0303170140,99.56,14.44,114.00,30.56,20190116,20190118,F192,12,1023,2,352310,4,21,2,010,02,0,000000000000000,2078562,46374500000194,7102,03,62344,HE35000001N201901.DTS,,0
5,350000,2019,01,05,46374500013504,3519101604733,1,08576500,352310,19590826,1,14,0303170140,0303170140,794.36,101.08,895.44,240.06,20190117,20190131,F200,12,1023,2,352310,4,59,14,010,02,0,000000000000000,2078562,46374500000194,7102,03,62345,HE35000001N201901.DTS,F059,1
6,350000,2019,01,05,46374500013504,3519101604766,1,08576510,352310,19910417,1,7,0303170140,0303170140,348.46,50.54,399.00,106.97,20190117,20190124,F192,12,1023,2,352310,4,27,7,010,02,0,000000000000000,2078562,46374500000194,7102,02,62346,HE35000001N201901.DTS,F209,1
7,350000,2019,01,05,46374500013504,3519101604887,1,08595850,352310,19620707,3,9,0303170140,0303170140,545.46,64.98,610.44,163.65,20190117,20190126,F312,12,1023,2,352310,4,56,9,010,02,0,000000000000000,2078562,46374500000194,7102,03,62347,HE35000001N201901.DTS,F200,1
8,350000,2019,01,05,46374500013504,3519101604953,1,08595500,352310,19820905,1,13,0303170140,0303170140,647.14,93.86,741.00,198.65,20190117,20190130,F312,12,1023,2,352310,4,36,13,010,02,0,000000000000000,2078562,46374500000194,7102,02,62348,HE35000001N201901.DTS,F192,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224309,354870,2024,12,05,,3524132785239,1,09720375,354870,19770428,1,5,0303170140,0303170140,288.90,36.10,325.00,55.84,20241217,20241221,F29,31,1244,1,354870,4,47,4,010,02,0,000000000000000,2069776,46523239000147,0000,03,1686,HM35487001N202412.DTS,,0
224414,354870,2024,12,03,,3524132782731,1,09720375,354870,19930914,1,4,0303170140,0303170140,199.12,28.88,228.00,39.17,20241211,20241214,F208,31,1244,1,354870,4,31,3,010,02,0,000000000000000,2069776,46523239000147,0000,03,1220,HM35487001N202412.DTS,,0
224769,354870,2024,12,03,,3524132782863,1,09761243,354870,19990806,1,12,0303170140,0303170140,597.36,86.64,684.00,117.52,20241206,20241217,F195,31,1244,1,354870,4,25,11,010,02,0,000000000000000,2069776,46523239000147,0000,03,1230,HM35487001N202412.DTS,,0
224773,354870,2024,12,03,,3524132784469,1,09862060,354870,19650927,3,7,0303170140,0303170140,429.66,50.54,480.20,82.50,20241212,20241218,F29,31,1244,1,354870,4,59,6,010,02,0,000000000000000,2069776,46523239000147,0000,01,1339,HM35487001N202412.DTS,,0


#### Testar agrupador

In [108]:
# Count how many records share each N_AIH value, largest groups first.
records_per_aih = df.groupby('N_AIH').size()
records_per_aih.sort_values(ascending=False)

N_AIH
3514108696670    73
3513113087583    72
3514119890457    72
3517124637488    72
3514119891250    72
                 ..
3521108660498     1
3521108660510     1
3521108660531     1
3521108660542     1
3524502350160     1
Length: 224538, dtype: int64

In [110]:
# Summarize dtype and non-null count for the columns flagged as prevalent.
df.loc[:, prevalent_values].info()

<class 'pandas.core.frame.DataFrame'>
Index: 326910 entries, 4 to 224794
Data columns (total 42 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   UTI_MES_TO  326910 non-null  string
 1   MARCA_UTI   326910 non-null  string
 2   UTI_INT_TO  326910 non-null  string
 3   DIAR_ACOM   326910 non-null  string
 4   VAL_UTI     326910 non-null  string
 5   IND_VDRL    326910 non-null  string
 6   MORTE       326910 non-null  string
 7   HOMONIMO    326910 non-null  string
 8   NUM_FILHOS  326910 non-null  string
 9   INSTRU      326910 non-null  string
 10  CID_NOTIF   326910 non-null  string
 11  CONTRACEP1  326910 non-null  string
 12  CONTRACEP2  326910 non-null  string
 13  GESTRISCO   326910 non-null  string
 14  INSC_PN     326910 non-null  string
 15  GESTOR_COD  326910 non-null  string
 16  COMPLEX     326910 non-null  string
 17  FINANC      326910 non-null  string
 18  FAEC_TP     326910 non-null  string
 19  ETNIA       326910 non-null 