In [78]:
import duckdb as db
import pandas as pd
import numpy as np

In [2]:
# Relative directory, presumably where the bn_beneficiario SQL sources live.
# NOTE(review): no cell visible in this part of the notebook reads `path` - confirm it is still needed.
path = 'sql_files/bn_beneficiario/'

# Análise dos dados

## Beneficiário

### Total de Tabelas

In [33]:
# Count the distinct table names listed in output/01_beneficiario.csv.
# The relation is the last expression of the cell, so the notebook renders it.
total_tabelas_beneficiario_sql = """     
    select 
        count(distinct table_name)
    from 'output/01_beneficiario.csv'
    --order by table_name
"""
db.sql(total_tabelas_beneficiario_sql)

┌────────────────────────────┐
│ count(DISTINCT table_name) │
│           int64            │
├────────────────────────────┤
│                         28 │
└────────────────────────────┘

### Join com campos da tabela no DW

In [162]:
# Join the columns listed in output/01_beneficiario.csv with their rules
# (output/01_beneficiario_rules.csv) and with the per-column statistics in
# output/BN_BENEFICIARIO.csv. Only columns whose alias matches a statistics
# COLUMN_NAME are kept; a missing rule becomes an empty string.
df = db.sql(
"""
    with etl_origem as (
        select
            table_name,
            alias as table_alias,
            column_name,
            column_alias
        from 'output/01_beneficiario.csv'
    ),
    etl_rules as (
        select
            alias as column_name,
            rule as rule
        from 'output/01_beneficiario_rules.csv'
    ),
    dw_estatistica as (
        select
            COLUMN_NAME as column_name,
            NUM_DISTINCT as num_distinct,
            -- swap ',' for '.' (presumably a decimal comma in the export); result is text
            replace(trim(PERCENT_NULLS), ',', '.') as percent_nulls
        from 'output/BN_BENEFICIARIO.csv'
    ),
    joined as (
        select distinct
            orig.table_name,
            orig.column_alias as column_name,
            stat.column_name as dw_column_name,
            coalesce(rule.rule, '') as rule,
            stat.num_distinct,
            stat.percent_nulls
        from etl_origem as orig
        left join etl_rules as rule
            on (orig.column_alias = rule.column_name)
        -- inner join: rows without a matching statistics column are dropped
        inner join dw_estatistica as stat
            on (orig.column_alias = stat.column_name)
    )
    select *
    from joined
    -- percent_nulls is text: ordering on it directly would be lexicographic
    -- ('100' before '9.5'), so sort on its numeric value instead
    order by table_name, try_cast(percent_nulls as double)
"""
).to_df()

In [163]:
# Write the current `df` to beneficiario.csv, omitting the index column.
df.to_csv(
    path_or_buf='beneficiario.csv',
    index=False,
)

In [164]:
# Render the listed subset of columns as a Markdown table in beneficiario_table.md.
(
    df
    .loc[:, ["table_name", "column_name", "dw_column_name", "num_distinct", "percent_nulls"]]
    .to_markdown('beneficiario_table.md')
)

## sam_familia_teto_pf

### Total de Tabelas

In [119]:
# Count the distinct table names listed in output/02_sam_familia_teto_pf.csv.
# The relation is the last expression of the cell, so the notebook renders it.
total_tabelas_teto_pf_sql = """     
    select 
        count(distinct table_name)
    from 'output/02_sam_familia_teto_pf.csv'
    --order by table_name
"""
db.sql(total_tabelas_teto_pf_sql)

┌────────────────────────────┐
│ count(DISTINCT table_name) │
│           int64            │
├────────────────────────────┤
│                          1 │
└────────────────────────────┘

### Join com campos da tabela no DW

In [147]:
# Join the columns listed in output/02_sam_familia_teto_pf.csv with their rules
# (output/02_sam_familia_teto_pf_rules.csv) and with the per-column statistics
# in output/BN_BENEFICIARIO.csv. Only columns whose alias matches a statistics
# COLUMN_NAME are kept; a missing rule becomes an empty string.
df = db.sql(
"""
    with etl_origem as (
        select
            table_name,
            alias as table_alias,
            column_name,
            column_alias
        from 'output/02_sam_familia_teto_pf.csv'
    ),
    etl_rules as (
        select
            alias as column_name,
            rule as rule
        from 'output/02_sam_familia_teto_pf_rules.csv'
    ),
    dw_estatistica as (
        select
            COLUMN_NAME as column_name,
            NUM_DISTINCT as num_distinct,
            -- swap ',' for '.' (presumably a decimal comma in the export); result is text
            replace(trim(PERCENT_NULLS), ',', '.') as percent_nulls
        from 'output/BN_BENEFICIARIO.csv'
    ),
    joined as (
        select distinct
            orig.table_name,
            orig.column_alias as column_name,
            stat.column_name as dw_column_name,
            coalesce(rule.rule, '') as rule,
            stat.num_distinct,
            stat.percent_nulls
        from etl_origem as orig
        left join etl_rules as rule
            on (orig.column_alias = rule.column_name)
        -- inner join: rows without a matching statistics column are dropped
        inner join dw_estatistica as stat
            on (orig.column_alias = stat.column_name)
    )
    select *
    from joined
    -- percent_nulls is text: ordering on it directly would be lexicographic
    -- ('100' before '9.5'), so sort on its numeric value instead
    order by table_name, try_cast(percent_nulls as double)
"""
).to_df()

In [149]:
# Write the current `df` to sam_familia_teto_pf_rules.csv, omitting the index column.
df.to_csv(
    path_or_buf='sam_familia_teto_pf_rules.csv',
    index=False,
)

In [150]:
# Render the listed subset of columns as a Markdown table in sam_familia_teto_pf_table.md.
(
    df
    .loc[:, ["table_name", "column_name", "dw_column_name", "num_distinct", "percent_nulls"]]
    .to_markdown('sam_familia_teto_pf_table.md')
)

## Busca Microsiga

### Total de Tabelas

In [154]:
## This is really just a single table; the model did not separate it correctly.
# Show every row of output/03_1_busca_microsiga.csv for inspection.
busca_microsiga_sql = """     
    select 
        *
    from 'output/03_1_busca_microsiga.csv'
    --order by table_name
"""
db.sql(busca_microsiga_sql)

┌────────────────┬─────────┬─────────────┬──────────────┐
│   table_name   │  alias  │ column_name │ column_alias │
│    varchar     │ varchar │   varchar   │   varchar    │
├────────────────┼─────────┼─────────────┼──────────────┤
│ SIGA.VW_SRA010 │ A       │ RA_TELEFON  │ TELEFONE     │
│ SIGA.VW_SRA010 │ A       │ RA_EMAIL    │ EMAIL        │
│ SIGA.VW_SRA010 │ A       │ RA_MAT      │ RA_MAT       │
│ SIGA.VW_SRA010 │ A       │ RA_CC       │ RA_CC        │
│ SIGA.CTT010    │ B       │ CTT_DESC01  │ SETOR_UNIMED │
│ SIGA.CTT010    │ B       │ CTT_CUSTO   │ CTT_CUSTO    │
└────────────────┴─────────┴─────────────┴──────────────┘

In [157]:
# Join the columns listed in output/03_1_busca_microsiga.csv with their rules
# (output/03_1_busca_microsiga_rules.csv) and with the per-column statistics in
# output/BN_BENEFICIARIO.csv. Only columns whose alias matches a statistics
# COLUMN_NAME are kept; a missing rule becomes an empty string.
df = db.sql(
"""
    with etl_origem as (
        select
            table_name,
            alias as table_alias,
            column_name,
            column_alias
        from 'output/03_1_busca_microsiga.csv'
    ),
    etl_rules as (
        select
            alias as column_name,
            rule as rule
        from 'output/03_1_busca_microsiga_rules.csv'
    ),
    dw_estatistica as (
        select
            COLUMN_NAME as column_name,
            NUM_DISTINCT as num_distinct,
            -- swap ',' for '.' (presumably a decimal comma in the export); result is text
            replace(trim(PERCENT_NULLS), ',', '.') as percent_nulls
        from 'output/BN_BENEFICIARIO.csv'
    ),
    joined as (
        select distinct
            orig.table_name,
            orig.column_alias as column_name,
            stat.column_name as dw_column_name,
            coalesce(rule.rule, '') as rule,
            stat.num_distinct,
            stat.percent_nulls
        from etl_origem as orig
        left join etl_rules as rule
            on (orig.column_alias = rule.column_name)
        -- inner join: rows without a matching statistics column are dropped
        inner join dw_estatistica as stat
            on (orig.column_alias = stat.column_name)
    )
    select *
    from joined
    -- percent_nulls is text: ordering on it directly would be lexicographic
    -- ('100' before '9.5'), so sort on its numeric value instead
    order by table_name, try_cast(percent_nulls as double)
"""
).to_df()

In [158]:
# Write the current `df` to busca_microsiga.csv, omitting the index column.
df.to_csv(
    path_or_buf='busca_microsiga.csv',
    index=False,
)

In [159]:
# Render the listed subset of columns as a Markdown table in busca_microsiga.md.
(
    df
    .loc[:, ["table_name", "column_name", "dw_column_name", "num_distinct", "percent_nulls"]]
    .to_markdown('busca_microsiga.md')
)

## Sem Setor

In [165]:
## This is really just a single table; the model did not separate it correctly.
# Show every row of output/03_2_sem_setor.csv for inspection.
sem_setor_sql = """     
    select 
        *
    from 'output/03_2_sem_setor.csv'
    --order by table_name
"""
db.sql(sem_setor_sql)

┌──────────────────┬─────────┬─────────────────────┬─────────────────────┐
│    table_name    │  alias  │     column_name     │    column_alias     │
│     varchar      │ varchar │       varchar       │       varchar       │
├──────────────────┼─────────┼─────────────────────┼─────────────────────┤
│ SAM_BENEFICIARIO │ BEN     │ HANDLE              │ HANDLE              │
│ SAM_BENEFICIARIO │ BEN     │ ENDERECORESIDENCIAL │ ENDERECORESIDENCIAL │
│ SAM_ENDERECO     │ ENDR    │ DDD1                │ DDD1                │
│ SAM_ENDERECO     │ ENDR    │ PREFIXO1            │ PREFIXO1            │
│ SAM_ENDERECO     │ ENDR    │ NUMERO1             │ NUMERO1             │
│ SAM_ENDERECO     │ ENDR    │ HANDLE              │ HANDLE              │
│ SAM_BENEFICIARIO │ NULL    │ SETOR_UNIMED        │ SETOR_UNIMED        │
└──────────────────┴─────────┴─────────────────────┴─────────────────────┘

In [167]:
# Join the columns listed in output/03_2_sem_setor.csv with their rules
# (output/03_2_sem_setor_rules.csv) and with the per-column statistics in
# output/BN_BENEFICIARIO.csv. Only columns whose alias matches a statistics
# COLUMN_NAME are kept; a missing rule becomes an empty string.
df = db.sql(
"""
    with etl_origem as (
        select
            table_name,
            alias as table_alias,
            column_name,
            column_alias
        from 'output/03_2_sem_setor.csv'
    ),
    etl_rules as (
        select
            alias as column_name,
            rule as rule
        from 'output/03_2_sem_setor_rules.csv'
    ),
    dw_estatistica as (
        select
            COLUMN_NAME as column_name,
            NUM_DISTINCT as num_distinct,
            -- swap ',' for '.' (presumably a decimal comma in the export); result is text
            replace(trim(PERCENT_NULLS), ',', '.') as percent_nulls
        from 'output/BN_BENEFICIARIO.csv'
    ),
    joined as (
        select distinct
            orig.table_name,
            orig.column_alias as column_name,
            stat.column_name as dw_column_name,
            coalesce(rule.rule, '') as rule,
            stat.num_distinct,
            stat.percent_nulls
        from etl_origem as orig
        left join etl_rules as rule
            on (orig.column_alias = rule.column_name)
        -- inner join: rows without a matching statistics column are dropped
        inner join dw_estatistica as stat
            on (orig.column_alias = stat.column_name)
    )
    select *
    from joined
    -- percent_nulls is text: ordering on it directly would be lexicographic
    -- ('100' before '9.5'), so sort on its numeric value instead
    order by table_name, try_cast(percent_nulls as double)
"""
).to_df()

In [None]:
# Write the "Sem Setor" join result to its own CSV, omitting the index column.
# The previous target, 'busca_microsiga.csv', is the file the "Busca Microsiga"
# section exports to, so running this cell would have overwritten that export.
df.to_csv('sem_setor.csv', index=False)

In [168]:
# Render the "Sem Setor" columns as a Markdown table in their own file.
# The previous target, 'busca_microsiga.md', is the file the "Busca Microsiga"
# section exports to, so this cell was overwriting that report.
df[["table_name","column_name","dw_column_name","num_distinct","percent_nulls"]].to_markdown('sem_setor.md')