# <font color='gray'> Bibliotecas necessárias</font>
---

In [1]:
import requests
from io import BytesIO
import polars as pl
import pandas as pd

# <font color='gray'> Carregando os dados</font>
----

Este projeto utiliza dados do [Sistema de Informações sobre Mortalidade (SIM)](https://opendatasus.saude.gov.br/pt_BR/dataset/sim), do DataSUS, disponíveis no OpenDataSUS. O SIM é a principal base de dados de óbitos no Brasil e reúne informações detalhadas sobre as mortes registradas no país, incluindo a causa básica do óbito, as características da vítima e as circunstâncias da morte.

Iniciamos as análises com dados de 2023.

In [2]:
# URL of the SIM 2023 death-records CSV
url = "https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SIM/DO23OPEN.csv"

# Download the file. The timeout is (connect, read) in seconds: without one,
# requests may wait indefinitely on a stalled connection and hang the notebook.
response = requests.get(url, timeout=(10, 300))
response.raise_for_status()  # Raise on HTTP 4xx/5xx instead of parsing an error page

# Parse the downloaded bytes straight into a Polars DataFrame
df = pl.read_csv(
    BytesIO(response.content),
    separator=";",  # Fields are semicolon-delimited
    quote_char='"',  # Double quotes delimit quoted fields (they are not kept in the value)
    encoding="utf8",
    # NOTE(review): ignore_errors=True suppresses parse errors instead of raising.
    # Confirm this does not silently lose values, since null percentages are
    # computed from this frame later; raising infer_schema_length may be a safer fix.
    ignore_errors=True,
)
print(f"Linhas: {df.height}, Colunas: {df.width}")


Linhas: 1465610, Colunas: 86


In [3]:
# Display the loaded DataFrame (rendered as a truncated preview of rows and columns)
df

contador,ORIGEM,TIPOBITO,DTOBITO,HORAOBITO,NATURAL,CODMUNNATU,DTNASC,IDADE,SEXO,RACACOR,ESTCIV,ESC,ESC2010,SERIESCFAL,OCUP,CODMUNRES,LOCOCOR,CODESTAB,CODMUNOCOR,IDADEMAE,ESCMAE,ESCMAE2010,SERIESCMAE,OCUPMAE,QTDFILVIVO,QTDFILMORT,GRAVIDEZ,SEMAGESTAC,GESTACAO,PARTO,OBITOPARTO,PESO,TPMORTEOCO,OBITOGRAV,OBITOPUERP,ASSISTMED,…,CIRCOBITO,ACIDTRAB,FONTE,NUMEROLOTE,DTINVESTIG,DTCADASTRO,ATESTANTE,STCODIFICA,CODIFICADO,VERSAOSIST,VERSAOSCB,FONTEINV,DTRECEBIM,ATESTADO,DTRECORIGA,OPOR_DO,CAUSAMAT,ESCMAEAGR1,ESCFALAGR1,STDOEPIDEM,STDONOVA,DIFDATA,NUDIASOBCO,DTCADINV,TPOBITOCOR,DTCONINV,FONTES,TPRESGINFO,TPNIVELINV,DTCADINF,MORTEPARTO,DTCONCASO,ALTCAUSA,CAUSABAS_O,TPPOS,TP_ALTERA,CB_ALT
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,i64,i64,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,str,str,str,f64,i64,i64,str,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,i64,i64,i64,i64,str,str,str,str
1,1,2,14022023,930,833,330270,10071954,468,1,4,1,2,1,,715210,330190,3,,330190,,,,,,,,,,,,,,,,,9,…,,,,20230018,,23022023,5,"""S""","""S""","""3.2.30""",3.4,,27022023,"""R99/I10""",27022023,13,,,10,0,1,13,,,,,,,,,,,,"""I10""",,,
2,1,2,14022023,1730,822,221110,12091956,466,1,4,2,4,3,,21210,221100,1,2323281,221100,,,,,,,,,,,,,,,,,1,…,,,,20230028,,17022023,5,"""S""","""S""","""3.2.30""",3.4,,17022023,"""R578/I619/I678""",17022023,3,,,12,0,1,3,,,,,,,,,,,,"""I678""",,,
3,1,2,14022023,500,822,220790,10011953,470,2,,3,9,9,,999992,220790,1,2726971,221100,,,,,,,,,,,,,,,,,1,…,,,,20230028,,17022023,5,"""S""","""S""","""3.2.30""",3.4,,17022023,"""A419/K750*E149 I10""",17022023,3,,,9,0,1,3,,,,,,,,,,,,"""K750""",,,
4,1,2,14022023,2235,823,231180,15121943,479,2,1,2,4,3,,763010,230440,1,2497654,230440,,,,,,,,,,,,,,,,,1,…,,,,20230007,23022023,15022023,1,"""S""","""S""","""3.2.30""",,3,9032023,"""I611*J189""",2032023,16,,,12,0,1,23,,,,,,,,,,,,"""I64""","""S""",,
5,1,2,14022023,940,841,411350,9041973,449,2,4,1,1,0,,999992,510515,2,7050577,510515,,,,,,,,,,,,,,8,2,3,1,…,,,,20240057,14022023,14022023,1,"""S""","""S""","""3.2.30""",3.4,3,30102024,"""J960/C189""",16022023,2,,,0,0,1,624,3,17022023,9,17022023,,,"""M""",,,,,"""C189""","""S""",,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1465606,1,2,30112023,2100,816,160050,1042023,307,1,5,,,,,,160050,6,,160050,"""29""",3,1,"""4""","""999992""",1,0,1,,,,3,,,,,2,…,,,,,,8122023,1,"""N""","""N""","""2...0""",,,,"""R99""",8122023,8,,2,,0,1,8,,,,,"""SXXSXX""",,,10062024,3,26052024,2,,,,
1465607,1,2,12122023,1220,815,150060,23101987,436,1,5,,3,2,,,150060,3,,150060,,,,,,,,,,,,,,,,,2,…,,,,,27032024,3012024,,"""N""","""N""","""2...0""",,8,,"""R98""",3012024,22,,,11,0,1,22,,,,,,,,,,,,,,,
1465608,1,2,22122023,1734,815,150060,5112023,301,1,5,,,,,,150060,6,,150060,"""25""",1,0,,"""612005""",1,0,1,40,5,1,3,3018,,,,2,…,,,,,15042024,3012024,,"""N""","""N""","""2...0""",,8,,"""R98""",3012024,12,,0,,0,1,12,,,,,"""XXXXXX""",,,15042024,3,20032024,1,,,,
1465609,1,2,22122023,2030,813,130140,1012008,415,2,5,,1,0,,,130140,6,,130140,,,,,,,,,,,,,,9,2,3,2,…,2,,3,,2022024,17062024,5,"""N""","""N""","""2...0""",,8,,"""X709""",17062024,178,,,0,0,1,178,94,21062024,9,25032024,,,"""E""",,,,,,,,


# <font color='gray'> Análise Exploratória dos Dados</font>
---

In [4]:
# Percentage of null values per column (one-row frame, one column per field)
perc_nulos = (df.null_count() / df.height) * 100

# Reshape wide -> long: one row per original column, in "variable" / "value" columns.
# unpivot() replaces melt(), which is deprecated in Polars and emits a warning.
perc_nulos_transposto = perc_nulos.unpivot()

# Names of the columns whose null percentage is strictly above 60%
lista_col = perc_nulos_transposto.filter(pl.col('value') > 60)["variable"].to_list()
lista_col

  perc_nulos_transposto = perc_nulos.melt()


['SERIESCFAL',
 'IDADEMAE',
 'ESCMAE',
 'ESCMAE2010',
 'SERIESCMAE',
 'OCUPMAE',
 'QTDFILVIVO',
 'QTDFILMORT',
 'GRAVIDEZ',
 'SEMAGESTAC',
 'GESTACAO',
 'PARTO',
 'OBITOPARTO',
 'PESO',
 'TPMORTEOCO',
 'OBITOGRAV',
 'OBITOPUERP',
 'EXAME',
 'CIRURGIA',
 'LINHAD',
 'CB_PRE',
 'COMUNSVOIM',
 'CIRCOBITO',
 'ACIDTRAB',
 'FONTE',
 'DTINVESTIG',
 'FONTEINV',
 'CAUSAMAT',
 'ESCMAEAGR1',
 'NUDIASOBCO',
 'DTCADINV',
 'TPOBITOCOR',
 'DTCONINV',
 'FONTES',
 'TPRESGINFO',
 'TPNIVELINV',
 'DTCADINF',
 'MORTEPARTO',
 'DTCONCASO',
 'ALTCAUSA',
 'TP_ALTERA',
 'CB_ALT']

In [5]:
# Drop the mostly-null columns listed in lista_col.
# strict=False skips names that are no longer present, so re-running this cell
# (which reassigns df) does not raise on the already-dropped columns.
df = df.drop(lista_col, strict=False)
df

contador,ORIGEM,TIPOBITO,DTOBITO,HORAOBITO,NATURAL,CODMUNNATU,DTNASC,IDADE,SEXO,RACACOR,ESTCIV,ESC,ESC2010,OCUP,CODMUNRES,LOCOCOR,CODESTAB,CODMUNOCOR,ASSISTMED,NECROPSIA,LINHAA,LINHAB,LINHAC,LINHAII,CAUSABAS,DTATESTADO,NUMEROLOTE,DTCADASTRO,ATESTANTE,STCODIFICA,CODIFICADO,VERSAOSIST,VERSAOSCB,DTRECEBIM,ATESTADO,DTRECORIGA,OPOR_DO,ESCFALAGR1,STDOEPIDEM,STDONOVA,DIFDATA,CAUSABAS_O,TPPOS
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,str,i64,i64,i64,i64,str,str,str,f64,i64,str,i64,i64,i64,i64,i64,i64,str,str
1,1,2,14022023,930,833,330270,10071954,468,1,4,1,2,1,715210,330190,3,,330190,9,2,"""*R99X""","""*I10X""",,,"""I10""",14022023,20230018,23022023,5,"""S""","""S""","""3.2.30""",3.4,27022023,"""R99/I10""",27022023,13,10,0,1,13,"""I10""",
2,1,2,14022023,1730,822,221110,12091956,466,1,4,2,4,3,21210,221100,1,2323281,221100,1,2,"""*R578""","""*I619""","""*I678""",,"""I678""",14022023,20230028,17022023,5,"""S""","""S""","""3.2.30""",3.4,17022023,"""R578/I619/I678""",17022023,3,12,0,1,3,"""I678""",
3,1,2,14022023,500,822,220790,10011953,470,2,,3,9,9,999992,220790,1,2726971,221100,1,2,"""*A419""","""*K750""",,"""*E149*I10X""","""K750""",14022023,20230028,17022023,5,"""S""","""S""","""3.2.30""",3.4,17022023,"""A419/K750*E149 I10""",17022023,3,9,0,1,3,"""K750""",
4,1,2,14022023,2235,823,231180,15121943,479,2,1,2,4,3,763010,230440,1,2497654,230440,1,2,"""*I611""",,,"""*J189""","""I611""",15022023,20230007,15022023,1,"""S""","""S""","""3.2.30""",,9032023,"""I611*J189""",2032023,16,12,0,1,23,"""I64""","""S"""
5,1,2,14022023,940,841,411350,9041973,449,2,4,1,1,0,999992,510515,2,7050577,510515,1,2,"""*J960""","""*C189""",,,"""C189""",14022023,20240057,14022023,1,"""S""","""S""","""3.2.30""",3.4,30102024,"""J960/C189""",16022023,2,0,0,1,624,"""C189""","""S"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1465606,1,2,30112023,2100,816,160050,1042023,307,1,5,,,,,160050,6,,160050,2,2,"""*R99X""",,,,"""R99""",1122023,,8122023,1,"""N""","""N""","""2...0""",,,"""R99""",8122023,8,,0,1,8,,
1465607,1,2,12122023,1220,815,150060,23101987,436,1,5,,3,2,,150060,3,,150060,2,2,"""*R98X""",,,,"""R98""",,,3012024,,"""N""","""N""","""2...0""",,,"""R98""",3012024,22,11,0,1,22,,
1465608,1,2,22122023,1734,815,150060,5112023,301,1,5,,,,,150060,6,,150060,2,2,"""*R98X""",,,,"""R98""",,,3012024,,"""N""","""N""","""2...0""",,,"""R98""",3012024,12,,0,1,12,,
1465609,1,2,22122023,2030,813,130140,1012008,415,2,5,,1,0,,130140,6,,130140,2,9,"""*X709""",,,,"""X709""",28022024,,17062024,5,"""N""","""N""","""2...0""",,,"""X709""",17062024,178,0,0,1,178,,


## <font color='gray'> Códigos CID</font>

Nesta etapa, vamos selecionar apenas os CIDs relacionados a agressões (X85 a Y09).

Referência: [CID-10 — Capítulo XX: Causas externas de morbidade e de mortalidade (V01–Y98)](http://www2.datasus.gov.br/cid10/V2008/WebHelp/v01_y98.htm)

In [6]:
# Set of ICD codes of interest: X85 through X99, plus Y00 through Y09
codigos = {
    *(f'X{numero}' for numero in range(85, 100)),
    *(f'Y0{digito}' for digito in range(10)),
}
codigos

{'X85',
 'X86',
 'X87',
 'X88',
 'X89',
 'X90',
 'X91',
 'X92',
 'X93',
 'X94',
 'X95',
 'X96',
 'X97',
 'X98',
 'X99',
 'Y00',
 'Y01',
 'Y02',
 'Y03',
 'Y04',
 'Y05',
 'Y06',
 'Y07',
 'Y08',
 'Y09'}

In [7]:
# Names of the DataFrame columns that carry ICD codes
cols_cid = [
    'LINHAA', 'LINHAB', 'LINHAC', 'LINHAII',
    'CAUSABAS', 'ATESTADO', 'CAUSABAS_O',
]

In [8]:
# Keep only the rows where at least one ICD column mentions one of the target codes.
# A single alternation regex ("X85|X86|...|Y09") per column replaces the original
# one-expression-per-(column, code) build-up, so Polars evaluates one regex per
# column instead of one per code. The codes contain only letters and digits, so
# no regex escaping is needed.
padrao_cid = "|".join(sorted(codigos))

# One boolean condition per column; the is_not_null() guard makes a null cell
# evaluate to False ("no match") rather than null.
condicoes = [
    pl.col(col).is_not_null() & pl.col(col).cast(pl.Utf8).str.contains(padrao_cid)
    for col in cols_cid
]

# Combine the per-column conditions with OR
condicao_final = pl.any_horizontal(*condicoes)

# Apply the filter
df_trat = df.filter(condicao_final)
df_trat

contador,ORIGEM,TIPOBITO,DTOBITO,HORAOBITO,NATURAL,CODMUNNATU,DTNASC,IDADE,SEXO,RACACOR,ESTCIV,ESC,ESC2010,OCUP,CODMUNRES,LOCOCOR,CODESTAB,CODMUNOCOR,ASSISTMED,NECROPSIA,LINHAA,LINHAB,LINHAC,LINHAII,CAUSABAS,DTATESTADO,NUMEROLOTE,DTCADASTRO,ATESTANTE,STCODIFICA,CODIFICADO,VERSAOSIST,VERSAOSCB,DTRECEBIM,ATESTADO,DTRECORIGA,OPOR_DO,ESCFALAGR1,STDOEPIDEM,STDONOVA,DIFDATA,CAUSABAS_O,TPPOS
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,str,i64,i64,i64,i64,str,str,str,f64,i64,str,i64,i64,i64,i64,i64,i64,str,str
140,1,2,1012023,2123,823,230110,4081984,438,1,4,1,3,1,991315,230110,5,,230110,1,1,"""*T794""","""*X954""",,,"""X954""",2012023,20230001,5012023,3,"""S""","""S""","""3.2.30""",3.4,12012023,"""T794/X954""",12012023,11,2,0,1,11,"""X954""",
173,1,2,1012023,1250,833,330170,2121957,465,2,4,1,3,2,512105,330455,1,2280183,330455,1,2,"""*T792""","""*S273*S269""","""*X959""",,"""X959""",3012023,20240094,11012023,3,"""S""","""S""","""3.2.30""",3.4,2072024,"""T792/S273 S269/X959""",16012023,15,11,0,1,548,"""Y349""","""S"""
174,1,2,1012023,,843,431680,14061993,429,1,1,1,4,2,,430420,4,,430420,2,1,"""*T145""","""*S368""",,,"""X954""",1012023,20230001,5012023,3,"""S""","""S""","""3.2.30""",3.4,5012023,"""T145/S368/ /X954""",5012023,4,4,0,1,4,"""X954""","""N"""
239,1,2,1012023,322,835,354390,14031980,442,1,1,2,3,2,724315,354390,4,,354390,2,2,"""*T794""","""*X990""",,,"""X990""",1012023,20230005,9012023,3,"""S""","""S""","""3.2.30""",3.4,11012023,"""T794/X990""",11012023,10,3,0,1,10,"""X990""","""N"""
286,1,2,1012023,,829,291080,4061996,426,1,4,1,4,3,,330455,5,,330455,2,1,"""*X958""","""*S318*S219*T792""",,,"""X958""",2012023,20230007,11012023,3,"""S""","""S""","""3.2.30""",3.4,16012023,"""X958/S318 S219 T792""",16012023,15,12,0,1,15,"""X958""","""N"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1465467,2,2,10122023,1945,829,291840,8111978,445,1,4,1,1,0,621005,293077,5,,293077,2,1,"""*T792""","""*S069""","""*X959""",,"""X959""",11122023,20230047,14122023,3,"""S""","""S""","""3.2.30""",3.4,,"""T792/S069/X959""",14122023,4,0,0,1,4,,
1465475,2,2,11122023,11,826,260300,20021995,428,1,4,9,2,1,715210,290720,5,,290720,2,1,"""*T792""","""*S273*S269""","""*S361""",,"""X959""",11122023,20230047,14122023,3,"""S""","""S""","""3.2.30""",3.4,,"""T792/S273 S269/S361/X959""",14122023,3,10,0,1,3,,
1465478,2,2,11122023,116,826,261400,20121981,441,1,4,1,3,2,621005,290720,1,2388928,290720,1,1,"""*S068""","""*S069""","""*X959""",,"""X959""",11122023,20230047,14122023,3,"""S""","""S""","""3.2.30""",3.4,,"""S068/S069/X959""",14122023,3,11,0,1,3,,
1465479,2,2,11122023,11,829,291840,27081993,430,1,4,1,3,2,621005,290720,5,,290720,2,1,"""*S068""","""*S069""","""*X959""",,"""X959""",11122023,20230047,14122023,3,"""S""","""S""","""3.2.30""",3.4,,"""S068/S069/X959""",14122023,3,11,0,1,3,,


In [9]:
# Count of filtered records for each SEXO value
df_trat.get_column('SEXO').value_counts()

SEXO,count
i64,u32
2,3945
0,97
1,40045
