# Descoberta de Subgrupos com dia_categoria - Violência Doméstica MG 2023

Este notebook aplica Subgroup Discovery (SD) incluindo a nova variável `dia_categoria`, que agrupa os dias da semana em três faixas (segunda a quarta, quinta a sábado e domingo) para análise temporal.

In [1]:
# Instalar a biblioteca se necessário
!pip install pysubgroup
import pandas as pd
import pysubgroup as ps


Collecting pysubgroup
  Downloading pysubgroup-0.8.0-py3-none-any.whl.metadata (11 kB)
Downloading pysubgroup-0.8.0-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.5/70.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pysubgroup
Successfully installed pysubgroup-0.8.0


In [6]:
# Read the raw occurrence records; the file is semicolon-delimited.
df = pd.read_csv(
    "violencia_domestica_2023.csv",
    delimiter=";",
)

# Corrigir data
def corrigir_data(valor):
    """Convert one raw ``data_fato`` cell into a ``pd.Timestamp``.

    Numeric inputs are interpreted as spreadsheet serial day counts measured
    from 1899-12-30. They are NOT passed to ``pd.to_datetime``, because that
    function reads a bare number as nanoseconds since 1970-01-01 and would
    silently succeed with a wrong 1970 date instead of reaching the serial
    fallback.

    Any other input is parsed with ``pd.to_datetime``; if parsing fails, the
    value is retried as a serial number via ``float(valor)`` (this covers
    serials stored as text).

    Parameters
    ----------
    valor : scalar
        Raw cell value (string, number, timestamp, ``None`` or NaN).

    Returns
    -------
    pd.Timestamp or pd.NaT
        The parsed date, or ``pd.NaT`` when the value is missing or cannot be
        interpreted either way.
    """
    if valor is None:
        return pd.NaT

    # bool counts as a number for is_number(); keep it on the generic path so
    # it is handled exactly as before.
    eh_numero = pd.api.types.is_number(valor) and not isinstance(valor, bool)

    if eh_numero and pd.isna(valor):
        return pd.NaT

    if not eh_numero:
        try:
            return pd.to_datetime(valor)
        except Exception:
            # Best-effort parsing: fall through to the serial-number attempt.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            pass

    try:
        dias = float(valor)
        return pd.to_datetime("1899-12-30") + pd.to_timedelta(dias, unit="D")
    except Exception:
        return pd.NaT

# Parse every raw date and derive the English weekday name from it.
# Rows whose date could not be parsed carry NaT here and a missing weekday name.
df['data_fato_corrigida'] = df['data_fato'].apply(corrigir_data)
df['dia_da_semana'] = df['data_fato_corrigida'].dt.day_name()

# Weekend vs. working-day flag.
# `isin` yields False for a missing weekday, which would label rows with an
# unknown date as 'dia útil'. Mark them 'desconhecido' instead, the same label
# `categorizar_dia` uses for an unrecognised day.
df['dia_tipo'] = (
    df['dia_da_semana']
    .isin(['Saturday', 'Sunday'])
    .map({True: 'fim de semana', False: 'dia útil'})
    .where(df['dia_da_semana'].notna(), 'desconhecido')
)

def categorizar_dia(dia):
    """Map an English weekday name to its analysis bucket.

    Returns 'segunda a quarta' for Monday-Wednesday, 'quinta a sábado' for
    Thursday-Saturday, 'domingo' for Sunday, and 'desconhecido' for any other
    value (including missing values).
    """
    faixas = (
        (('Monday', 'Tuesday', 'Wednesday'), 'segunda a quarta'),
        (('Thursday', 'Friday', 'Saturday'), 'quinta a sábado'),
        (('Sunday',), 'domingo'),
    )
    # First bucket containing the day wins; the buckets are disjoint.
    for dias, rotulo in faixas:
        if dia in dias:
            return rotulo
    return 'desconhecido'

# Bucket each weekday name element-wise with categorizar_dia.
df['dia_categoria'] = df['dia_da_semana'].map(categorizar_dia)


In [7]:
# Binary target: 1 when the occurrence is recorded as 'CONSUMADO', 0 otherwise.
df['target'] = df['tentado_consumado'].eq('CONSUMADO').astype(int)

# Descriptive attributes offered to the subgroup search.
atributos = [
    'municipio_fato',
    'mes',
    'risp',
    'rmbh',
    'natureza_delito',
    'dia_tipo',
    'dia_categoria',
]

# Keep only the modelling columns and discard rows with any missing value.
df_sd = df.loc[:, [*atributos, 'target']].dropna()

In [8]:
# Build the subgroup-discovery task.
# Quality is measured with WRAcc; candidate selectors are generated from every
# column of df_sd except the target itself.
qf = ps.WRAccQF()
target = ps.BinaryTarget('target', True)
search_space = ps.create_selectors(df_sd, ignore=['target'])

task = ps.SubgroupDiscoveryTask(
    df_sd,
    target,
    search_space,
    result_set_size=10,  # keep the 10 best subgroups
    depth=3,             # at most 3 conjoined selectors per subgroup
    qf=qf,
)

# Run the depth-first search.
result = ps.SimpleDFS().execute(task)

# Show the result table (last expression of the cell).
result.to_dataframe()

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.00085,dia_tipo=='fim de semana' AND rmbh=='3) Interi...,17387,61536,17274,60951,44149,0.28255,0.71745,0.283408,0.716592,0.993501,0.989309,0.990493,1.003036
1,0.00084,natureza_delito=='LESAO CORPORAL' AND rmbh=='3...,8596,61536,8566,60951,52940,0.139691,0.860309,0.140539,0.859461,0.99651,0.989516,0.990493,1.006074
2,0.000796,natureza_delito=='LESAO CORPORAL',9780,61536,9736,60951,51756,0.158931,0.841069,0.159735,0.840265,0.995501,0.989547,0.990493,1.005056
3,0.000622,natureza_delito=='DESCUMPRIMENTO DE MEDIDA PRO...,4029,61536,4029,60951,57507,0.065474,0.934526,0.066102,0.933898,1.0,0.989827,0.990493,1.009598
4,0.000528,dia_tipo=='fim de semana',20876,61536,20710,60951,40660,0.339249,0.660751,0.339781,0.660219,0.992048,0.989695,0.990493,1.00157
5,0.000508,dia_tipo=='dia útil' AND natureza_delito=='LES...,4869,61536,4854,60951,56667,0.079124,0.920876,0.079638,0.920362,0.996919,0.989941,0.990493,1.006488
6,0.000489,natureza_delito=='DESCUMPRIMENTO DE MEDIDA PRO...,3163,61536,3163,60951,58373,0.051401,0.948599,0.051894,0.948106,1.0,0.989978,0.990493,1.009598
7,0.000478,dia_tipo=='dia útil' AND natureza_delito=='LES...,5616,61536,5592,60951,55920,0.091264,0.908736,0.091746,0.908254,0.995726,0.989968,0.990493,1.005283
8,0.00047,rmbh=='3) Interior de MG',50484,61536,50033,60951,11052,0.820398,0.179602,0.820873,0.179127,0.991066,0.987875,0.990493,1.000579
9,0.000442,dia_categoria=='domingo' AND rmbh=='3) Interio...,9383,61536,9321,60951,52153,0.15248,0.84752,0.152926,0.847074,0.993392,0.989972,0.990493,1.002927


In [None]:
# Index the result table by subgroup description, then show the ten rows with
# the highest lift.
df_result = result.to_dataframe().set_index("subgroup")
display(df_result.sort_values(by='lift', ascending=False).iloc[:10])

Unnamed: 0_level_0,quality,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
subgroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
natureza_delito=='DESCUMPRIMENTO DE MEDIDA PROTETIVA DE URGENCIA' AND rmbh=='3) Interior de MG',0.000489,3163,61536,3163,60951,58373,0.051401,0.948599,0.051894,0.948106,1.0,0.989978,0.990493,1.009598
natureza_delito=='DESCUMPRIMENTO DE MEDIDA PROTETIVA DE URGENCIA',0.000622,4029,61536,4029,60951,57507,0.065474,0.934526,0.066102,0.933898,1.0,0.989827,0.990493,1.009598
dia_tipo=='dia útil' AND natureza_delito=='LESAO CORPORAL' AND rmbh=='3) Interior de MG',0.000508,4869,61536,4854,60951,56667,0.079124,0.920876,0.079638,0.920362,0.996919,0.989941,0.990493,1.006488
natureza_delito=='LESAO CORPORAL' AND rmbh=='3) Interior de MG',0.00084,8596,61536,8566,60951,52940,0.139691,0.860309,0.140539,0.859461,0.99651,0.989516,0.990493,1.006074
dia_tipo=='dia útil' AND natureza_delito=='LESAO CORPORAL',0.000478,5616,61536,5592,60951,55920,0.091264,0.908736,0.091746,0.908254,0.995726,0.989968,0.990493,1.005283
natureza_delito=='LESAO CORPORAL',0.000796,9780,61536,9736,60951,51756,0.158931,0.841069,0.159735,0.840265,0.995501,0.989547,0.990493,1.005056
dia_tipo=='fim de semana' AND rmbh=='3) Interior de MG',0.00085,17387,61536,17274,60951,44149,0.28255,0.71745,0.283408,0.716592,0.993501,0.989309,0.990493,1.003036
dia_categoria=='domingo' AND rmbh=='3) Interior de MG',0.000442,9383,61536,9321,60951,52153,0.15248,0.84752,0.152926,0.847074,0.993392,0.989972,0.990493,1.002927
dia_tipo=='fim de semana',0.000528,20876,61536,20710,60951,40660,0.339249,0.660751,0.339781,0.660219,0.992048,0.989695,0.990493,1.00157
rmbh=='3) Interior de MG',0.00047,50484,61536,50033,60951,11052,0.820398,0.179602,0.820873,0.179127,0.991066,0.987875,0.990493,1.000579
