In [1]:
import pandas as pd

In [2]:
df15 = pd.read_csv('dataset/2015.csv', encoding='cp1252', sep=';')
df16 = pd.read_csv('dataset/2016.csv', encoding='cp1252', sep=';')
df17 = pd.read_csv('dataset/2017.csv', encoding='cp1252', sep=';')
df18 = pd.read_csv('dataset/2018.csv', encoding='cp1252', sep=';')
df19 = pd.read_csv('dataset/2019.csv', encoding='cp1252', sep=';')
df20 = pd.read_csv('dataset/2020.csv', encoding='cp1252', sep=';')
df21 = pd.read_csv('dataset/2021.csv', encoding='cp1252', sep=';')
df22 = pd.read_csv('dataset/2022.csv', encoding='cp1252', sep=';')
df23 = pd.read_csv('dataset/2023.csv', encoding='cp1252', sep=';')
df24 = pd.read_csv('dataset/2024.csv', encoding='cp1252', sep=';')

In [9]:
df = pd.concat([df15, df16, df17, df18, df19, df20, df21, df22, df23, df24])

In [10]:
desastres = list(df['COBRADE'].unique())

In [11]:
id_full_disasters = {}
for d in desastres:
    cod = d[:5]
    id_full_disasters[cod] = d[8:]

In [12]:
keep_cols = [
    'UF', 'Municï¿½pio', 'Registro', 'COBRADE',
    'Status', 'Populaï¿½ï¿½o', 'DH_Mortos', 'DH_Feridos', 'DH_Enfermos',
    'DH_Desabrigados', 'DH_Desalojados'
]
df = df[keep_cols]
df['DH_Feridos_Enfermos'] = df['DH_Feridos'] + df['DH_Enfermos']
df['DH_Desabrigados_Desalojados'] = df['DH_Desabrigados'] + df['DH_Desalojados']
df = df.drop(['DH_Feridos', 'DH_Enfermos', 'DH_Desabrigados', 'DH_Desalojados'], axis=1)

In [13]:
df = df.rename(columns={'Municï¿½pio': 'Municipio', 'COBRADE': 'Desastre', 'Populaï¿½ï¿½o': 'Populacao'})
df['Desastre'] = df['Desastre'].str[:5]
df['Data'] = pd.to_datetime(df['Registro'], format='%d/%m/%Y')
df = df.drop(['Registro'], axis=1)
df = df.reset_index(drop=True)

In [129]:
df

Unnamed: 0,UF,Desastre,DH_Mortos,DH_Feridos_Enfermos,DH_Desabrigados_Desalojados,Data
0,RJ,13215,0.0,0.0,0.0,2016-01-05
1,RS,12100,0.0,0.0,280.0,2015-12-30
2,RS,12100,0.0,0.0,0.0,2015-12-30
3,SC,12200,0.0,0.0,0.0,2016-01-05
4,RS,13214,0.0,0.0,0.0,2015-12-30
...,...,...,...,...,...,...
64057,PR,15110,0.0,254.0,0.0,2024-04-05
64058,CE,13214,1.0,0.0,0.0,2024-01-10
64059,BA,14110,0.0,0.0,0.0,2024-01-18
64060,RN,11410,0.0,0.0,0.0,2024-01-01


## 2. Computation of Frequent Eventsets:

- Remove "itemsets" with "Desastre" type count less than min_sup

In [14]:
min_sup = 0.5/100
count = dict(df['Desastre'].value_counts())
tot = sum(count.values())
disasters_to_drop = set()

for d, c in count.items():
    if c/tot < min_sup:
        disasters_to_drop.add(d)

# doencas virais
disasters_to_drop.add('15110')

df_min_sup = df[~df['Desastre'].isin(disasters_to_drop)]

## 3. Calculating frequent sequences

1. With pycspade
2. Transform into:

```
Your data needs to be in a particular format similar to the following:

1 1 3 8 37 42
1 2 4 4 11 37 42
2 1 2 10 73
2 2 1 72
2 3 3 4 24 77
...
The first number is the sequence index, the second is the event index, the third is the number of elements, followed by the element, space separated

data = [
    [1, 10, [3, 4]],
    [1, 15, [1, 2, 3]],
    [1, 20, [1, 2, 6]],
    [1, 25, [1, 3, 4, 6]],
    [2, 15, [1, 2, 6]],
    [2, 20, [5]],
    [3, 10, [1, 2, 6]],
    [4, 10, [4, 7, 8]],
    [4, 20, [2, 6]],
    [4, 25, [1, 7, 8]]
]

To construct this:
1. Identify all unique items: Extract all unique values from your Desastre column.
2. Populate id-lists: For each unique Desastre item, create a list of (sid, eid) pairs where that item appeared.
Here’s an illustrative partial example of your data in vertical format:

```

In [15]:
d_ids = {}
for id, d in enumerate(df_min_sup['Desastre'].unique()):
    d_ids[d] = id + 1

dates_ids = {}
for id, d in enumerate(df_min_sup.sort_values('Data')['Data'].unique()):
    dates_ids[str(d.astype('datetime64[D]'))] = id + 1

df_min_sup['UF_Municipio'] = df_min_sup['UF'].astype(str) + '-' + df_min_sup['Municipio']
uf_mun_ids = {}
for id, uf_mun in enumerate(df_min_sup['UF_Municipio'].unique()):
    uf_mun_ids[uf_mun] = id + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_min_sup['UF_Municipio'] = df_min_sup['UF'].astype(str) + '-' + df_min_sup['Municipio']


In [16]:
vertical_dataset = {}
for uf_mun in df_min_sup['UF_Municipio'].unique():
    events = pd.DataFrame(df_min_sup[df_min_sup['UF_Municipio'] == uf_mun])
    events['Data'] = events['Data'].astype('datetime64[D]')
    items = events[['Data', 'Desastre']]
    vertical_dataset[uf_mun] = items

In [17]:
max_id = 0

spade_input = []
for uf_city, events in vertical_dataset.items():
    entries = []
    for date in events.sort_values('Data')['Data'].unique():
        disasters = list(set(events[events['Data'] == date]['Desastre'].values))
        np_date = str(pd.Timestamp(date).to_numpy().astype('datetime64[D]'))
        entries.append([uf_mun_ids[uf_city], dates_ids[np_date], disasters])
        # entries.append([uf_mun_ids[uf_city], dates_ids[np_date], list(map(lambda x: d_ids[x], disasters))])
    spade_input += entries

In [19]:
from pycspade.helpers import spade, print_result

In [91]:
print(df_min_sup.head().to_csv())

,UF,Municipio,Desastre,Status,Populacao,DH_Mortos,DH_Feridos_Enfermos,DH_Desabrigados_Desalojados,Data,UF_Municipio
0,RJ,Duque de Caxias,13215,Registro,855046,0.0,0.0,0.0,2016-01-05,RJ-Duque de Caxias
1,RS,Manoel Viana,12100,Reconhecido,7074,0.0,0.0,280.0,2015-12-30,RS-Manoel Viana
2,RS,Paraï¿½so do Sul,12100,Reconhecido,7336,0.0,0.0,0.0,2015-12-30,RS-Paraï¿½so do Sul
3,SC,ï¿½guas Mornas,12200,Registro,5546,0.0,0.0,0.0,2016-01-05,SC-ï¿½guas Mornas
4,RS,Sï¿½o Vicente do Sul,13214,Reconhecido,8440,0.0,0.0,0.0,2015-12-30,RS-Sï¿½o Vicente do Sul



In [83]:
result = spade(data=spade_input, support=0.15)
print_result(result)

fmt_rules = []

for rule in result['mined_objects']:
    fmt_rules.append(list(map(lambda x: str(x.__dict__['elements'][0]), rule.__dict__['items'])))

fmt_rules

   Occurs     Accum   Support    Confid      Lift                                                                         Sequence
      861       861 0.1808444       N/A       N/A                                                                          (12100) 
     1109      1109 0.2329343       N/A       N/A                                                                          (12200) 
      784       784 0.1646713 0.7069432 1.2042063                                                                 (12200)->(13214) 
      794       794 0.1667717       N/A       N/A                                                                          (12300) 
     2795     11164 0.5870615       N/A       N/A                                                                          (13214) 
     1690      3518 0.3549674 0.6046512 1.0299621                                                                 (13214)->(13214) 
     1012      1012 0.2125604 0.5988166 1.0200235                            

[['12100'],
 ['12200'],
 ['12200', '13214'],
 ['12300'],
 ['13214'],
 ['13214', '13214'],
 ['13214', '13214', '13214'],
 ['13214', '14110'],
 ['13215'],
 ['13215', '13214'],
 ['14110'],
 ['14110', '13214'],
 ['14110', '13214', '13214'],
 ['14110', '14110'],
 ['14110', '14110', '13214'],
 ['14110', '14110', '14110'],
 ['14110', '14110', '14110', '13214'],
 ['14110', '14110', '14110', '14110'],
 ['14110', '14110', '14110', '14110', '14110'],
 ['14110', '14110', '14110', '14110', '14110', '14110'],
 ['14110', '14110', '14110', '14110', '14110', '14110', '14110'],
 ['14110', '14120'],
 ['14120'],
 ['14120', '14110'],
 ['14120', '14120'],
 ['14132']]

## 4. Contrast Set Mining

In [84]:
import pandas as pd
from collections import defaultdict

# Load disaster events
df = df_min_sup

# Group disasters per location, ordered by date
location_sequences = df.sort_values("Data").groupby("UF_Municipio")["Desastre"].apply(list)

frequent_sequences = list(filter(lambda x: len(x) > 1, fmt_rules))
sequence_names = [f"Seq_{i}" for i in range(len(frequent_sequences))]

binary_table = defaultdict(dict)
for loc, seq in location_sequences.items():
    for idx, fseq in enumerate(frequent_sequences):
        name = sequence_names[idx]
        # Check if the frequent sequence is a subsequence of this location's disaster list
        def is_subsequence(small, big):
            # 
            it = iter(big)
            ok = all(x in it for x in small)
            return ok

        binary_table[loc][name] = int(is_subsequence(fseq, seq))

# Convert to DataFrame
sequence_df = pd.DataFrame.from_dict(binary_table, orient='index').reset_index().rename(columns={'index': 'UF_Municipio'})

# Merge with original static info (UF, Municipio, Populacao)
static_df = df.drop_duplicates("UF_Municipio")[['UF_Municipio', 'UF', 'Municipio', 'Populacao', 'DH_Mortos', 'DH_Feridos_Enfermos', 'DH_Desabrigados_Desalojados']]
final_df = pd.merge(static_df, sequence_df, on="UF_Municipio")

In [85]:
frequent_sequences_names = list(map(lambda sequence: list(map(lambda single: id_full_disasters[single], sequence)), frequent_sequences))

In [86]:
list(zip(frequent_sequences_names, sequence_names))

[(['Enxurradas', 'Tempestade Local/Convectiva - Chuvas Intensas'], 'Seq_0'),
 (['Tempestade Local/Convectiva - Chuvas Intensas',
   'Tempestade Local/Convectiva - Chuvas Intensas'],
  'Seq_1'),
 (['Tempestade Local/Convectiva - Chuvas Intensas',
   'Tempestade Local/Convectiva - Chuvas Intensas',
   'Tempestade Local/Convectiva - Chuvas Intensas'],
  'Seq_2'),
 (['Tempestade Local/Convectiva - Chuvas Intensas', 'Estiagem'], 'Seq_3'),
 (['Tempestade Local/Convectiva - Vendaval',
   'Tempestade Local/Convectiva - Chuvas Intensas'],
  'Seq_4'),
 (['Estiagem', 'Tempestade Local/Convectiva - Chuvas Intensas'], 'Seq_5'),
 (['Estiagem',
   'Tempestade Local/Convectiva - Chuvas Intensas',
   'Tempestade Local/Convectiva - Chuvas Intensas'],
  'Seq_6'),
 (['Estiagem', 'Estiagem'], 'Seq_7'),
 (['Estiagem', 'Estiagem', 'Tempestade Local/Convectiva - Chuvas Intensas'],
  'Seq_8'),
 (['Estiagem', 'Estiagem', 'Estiagem'], 'Seq_9'),
 (['Estiagem',
   'Estiagem',
   'Estiagem',
   'Tempestade Local/Co

In [87]:
import pysubgroup as ps

def subgroup(sequence):
    target = ps.BinaryTarget(sequence, 1)
    search_space = ps.create_selectors(final_df, ignore=['UF_Municipio', *sequence_names])  # Could add Populacao bins, Municipio, etc.
    
    task = ps.SubgroupDiscoveryTask(
        final_df,
        target,
        search_space,
        result_set_size=5,
        depth=5,
        qf=ps.WRAccQF()
    )
    
    result = ps.DFS().execute(task)
    for r in list(result.to_dataframe()['subgroup']):
        print(r)

In [88]:
for i in range(len(sequence_names)):
    print(f"====={frequent_sequences_names[i]}======")
    print(subgroup(sequence_names[i]))

UF=='RS'
DH_Mortos: [0.0:1.0[ AND UF=='RS'
DH_Feridos_Enfermos: [0.0:1.0[ AND UF=='RS'
DH_Feridos_Enfermos: [0.0:1.0[ AND DH_Mortos: [0.0:1.0[ AND UF=='RS'
UF=='SC'
None
DH_Desabrigados_Desalojados>=20.0
UF=='RS'
UF=='MG'
DH_Mortos: [0.0:1.0[ AND UF=='RS'
DH_Feridos_Enfermos: [0.0:1.0[ AND UF=='RS'
None
UF=='SC'
DH_Mortos: [0.0:1.0[ AND UF=='SC'
DH_Desabrigados_Desalojados>=20.0
UF=='MG'
DH_Feridos_Enfermos: [0.0:1.0[ AND UF=='SC'
None
UF=='SC'
DH_Mortos: [0.0:1.0[ AND UF=='SC'
DH_Feridos_Enfermos: [0.0:1.0[ AND UF=='SC'
DH_Feridos_Enfermos: [0.0:1.0[ AND DH_Mortos: [0.0:1.0[ AND UF=='SC'
DH_Desabrigados_Desalojados: [0.0:1.0[ AND DH_Mortos: [0.0:1.0[ AND UF=='BA'
None
UF=='SC'
DH_Mortos: [0.0:1.0[ AND UF=='SC'
DH_Feridos_Enfermos: [0.0:1.0[ AND UF=='SC'
DH_Feridos_Enfermos: [0.0:1.0[ AND DH_Mortos: [0.0:1.0[ AND UF=='SC'
UF=='RS'
None
UF=='RS'
DH_Mortos: [0.0:1.0[ AND UF=='RS'
DH_Feridos_Enfermos: [0.0:1.0[ AND UF=='RS'
DH_Feridos_Enfermos: [0.0:1.0[ AND DH_Mortos: [0.0:1.0[ AND UF=='

In [94]:
frequent_sequences

[['12200', '13214'],
 ['13214', '13214'],
 ['13214', '13214', '13214'],
 ['13214', '14110'],
 ['13215', '13214'],
 ['14110', '13214'],
 ['14110', '13214', '13214'],
 ['14110', '14110'],
 ['14110', '14110', '13214'],
 ['14110', '14110', '14110'],
 ['14110', '14110', '14110', '13214'],
 ['14110', '14110', '14110', '14110'],
 ['14110', '14110', '14110', '14110', '14110'],
 ['14110', '14110', '14110', '14110', '14110', '14110'],
 ['14110', '14110', '14110', '14110', '14110', '14110', '14110'],
 ['14110', '14120'],
 ['14120', '14110'],
 ['14120', '14120']]

## 5. Subgroup Discovery

After calculating the frequent sequences, we must find some subgroups. The example target variables I thought:


> - Subsequence of Disaster Events → Unusual Occurrence of a Specific Disaster Type: "Which cities frequently experience <Drought → Wildfire> sequences and then show an unusually high (or low) probability of also experiencing a Heatwave in the subsequent period compared to other cities?"

> -  Subsequence of Disaster Events → Unusual Severity/Impact: "Which cities frequently experience <Flood → Landslide> sequences, and these sequences are associated with an unusually high Number of Displaced People compared to the average across all flood-landslide sequences?"

> - City/State Attributes + Subsequence → Unusual Trend: "Are there specific sequences of events (e.g., <Heavy Rain → Flash Flood>) that are unusually prevalent in cities within a certain State or Region (e.g., Amazonian States vs. Northeastern States) and show a distinct seasonal pattern (Time Window, Month)?"

In [124]:
SEQ = frequent_sequences[5]
target_col = "DH_Desabrigados_Desalojados"

rows = []

df = df_min_sup

for loc, group in df.groupby("UF_Municipio"):
    events = group.sort_values("Data")
    disasters = events["Desastre"].tolist()
    dates = events["Data"].tolist()
    indices = events.index.tolist()

    for i in range(len(disasters) - len(SEQ) + 1):
        if disasters[i:i+len(SEQ)] == SEQ:
            last_idx = indices[i + len(SEQ) - 1]
            row = events.loc[last_idx].copy()
            row["Match_Start"] = dates[i]
            row["Match_End"] = dates[i+len(SEQ)-1]
            rows.append(row)

df_seq2 = pd.DataFrame(rows)

In [116]:
df_seq2

Unnamed: 0,UF,Municipio,Desastre,Status,Populacao,DH_Mortos,DH_Feridos_Enfermos,DH_Desabrigados_Desalojados,Data,UF_Municipio,Match_Start,Match_End
47675,AL,Belï¿½m,13214,Reconhecido,4551,0.0,0.0,4.0,2022-05-26,AL-Belï¿½m,2022-05-05,2022-05-26
47202,AL,Branquinha,13214,Reconhecido,10586,0.0,0.0,1700.0,2022-07-04,AL-Branquinha,2017-02-19,2022-07-04
47605,AL,Coitï¿½ do Nï¿½ia,13214,Reconhecido,10926,0.0,0.0,26.0,2022-05-26,AL-Coitï¿½ do Nï¿½ia,2022-02-24,2022-05-26
47153,AL,Dois Riachos,13214,Nï¿½o reconhecido,10879,0.0,0.0,25.0,2022-07-09,AL-Dois Riachos,2022-01-06,2022-07-09
47402,AL,Estrela de Alagoas,13214,Reconhecido,17254,0.0,0.0,20.0,2022-06-15,AL-Estrela de Alagoas,2022-01-13,2022-06-15
...,...,...,...,...,...,...,...,...,...,...,...,...
48090,SC,Xaxim,13214,Registro,25697,0.0,0.0,0.0,2022-05-11,SC-Xaxim,2021-12-22,2022-05-11
48119,SC,ï¿½gua Doce,13214,Registro,6960,0.0,0.0,0.0,2022-05-05,SC-ï¿½gua Doce,2022-01-06,2022-05-05
53221,SC,ï¿½guas Frias,13214,Registro,2424,0.0,0.0,0.0,2023-11-17,SC-ï¿½guas Frias,2022-01-12,2023-11-17
20381,SE,Pinhï¿½o,13214,Registro,5973,0.0,0.0,0.0,2020-11-10,SE-Pinhï¿½o,2020-10-14,2020-11-10


In [125]:
target = ps.NumericTarget(target_col)
search_space = ps.create_selectors(df_seq2, ignore=[
    "Desastre", "Status", "Data", "UF_Municipio", "Match_Start", "Match_End"
])

qf = ps.StandardQFNumeric(a=1)
task = ps.SubgroupDiscoveryTask(
    data=df_seq2,
    target=target,
    search_space=search_space,
    result_set_size=100,
    depth=4,
    qf=qf,
)

result = ps.DFS().execute(task)

# global_mean = df_seq2["DH_Desabrigados_Desalojados"].mean()
# for sg in result[:10]:
#     if sg.statistics['mean_target'] > global_mean:
#         print(sg.subgroup_description, sg.quality, sg.statistics)

# result = ps.BeamSearch().execute(task)
result.to_dataframe()

  statistics["median_sg"] / statistics["median_dataset"]


Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,98503.273270,DH_Desabrigados_Desalojados>=87.0,169,838,739.378698,156.519093,1151.672696,594.630413,314.0,0.0,8460.0,8460.0,87.0,0.0,4.723888,inf
1,86549.693317,DH_Desabrigados_Desalojados>=87.0 AND DH_Morto...,147,838,745.292517,156.519093,1196.487099,594.630413,304.0,0.0,8460.0,8460.0,87.0,0.0,4.761672,inf
2,65237.403341,DH_Desabrigados_Desalojados>=87.0 AND DH_Ferid...,136,838,636.205882,156.519093,933.006487,594.630413,290.0,0.0,8460.0,8460.0,87.0,0.0,4.064717,inf
3,59869.556086,DH_Desabrigados_Desalojados>=87.0 AND DH_Ferid...,128,838,624.250000,156.519093,940.892753,594.630413,278.0,0.0,8460.0,8460.0,87.0,0.0,3.988331,inf
4,40612.816229,DH_Desabrigados_Desalojados>=87.0 AND UF=='BA',62,838,811.564516,156.519093,946.256284,594.630413,325.0,0.0,4212.0,8460.0,90.0,0.0,5.185083,inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,8303.480907,DH_Desabrigados_Desalojados>=87.0 AND Municipi...,1,838,8460.000000,156.519093,0.000000,594.630413,8460.0,0.0,8460.0,8460.0,8460.0,0.0,54.050914,inf
96,8303.480907,DH_Desabrigados_Desalojados>=87.0 AND DH_Morto...,1,838,8460.000000,156.519093,0.000000,594.630413,8460.0,0.0,8460.0,8460.0,8460.0,0.0,54.050914,inf
97,8303.480907,DH_Desabrigados_Desalojados>=87.0 AND DH_Morto...,1,838,8460.000000,156.519093,0.000000,594.630413,8460.0,0.0,8460.0,8460.0,8460.0,0.0,54.050914,inf
98,8303.480907,DH_Desabrigados_Desalojados>=87.0 AND DH_Morto...,1,838,8460.000000,156.519093,0.000000,594.630413,8460.0,0.0,8460.0,8460.0,8460.0,0.0,54.050914,inf
