In [1]:
import polars as pl

In [2]:
queimadas_df = pl.read_parquet("data/queimadas-full.pqt.zstd")
municipios_df = pl.read_csv("data/municipios.csv", separator=";",encoding='iso-8859-1')
uf_df = pl.read_csv("data/uf.csv")

In [3]:
municipios_df.head()

CÓDIGO DO MUNICÍPIO - TOM,CÓDIGO DO MUNICÍPIO - IBGE,MUNICÍPIO - TOM,MUNICÍPIO - IBGE,UF
i64,i64,str,str,str
1,1100106,"""GUAJARÁ-MIRIM""","""Guajará-Mirim""","""RO"""
2,1100379,"""ALTO ALEGRE DOS PARECIS""","""Alto Alegre dos Parecis""","""RO"""
3,1100205,"""PORTO VELHO""","""Porto Velho""","""RO"""
4,1100452,"""BURITIS""","""Buritis""","""RO"""
5,1100122,"""JI-PARANÁ""","""Ji-Paraná""","""RO"""


In [4]:
uf_df.head()

id_uf,sigla,nome,regiao
i64,str,str,str
42,"""SC""","""Santa Catarina""","""Sul"""
41,"""PR""","""Paraná""","""Sul"""
43,"""RS""","""Rio Grande do Sul""","""Sul"""
11,"""RO""","""Rondônia""","""Norte"""
13,"""AM""","""Amazonas""","""Norte"""


In [5]:
queimadas_df = queimadas_df.filter(pl.col('bioma').is_not_null())
queimadas_df = queimadas_df.with_columns([pl.col('dias_sem_chuva').cast(pl.Int64)])
queimadas_df = queimadas_df.with_columns([pl.col('dias_sem_chuva').replace(-999,None)])
queimadas_df = queimadas_df.with_columns([pl.col('risco_fogo').replace(-999,None)])
queimadas_df

ano,mes,data_hora,bioma,sigla_uf,id_municipio,latitude,longitude,satelite,dias_sem_chuva,precipitacao,risco_fogo,potencia_radiativa_fogo
i64,i64,datetime[μs],str,str,str,f64,f64,str,i64,f64,f64,f64
2008,1,2008-01-01 15:54:00,"""Mata Atlântica""","""BA""","""2900801""",-17.406,-39.387,,,,,
2008,1,2008-01-01 15:55:00,"""Caatinga""","""AL""","""2702306""",-10.085,-36.342,,,,,
2008,1,2008-01-01 15:55:00,"""Caatinga""","""SE""","""2805604""",-9.931,-37.239,,,,,
2008,1,2008-01-01 15:55:00,"""Caatinga""","""SE""","""2807402""",-10.971,-38.002,,,,,
2008,1,2008-01-01 15:55:00,"""Caatinga""","""SE""","""2807402""",-10.969,-37.988,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…
2003,12,2003-12-31 16:33:00,"""Amazônia""","""MA""","""2109239""",-2.291,-45.786,,,,,
2003,12,2003-12-31 16:33:00,"""Amazônia""","""MA""","""2109239""",-2.289,-45.777,,,,,
2003,12,2003-12-31 16:33:00,"""Amazônia""","""MA""","""2109239""",-2.28,-45.778,,,,,
2003,12,2003-12-31 16:33:00,"""Amazônia""","""MA""","""2110039""",-2.613,-45.984,,,,,


In [6]:
# generate dim_horarios_queimada
# generate all combinations of hour and minutes in a day
minuto_list = 24*[list(range(0, 60))]
dim_horarios_full = pl.DataFrame({
    "hora": list(range(0, 24)),
    "minuto": minuto_list}).explode("minuto").with_row_index("id_horario").select([
        pl.col("id_horario").cast(pl.Int32),
        pl.col("hora").cast(pl.Int8),
        pl.col("minuto").cast(pl.Int8)
    ])
dim_horarios_full

id_horario,hora,minuto
i32,i8,i8
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
…,…,…
1435,23,55
1436,23,56
1437,23,57
1438,23,58


In [7]:
# create dim_data_queimada
dim_data_queimada = (queimadas_df
    .select([pl.col("data_hora").alias("date_time_iso"),
             pl.col("data_hora").dt.day().alias("dia"), 
             pl.col("data_hora").dt.month().alias("mes"), 
             pl.col("data_hora").dt.year().alias("ano"),
             # generate semester column
             pl.col("data_hora").dt.month().map_elements(lambda x: 1 if x <= 6 else 2).alias("semestre"),
             # generate trimester column
             ((pl.col("data_hora").dt.month() - 1 )//3 + 1).alias("trimestre"),
             # generate week day column
             pl.col("data_hora").dt.weekday().alias("dia_semana"),
             # generate day of year column
             pl.col("data_hora").dt.ordinal_day().alias("dia_ano"),
             # generate is_weekend column, starting from saturday (6)
             pl.col("data_hora").dt.weekday().map_elements(lambda x: x >= 6).alias("is_weekend"),
             # generate week of year column
             pl.col("data_hora").dt.week().alias("semana_ano")
             ]).unique().sort("date_time_iso")).with_row_index("id_data")
print(len(dim_data_queimada))
dim_data_queimada

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("data_hora").map_elements(lambda x: ...)
with this one instead:
  + pl.col("data_hora") >= 6

  pl.col("data_hora").dt.weekday().map_elements(lambda x: x >= 6).alias("is_weekend"),


1072406


id_data,date_time_iso,dia,mes,ano,semestre,trimestre,dia_semana,dia_ano,is_weekend,semana_ano
u32,datetime[μs],i8,i8,i32,i64,i8,i8,i16,bool,i8
0,2003-01-01 16:04:00,1,1,2003,1,1,3,1,false,1
1,2003-01-01 16:05:00,1,1,2003,1,1,3,1,false,1
2,2003-01-01 16:06:00,1,1,2003,1,1,3,1,false,1
3,2003-01-01 16:07:00,1,1,2003,1,1,3,1,false,1
4,2003-01-01 16:08:00,1,1,2003,1,1,3,1,false,1
…,…,…,…,…,…,…,…,…,…,…
1072401,2025-02-12 22:20:00,12,2,2025,1,1,3,43,false,7
1072402,2025-02-12 22:30:00,12,2,2025,1,1,3,43,false,7
1072403,2025-02-12 22:50:00,12,2,2025,1,1,3,43,false,7
1072404,2025-02-12 23:00:00,12,2,2025,1,1,3,43,false,7


In [8]:
# add season to dim_data_queimada using day and month
# AAAAAAAAAAAAAA there is no built-in season function in polars
# so we have to do it manually
# Summer - 1: Dec 21 - Mar 19
# Autumn - 2: Mar 20 - Jun 20
# Winter - 3: Jun 21 - Sep 21
# Spring - 4: Sep 22 - Dec 20
dim_data_queimada = dim_data_queimada.with_columns(
    pl.when(( (pl.col("mes") == 12) & (pl.col("dia") >= 21) ) | (pl.col("mes").is_in([1,2])) | ((pl.col("mes") == 3) & (pl.col("dia") < 20)))
    .then(1)
    .when(( (pl.col("mes") == 3) & (pl.col("dia") >= 20) ) | (pl.col("mes").is_in([4,5])) | ((pl.col("mes") == 6) & (pl.col("dia") < 21)))
    .then(2)
    .when(( (pl.col("mes") == 6) & (pl.col("dia") >= 21) ) | (pl.col("mes").is_in([7,8])) | ((pl.col("mes") == 9) & (pl.col("dia") < 22)))
    .then(3)
    .otherwise(4).alias("estacao")
)
dim_data_queimada.head()

id_data,date_time_iso,dia,mes,ano,semestre,trimestre,dia_semana,dia_ano,is_weekend,semana_ano,estacao
u32,datetime[μs],i8,i8,i32,i64,i8,i8,i16,bool,i8,i32
0,2003-01-01 16:04:00,1,1,2003,1,1,3,1,False,1,1
1,2003-01-01 16:05:00,1,1,2003,1,1,3,1,False,1,1
2,2003-01-01 16:06:00,1,1,2003,1,1,3,1,False,1,1
3,2003-01-01 16:07:00,1,1,2003,1,1,3,1,False,1,1
4,2003-01-01 16:08:00,1,1,2003,1,1,3,1,False,1,1


In [9]:
# generate dim_local_queimada
dim_local_queimada = (queimadas_df
    .select([pl.col("id_municipio").alias("id_municipio").cast(pl.Int32),
             pl.col("sigla_uf").alias("sigla_uf"),
             pl.col("bioma").alias("bioma"),
             pl.col("latitude").alias("latitude"),
             pl.col("longitude").alias("longitude"),]).unique().sort(["id_municipio", "sigla_uf"]))
dim_local_queimada.head()

id_municipio,sigla_uf,bioma,latitude,longitude
i32,str,str,f64,f64
1100015,"""RO""","""Amazônia""",-12.93848,-62.64406
1100015,"""RO""","""Amazônia""",-12.9396,-62.702801
1100015,"""RO""","""Amazônia""",-12.91771,-62.74052
1100015,"""RO""","""Amazônia""",-12.145,-62.341
1100015,"""RO""","""Amazônia""",-12.449,-62.024


In [10]:
# join with municipios to get municipio name
# join with uf to get uf name and regiao
dim_local_queimada = (dim_local_queimada
    .join(municipios_df.select([pl.col("MUNICÍPIO - IBGE"), pl.col('CÓDIGO DO MUNICÍPIO - IBGE')]), left_on="id_municipio", right_on="CÓDIGO DO MUNICÍPIO - IBGE", how="left")
    .join(uf_df.select([pl.col("sigla"), pl.col("nome"), pl.col("regiao")]), left_on="sigla_uf", right_on="sigla", how="left"))
dim_local_queimada = dim_local_queimada.select([
    pl.col("id_municipio"),
    pl.col("MUNICÍPIO - IBGE").alias("nome_municipio"),
    pl.col("sigla_uf"),
    pl.col("nome").alias("nome_uf"),
    pl.col("regiao").alias("regiao_uf"),
    pl.col("bioma"),
    pl.col("latitude"),
    pl.col("longitude"),
]).sort(["id_municipio", "sigla_uf"]).unique().with_row_index("id_local")
print(len(dim_local_queimada))
dim_local_queimada

14612826


id_local,id_municipio,nome_municipio,sigla_uf,nome_uf,regiao_uf,bioma,latitude,longitude
u32,i32,str,str,str,str,str,f64,f64
0,1300904,"""Canutama""","""AM""","""Amazonas""","""Norte""","""Amazônia""",-8.4509,-64.02235
1,1503002,"""Faro""","""PA""","""Pará""","""Norte""","""Amazônia""",-1.134,-58.087
2,1704105,"""Centenário""","""TO""","""Tocantins""","""Norte""","""Cerrado""",-9.269,-47.661
3,1100015,"""Alta Floresta D'Oeste""","""RO""","""Rondônia""","""Norte""","""Amazônia""",-12.90656,-62.63863
4,1200344,"""Manoel Urbano""","""AC""","""Acre""","""Norte""","""Amazônia""",-9.03188,-69.44276
…,…,…,…,…,…,…,…,…
14612821,5219803,"""São Domingos""","""GO""","""Goiás""","""Centro-Oeste""","""Cerrado""",-13.50817,-46.28771
14612822,1100940,"""Cujubim""","""RO""","""Rondônia""","""Norte""","""Amazônia""",-9.33955,-62.37102
14612823,2108009,"""Pastos Bons""","""MA""","""Maranhão""","""Nordeste""","""Cerrado""",-6.53551,-43.96024
14612824,5216304,"""Paranaiguara""","""GO""","""Goiás""","""Centro-Oeste""","""Mata Atlântica""",-18.872,-50.715


In [11]:
# generate fct_queimadas

# join by date_time
fct_queimadas = (queimadas_df.join(
    dim_data_queimada.select([pl.col("date_time_iso"), pl.col('id_data')]),
    left_on="data_hora", right_on="date_time_iso", how="left"
).with_columns([pl.col('id_municipio').cast(pl.Int32)])
# join by id_municipio, sigla_uf, latitude, longitude, and bioma
.join(
    dim_local_queimada.select([pl.col("id_local"), pl.col("id_municipio"), pl.col("sigla_uf"), pl.col("latitude"), pl.col("longitude"), pl.col("bioma")]),
    left_on=["id_municipio", "sigla_uf", "latitude", "longitude", "bioma"],
    right_on=["id_municipio", "sigla_uf", "latitude", "longitude", "bioma"],
    how="left"
)
# join dim horarios_queimada to get id_horario
.join(
    dim_horarios_full.select([pl.col("id_horario"), pl.col("hora"), pl.col("minuto")]),
    left_on=[pl.col("data_hora").dt.hour(), pl.col("data_hora").dt.minute()],
    right_on=["hora", "minuto"],
    how="left")
.select([
    pl.col("id_data"),
    pl.col("id_local"),
    pl.col("id_horario"),
    pl.col('precipitacao').alias('precipitacao'),
    pl.col('risco_fogo').alias('risco_fogo'),
    pl.col('potencia_radiativa_fogo').alias('potencia_radiativa_fogo'),
    pl.col('dias_sem_chuva').alias('dias_sem_chuva')
])).unique()
    
print(len(fct_queimadas))
fct_queimadas

17542892


id_data,id_local,id_horario,precipitacao,risco_fogo,potencia_radiativa_fogo,dias_sem_chuva
u32,u32,i32,f64,f64,f64,i64
966816,8545342,999,0.0,1.0,7.1,120
996240,6810330,314,0.0,1.0,3.3,9
334956,13092226,1012,0.0,0.24,21.2,2
278376,8121743,1099,0.0,0.25,6.1,4
3676,10414791,1005,,,,
…,…,…,…,…,…,…
951071,1011939,1057,0.0,1.0,9.7,4
304921,13456912,948,0.0,1.0,10.8,17
851812,3583570,1072,0.0,1.0,12.8,89
328883,4460050,242,0.55,0.26,3.8,0


In [12]:
# load to parquet
dim_horarios_full.write_parquet("data/dim_horarios_queimada.pqt.zstd", compression="zstd")
dim_local_queimada.write_parquet("data/dim_local_queimada.pqt.zstd", compression="zstd")
dim_data_queimada.write_parquet("data/dim_data.pqt.zstd", compression="zstd")
fct_queimadas.write_parquet("data/fct_queimadas.pqt.zstd", compression="zstd")