In [174]:
import numpy as np
import pandas as pd
import polars as pl
pl.Config.set_tbl_rows(30)
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from lets_plot import *
LetsPlot.setup_html()

In [260]:
data = pl.read_parquet('../data/dataset_complete_dengue_municipality.parquet')

In [262]:
gdf = data.group_by(['year','geocode','uf']).agg(pl.col('casos').sum().alias('casos'))
gdf = gdf.join(gdf.group_by(['year','uf']).agg(pl.col('casos').sum().alias('casos_uf')),on=['uf','year'])
gdf = gdf.with_columns(
    np.log1p(pl.col('casos')).alias('log_casos'),
    (pl.col('casos')/pl.col('casos_uf')).alias('casos_percent_uf')
)
gdf = gdf.with_columns(
    pl.col("casos_percent_uf")
    .rank("ordinal", descending=True)
    .over(["uf", "year"])
    .alias("rank_within_uf_year")
)

gdf = gdf.sort(['uf','year','casos_percent_uf'],descending=True).with_columns(
    pl.col("casos_percent_uf")
    .cum_sum()
    .over(["uf", "year"])
    .alias("cumulative_share")
)

top_municipalities = gdf.filter(pl.col("cumulative_share") <= 0.9)

In [263]:
gdf.filter(pl.col('uf')=='CE',pl.col('year')==2020).sort('casos')

year,geocode,uf,casos,casos_uf,log_casos,casos_percent_uf,rank_within_uf_year,cumulative_share
i32,i64,str,i64,i64,f64,f64,u32,f64
2020,2313609,"""CE""",0,23966,0.0,0.0,172,1.0
2020,2306306,"""CE""",0,23966,0.0,0.0,173,1.0
2020,2307908,"""CE""",0,23966,0.0,0.0,179,1.0
2020,2304269,"""CE""",0,23966,0.0,0.0,171,1.0
2020,2305209,"""CE""",0,23966,0.0,0.0,177,1.0
2020,2305308,"""CE""",0,23966,0.0,0.0,178,1.0
2020,2304509,"""CE""",0,23966,0.0,0.0,174,1.0
2020,2313906,"""CE""",0,23966,0.0,0.0,183,1.0
2020,2310100,"""CE""",0,23966,0.0,0.0,180,1.0
2020,2304236,"""CE""",0,23966,0.0,0.0,181,1.0


In [264]:
gdf.filter(pl.col('year')==2020).group_by('uf').agg(
    pl.col('casos').sum()
)

uf,casos
str,i64
"""MG""",80864
"""PE""",20021
"""AP""",59
"""RJ""",4171
"""SP""",201902
"""CE""",23966
"""PI""",2199
"""RN""",6817
"""AM""",5622
"""RR""",445


In [265]:
top_municipalities.filter(pl.col('uf')=='CE',pl.col('year')==2021).sort('casos')

year,geocode,uf,casos,casos_uf,log_casos,casos_percent_uf,rank_within_uf_year,cumulative_share
i32,i64,str,i64,i64,f64,f64,u32,f64
2021,2310001,"""CE""",124,35555,4.828314,0.003488,37,0.897427
2021,2310803,"""CE""",126,35555,4.844187,0.003544,36,0.890395
2021,2303956,"""CE""",126,35555,4.844187,0.003544,35,0.893939
2021,2303501,"""CE""",148,35555,5.003946,0.004163,34,0.886851
2021,2306207,"""CE""",150,35555,5.01728,0.004219,33,0.882689
2021,2313302,"""CE""",159,35555,5.075174,0.004472,32,0.87847
2021,2310407,"""CE""",162,35555,5.09375,0.004556,31,0.873998
2021,2300200,"""CE""",164,35555,5.105945,0.004613,30,0.869442
2021,2313559,"""CE""",167,35555,5.123964,0.004697,29,0.864829
2021,2309607,"""CE""",186,35555,5.231109,0.005231,28,0.860132


In [164]:
# Count how many years each municipality is in the top list
stability = top_municipalities.filter(pl.col('year')>2015).group_by(['uf','geocode']).agg(
    pl.col('year').count().alias('years_in_top'),
    pl.col('casos').mean().alias('median_casos')
)
stability

uf,geocode,years_in_top,median_casos
str,i64,u32,f64
"""RO""",1101401,4,137.5
"""BA""",2905602,3,356.666667
"""MS""",5002407,4,821.0
"""SP""",3556206,6,1473.666667
"""RN""",2402808,1,90.0
…,…,…,…
"""AL""",2702355,3,122.333333
"""SP""",3533205,2,488.0
"""RJ""",3301504,2,1955.0
"""SP""",3530508,6,1142.0


In [165]:
stability.filter(pl.col('uf')=='CE').sort('years_in_top').tail(10)

uf,geocode,years_in_top,median_casos
str,i64,u32,f64
"""CE""",2301109,7,235.142857
"""CE""",2304202,7,528.142857
"""CE""",2305506,7,383.571429
"""CE""",2311801,7,710.428571
"""CE""",2311405,7,431.428571
"""CE""",2305407,7,534.714286
"""CE""",2307601,7,652.285714
"""CE""",2302503,9,839.444444
"""CE""",2304400,10,10767.9
"""CE""",2303709,10,553.7


In [101]:
# gdf
# gdf.filter(pl.col('uf')=='CE',pl.col('year')==2021).sort('casos').sort('casos_percent_uf').with_columns(
# gdf.filter(pl.col('uf')=='CE',pl.col('year')==2021).with_columns(
#     pl.col('casos_percent_uf').sort()
#         # .cum_sum()
#         # .over(['uf','year'])
#         .alias('test')
# )

year,geocode,uf,casos,casos_uf,log_casos,casos_percent_uf,rank_within_uf_year,cumulative_share
i64,i64,str,i64,i64,f64,f64,u32,f64
2021,2312601,"""CE""",0,35555,0.0,0.0,177,0.0
2021,2304806,"""CE""",0,35555,0.0,0.0,169,0.0
2021,2313252,"""CE""",0,35555,0.0,0.0,171,0.0
2021,2300606,"""CE""",0,35555,0.0,0.0,170,0.0
2021,2305704,"""CE""",0,35555,0.0,0.0,181,0.0
…,…,…,…,…,…,…,…,…
2021,2307700,"""CE""",1089,35555,6.993933,0.030629,5,0.448938
2021,2312908,"""CE""",1099,35555,7.003065,0.03091,4,0.479848
2021,2307650,"""CE""",1325,35555,7.189922,0.037266,3,0.517114
2021,2311801,"""CE""",2094,35555,7.647309,0.058895,2,0.576009


In [78]:

# Filter to those that cumulatively explain <= 80% of cases
# top_municipalities = gdf.filter(pl.col("cumulative_share") <= 0.8)

year,geocode,uf,casos,casos_uf,log_casos,casos_percent_uf,rank_within_uf_year,cumulative_share
i64,i64,str,i64,i64,f64,f64,u32,f64
2024,4123956,"""PR""",122,663423,4.812184,0.000184,315,0.081379
2014,4311429,"""RS""",0,154,0.0,0.0,58,0.220779
2013,2804458,"""SE""",0,836,0.0,0.0,49,0.545455
2018,2105104,"""MA""",0,2130,0.0,0.0,103,0.351174
2011,3505708,"""SP""",510,107954,6.23637,0.004724,39,0.235526
…,…,…,…,…,…,…,…,…
2013,2111953,"""MA""",21,3562,3.091042,0.005896,30,1.0
2013,2307635,"""CE""",64,30338,4.174387,0.00211,74,1.0
2017,4305959,"""RS""",0,170,0.0,0.0,496,1.0
2014,3506805,"""SP""",49,229145,3.912023,0.000214,206,1.0


year,geocode,uf,casos,casos_uf,log_casos,casos_percent_uf,rank_within_uf_year,cumulative_share
i64,i64,str,i64,i64,f64,f64,u32,f64
2021,2300606,"""CE""",0,35555,0.0,0.0,168,0.627085
2021,2301703,"""CE""",0,35555,0.0,0.0,169,0.756349
2021,2304806,"""CE""",0,35555,0.0,0.0,170,0.8162
2021,2301307,"""CE""",0,35555,0.0,0.0,171,0.87847
2021,2300408,"""CE""",0,35555,0.0,0.0,172,0.972296
…,…,…,…,…,…,…,…,…
2021,2307700,"""CE""",1089,35555,6.993933,0.030629,5,1.0
2021,2312908,"""CE""",1099,35555,7.003065,0.03091,4,0.919392
2021,2307650,"""CE""",1325,35555,7.189922,0.037266,3,0.605316
2021,2311801,"""CE""",2094,35555,7.647309,0.058895,2,0.971003


In [70]:
# data['geocode'].unique()

In [56]:
(
    ggplot(data=gdf.filter(pl.col('uf')=='CE'))
    + geom_histogram(
        mapping=aes(x='percent_uf')
    )
    + facet_grid(y='year')
)

In [50]:
(
    ggplot(data=gdf.filter(pl.col('uf')=='CE'))
    + geom_point(
        mapping=aes(x='geocode', y='log_casos'),
    )
    + facet_grid(y='year')
)