In [6]:
import numpy as np
import pandas as pd
from epiweeks import Week 
from scipy.stats import boxcox
import matplotlib.pyplot as plt 

Nesse notebook são organizados os datasets onde cada coluna representa os casos em uma regional de saúde com uma coluna com os casos do estado que será usada como target no algoritmo de regressão:

In [7]:
def transform_epiweek_label(ep_label):
  '''
  Function to change the epiweeks 41-40 into 1-52.
  '''
  year_ep_label = int(ep_label[:4])
  week_ep_label = int(ep_label[4:])

  if week_ep_label <=40:
    new_week = 12 + week_ep_label
    new_year = year_ep_label

  else:
    new_week = week_ep_label - 40
    new_year = year_ep_label +1

  if new_week <=9:
    week_str = f'0{new_week}'
  else:
    week_str = str(new_week)

  new_ep_label = f'{new_year}{week_str}'

  return new_ep_label

def add_epiweek_label(df_w):
    '''
    This function assumes that the dataframe has a datetime index
    and add the epiweek and year value
    '''

    df_w['epiweek_label'] = [Week.fromdate(x) for x in df_w.index]

    df_w['epiweek_label'] = df_w['epiweek_label'].astype(str)

    df_w = df_w.loc[df_w.epiweek_label.str[-2:].astype(int) != 53]

    df_w['epiweek_label'] = df_w['epiweek_label'].apply(transform_epiweek_label)

    df_w['epiweek'] = df_w['epiweek_label'].astype(str).str[-2:].astype(int)
    df_w['year'] = df_w['epiweek_label'].astype(str).str[:4].astype(int)

    return df_w


In [8]:
def aggregate_data(df, geocode = None, column = 'geocode'):
  '''
  Função para agregar os dados a partir de um geocode específico, se o geocode não
  é fornecido os dados são agregados para todo o estado.
  '''

  if geocode is not None:

    df = df.loc[df[column] == geocode]

  df_w = df[['casos']]

  df_w = df_w.resample('W-SUN').sum()

  #df_w = add_epiweek_label(df_w)

  return df_w

In [9]:
enso = pd.read_csv('enso.csv.gz')

enso.date = pd.to_datetime(enso.date)

enso.set_index('date', inplace = True)

enso = enso.loc[enso.index.year >= 2010]

enso = enso.resample('W-SUN').mean()  # Resample to monthly frequency and calculate mean

enso = enso.ffill()

enso.enso = enso.enso + 2

enso

Unnamed: 0_level_0,enso
date,Unnamed: 1_level_1
2010-01-10,3.230226
2010-01-17,3.070318
2010-01-24,2.957734
2010-01-31,3.210819
2010-02-07,3.210819
...,...
2024-06-02,2.130621
2024-06-09,2.245672
2024-06-16,2.238217
2024-06-23,2.203047


In [10]:
df_all = pd.read_csv('./dengue_update2.csv.gz', index_col ='Unnamed: 0')

df_all.date = pd.to_datetime(df_all.date)

df_all.set_index('date', inplace = True)

In [11]:
df_all.index.max()

Timestamp('2024-09-15 00:00:00')

In [12]:
code_to_state = {33: 'RJ', 32: 'ES', 41: 'PR', 23: 'CE', 21: 'MA',
 31: 'MG', 42: 'SC', 26: 'PE', 25: 'PB', 24: 'RN', 22: 'PI', 27: 'AL',
 28: 'SE', 35: 'SP', 43: 'RS', 15: 'PA', 16: 'AP', 14: 'RR',  11: 'RO',
 13: 'AM', 12: 'AC', 51: 'MT', 50: 'MS', 52: 'GO', 17: 'TO', 53: 'DF',
 29: 'BA'}

In [13]:

for state in code_to_state.values():
    df = df_all.loc[df_all.uf == state]
    
    df_reg = df[['regional_geocode', 'casos']].groupby('regional_geocode').resample('W-SUN').sum().drop(['regional_geocode'], axis =1).reset_index()
    
    df_reg = df_reg.pivot(index = 'date', columns = 'regional_geocode', values = 'casos')
    df_reg.columns = 'casos_' + df_reg.columns.astype(str)
    
    df_state = df[['casos']].resample('W-SUN').sum()
    #df_state = df_state.rename(columns  = {'casos': f'casos_{state}'})
    
    df_reg = df_reg.merge(df_state, left_index = True, right_index = True)
    
    for col in df_reg.columns: 
     
        df_reg[col] = boxcox(df_reg[col]+1, lmbda = 0.05) 
    
    df_reg =  add_epiweek_label(df_reg)
    #df_reg = df_reg.merge(enso, left_index = True, right_index = True)
    del df_reg['epiweek_label']
    
    df_reg.to_csv(f'dengue_{state}.csv.gz')

df_reg.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_w['epiweek_label'] = df_w['epiweek_label'].apply(transform_epiweek_label)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_w['epiweek'] = df_w['epiweek_label'].astype(str).str[-2:].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_w['year'] = df_w['epiweek_label'].astype(str).str[:4

Unnamed: 0_level_0,casos_29001,casos_29002,casos_29003,casos_29004,casos_29005,casos_29006,casos_29007,casos_29008,casos_29009,casos_29010,...,casos_29022,casos_29023,casos_29024,casos_29025,casos_29026,casos_29027,casos_29028,casos,epiweek,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-03,0.0,2.322463,4.85161,3.342702,0.0,4.480695,1.435469,0.705298,3.444477,5.943799,...,0.0,1.435469,3.043712,4.437014,2.547564,0.705298,2.191389,7.773044,13,2010
2010-01-10,1.435469,1.675968,5.559166,3.043712,1.129346,3.924624,2.645873,0.0,3.342702,6.196882,...,0.0,0.0,3.492379,4.245709,2.736686,0.0,2.043721,7.917971,14,2010
2010-01-17,1.675968,2.043721,5.178508,2.645873,0.705298,4.768324,2.440369,0.705298,3.582953,6.111301,...,1.435469,2.043721,3.109663,4.165881,3.492379,0.705298,2.322463,8.010533,15,2010
2010-01-24,1.435469,1.874471,4.974738,3.538493,1.435469,4.459039,3.28847,0.0,3.342702,5.965015,...,0.705298,1.675968,3.667364,5.018089,3.957422,1.874471,1.675968,8.09449,16,2010
2010-01-31,1.874471,2.899951,5.032227,3.043712,0.0,4.802279,4.522943,0.0,2.547564,6.092248,...,0.0,2.821091,3.989388,5.366053,4.697639,1.435469,2.547564,8.408747,17,2010


In [15]:
df_reg.index.max()

Timestamp('2024-09-15 00:00:00')