In [1]:
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt 

Load the nn predictions: 

In [2]:
def load_nn(city, label, ini_date = '2021-12-26', end_date= '2023-07-02', doenca = 'dengue'):
    
    data_nn = pickle.load(open(f'./models/neuralnetworks/predictions/lstm_{city}_{doenca}_{label}.pkl', 'rb'))
    
    
    df_nn = pd.DataFrame()

    ini_index = data_nn['indice'].index(datetime.strptime(ini_date, '%Y-%m-%d'))
    end_index = data_nn['indice'].index(datetime.strptime(end_date, '%Y-%m-%d')) + 1 

    df_nn['dates'] = data_nn['indice'][ini_index:end_index]
    df_nn['target'] = data_nn['target'][ini_index - 7: end_index - 7, -1] * data_nn['factor']
    df_nn['preds']  = (data_nn['pred'].iloc[ini_index - 7: end_index - 7,-1] * data_nn['factor']).values
    df_nn['lb']  = (data_nn['lb'].iloc[ini_index - 7: end_index - 7,-1] * data_nn['factor']).values
    df_nn['ub']  = (data_nn['ub'].iloc[ini_index - 7: end_index - 7,-1] * data_nn['factor']).values
    
    df_nn.set_index('dates', inplace = True)
    
    df_nn.index = pd.to_datetime(df_nn.index)

    return df_nn 

In [3]:
city = 2304400
label = 'single'

df_nn = load_nn(city, label)

df_nn.head()

Unnamed: 0_level_0,target,preds,lb,ub
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-12-26,54.0,273.031378,119.709463,455.103028
2022-01-02,212.0,388.237005,211.350268,684.223565
2022-01-09,236.0,591.635949,269.032851,959.501081
2022-01-16,174.0,409.682168,214.666462,694.70395
2022-01-23,115.0,307.878775,156.810339,486.818191


In [4]:
df_nn.tail()

Unnamed: 0_level_0,target,preds,lb,ub
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-06-04,313.0,328.943503,179.992313,476.149246
2023-06-11,263.0,255.044869,145.809145,402.894541
2023-06-18,284.0,298.169323,169.755065,471.05398
2023-06-25,230.0,324.899623,150.661489,554.880869
2023-07-02,236.0,309.697643,164.917636,548.461492


Load the random forest predictions:

In [5]:
def load_ml(city, label,  ini_date = '2021-12-26', end_date= '2023-07-02', doenca = 'dengue'):

    data_ml = pickle.load(open(f'./models/gbt/predictions/rf_{city}_{doenca}_{label}_predictions.pkl', 'rb'))
    
    df_ml = pd.DataFrame()

    df_ml['dates'] = data_ml['dates']
    df_ml['target'] = data_ml['target']
    df_ml['preds'] = data_ml['preds']
    df_ml['lb'] = data_ml['preds25']
    df_ml['ub'] = data_ml['preds975']

    df_ml.dates = pd.to_datetime(df_ml.dates)

    df_ml = df_ml.loc[(df_ml.dates >= ini_date) & (df_ml.dates <= end_date)]
    
    df_ml.set_index('dates', inplace = True)
    
    return df_ml


In [8]:
def join_preds(city):
    
    df_ml_single = load_ml(city, label = 'single')
    
    df_nn_single = load_nn(city, label = 'single')

    df_single = df_ml_single.join(df_nn_single, how = 'outer', rsuffix='_nn')
    
    df_ml_cluster = load_ml(city, label = 'cluster')
    
    df_nn_cluster = load_nn(city, label = 'cluster')

    df_cluster = df_ml_cluster.join(df_nn_cluster, how = 'outer', rsuffix='_nn')
    
    df_end = df_single.join(df_cluster, how = 'outer', rsuffix='_cluster')
    
    return df_end
    

In [36]:
df_e = join_preds(2304400)
df_e = df_e.reset_index()
df_e.head()

Unnamed: 0,dates,target,preds,lb,ub,target_nn,preds_nn,lb_nn,ub_nn,target_cluster,preds_cluster,lb_cluster,ub_cluster,target_nn_cluster,preds_nn_cluster,lb_nn_cluster,ub_nn_cluster
0,2021-12-26,54.0,224.03,0.0,657.151984,54.0,273.031378,119.709463,455.103028,54.0,174.71,0.0,483.73042,54.0,212.753681,64.833293,331.897705
1,2022-01-02,212.0,311.15,0.0,926.336532,212.0,388.237005,211.350268,684.223565,212.0,484.22,56.638172,911.801828,212.0,232.746914,43.723487,365.530858
2,2022-01-09,236.0,989.63,0.0,2139.728589,236.0,591.635949,269.032851,959.501081,236.0,711.27,334.355797,1088.184203,236.0,270.097937,90.195155,404.365923
3,2022-01-16,174.0,1550.55,104.709219,2996.390781,174.0,409.682168,214.666462,694.70395,174.0,574.15,206.383236,941.916764,174.0,270.456914,97.824572,384.178081
4,2022-01-23,115.0,368.89,0.0,2020.111839,115.0,307.878775,156.810339,486.818191,115.0,374.78,0.0,859.012449,115.0,252.741518,121.959201,394.65667


In [43]:
alt.Chart(df_e).mark_line().encode(
    x='dates:T',
    y='target:Q',
    color=alt.value('black'),
    #tooltip = alt.value('Data: ') + 'target:Q'
)

In [12]:
import altair as alt
from vega_datasets import data

cars = data.cars.url

brush = alt.selection_interval()

alt.Chart(cars).mark_point().encode(
    x='Horsepower:Q',
    y='Miles_per_Gallon:Q',
    color=alt.condition(brush, 'Origin:N', alt.value('lightgray'))
).add_params(
    brush
)

In [23]:
import altair as alt
from vega_datasets import data

source = data.stocks()

highlight = alt.selection_point(on='mouseover', fields=['symbol'], nearest=True)

base = alt.Chart(source).encode(
    x='date:T',
    y='price:Q',
    color='symbol:N'
)

points = base.mark_circle().encode(
    opacity=alt.value(0)
).add_params(
    highlight
).properties(
    width=600
)

lines = base.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1), alt.value(3)), 
    #color=alt.condition(~highlight, alt.value("gray"), alt.value("#black")), 
)

points + lines

In [24]:
import altair as alt
import pandas as pd
import numpy as np

np.random.seed(0)

n_objects = 20
n_times = 50

# Create one (x, y) pair of metadata per object
locations = pd.DataFrame({
    'id': range(n_objects),
    'x': np.random.randn(n_objects),
    'y': np.random.randn(n_objects)
})

# Create a 50-element time-series for each object
timeseries = pd.DataFrame(np.random.randn(n_times, n_objects).cumsum(0),
                          columns=locations['id'],
                          index=pd.RangeIndex(0, n_times, name='time'))

# Melt the wide-form timeseries into a long-form view
timeseries = timeseries.reset_index().melt('time')

# Merge the (x, y) metadata into the long-form view
timeseries['id'] = timeseries['id'].astype(int)  # make merge not complain
data = pd.merge(timeseries, locations, on='id')

# Data is prepared, now make a chart

selector = alt.selection_point(fields=['id'])

base = alt.Chart(data).properties(
    width=250,
    height=250
).add_params(selector)

points = base.mark_point(filled=True, size=200).encode(
    x='mean(x)',
    y='mean(y)',
    color=alt.condition(selector, 'id:O', alt.value('lightgray'), legend=None),
)

timeseries = base.mark_line().encode(
    x='time',
    y=alt.Y('value').scale(domain=(-15, 15)),
    color=alt.Color('id:O').legend(None)
).transform_filter(
    selector
)

points | timeseries

In [29]:
import altair as alt
from vega_datasets import data

source = data.stocks()

alt.Chart(source).mark_line().encode(
    x='date:T',
    y='price:Q',
    color='symbol:N',
    tooltip = 'price:Q'
)

In [30]:
df_end

NameError: name 'df_end' is not defined

In [None]:
alt.Chart(source).mark_line().encode(
    x='date:T',
    y='price:Q',
    color='symbol:N',
    tooltip = 'price:Q'
)

In [27]:
source

Unnamed: 0,symbol,date,price
0,MSFT,2000-01-01,39.81
1,MSFT,2000-02-01,36.35
2,MSFT,2000-03-01,43.22
3,MSFT,2000-04-01,28.37
4,MSFT,2000-05-01,25.45
...,...,...,...
555,AAPL,2009-11-01,199.91
556,AAPL,2009-12-01,210.73
557,AAPL,2010-01-01,192.06
558,AAPL,2010-02-01,204.62


In [44]:
df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
                   'B': {0: 1, 1: 3, 2: 5},
                   'C': {0: 2, 1: 4, 2: 6}})

df.head()

Unnamed: 0,A,B,C
0,a,1,2
1,b,3,4
2,c,5,6


In [46]:
pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5
3,a,C,2
4,b,C,4
5,c,C,6


In [50]:
df_e.columns[df_e.columns.str.startswith('preds')]

Index(['preds', 'preds_nn', 'preds_cluster', 'preds_nn_cluster'], dtype='object')

In [61]:
df_plot = pd.melt(df_e, id_vars=['dates'], value_vars = ['preds', 'preds_nn', 'preds_cluster', 'preds_nn_cluster'], 
       var_name = 'model', value_name = 'predictions')

df_plot['model'] = df_plot['model'].replace(
{'preds': 'RF', 'preds_nn': 'DL', 'preds_cluster': 'RF - cluster', 'preds_nn_cluster': 'DL - cluster'})

df_plot

Unnamed: 0,dates,model,predictions
0,2021-12-26,RF,224.030000
1,2022-01-02,RF,311.150000
2,2022-01-09,RF,989.630000
3,2022-01-16,RF,1550.550000
4,2022-01-23,RF,368.890000
...,...,...,...
315,2023-06-04,DL - cluster,304.602886
316,2023-06-11,DL - cluster,346.068766
317,2023-06-18,DL - cluster,343.693966
318,2023-06-25,DL - cluster,363.179827


In [66]:
base = alt.Chart(df_e).mark_line().encode(
    x='dates:T',
    y='target:Q',
    color=alt.value('black'),
    size = alt.value(3)
    #tooltip = alt.value('Data: ') + 'target:Q'
)


multilines = alt.Chart(df_plot).mark_line().encode(
    x='dates:T',
    y='predictions:Q',
    color='model:N',
    #tooltip = 'price:Q'
)

base + multilines

In [71]:
highlight = alt.selection_point( on='mouseover',
                          fields=['model'], nearest=True, empty="none")

alt.Chart(df_plot).mark_line().encode(
    x='dates:T',
    y='predictions:Q',
    color=alt.condition(highlight, 'model:N', alt.value("lightgray")),
    tooltip=["model:N", "predictions"]
).add_selection(
    highlight
)


In [120]:
highlight = alt.selection_point(on='mouseover', fields=['model'], nearest=True)

data = alt.Chart(df_e).mark_circle(size = 60).encode(
    x='dates:T',
    y='target:Q',
    color=alt.value('black'),
    #size = alt.value(3)
    tooltip = 'target:Q'
).properties(
    width=500
)


base = alt.Chart(df_plot).encode(
   x='dates:T',
    y='predictions:Q',
    color='model:N'
).add_params(
    highlight
).properties(
    width=500
)

points = base.mark_circle().encode(
    opacity=alt.value(0)
).add_params(
    highlight
).properties(
    width=500
)

lines = base.mark_line().encode(
    #size=alt.condition(~highlight, alt.value(1), alt.value(3))
    color=alt.condition(highlight, alt.Color('model:N'), alt.value('lightgray')),
    tooltip = ['model:N', 'predictions']
    
)


timeseries = base.mark_line().encode(
    color=alt.Color('model:N').legend(None)
).transform_filter(
    highlight
)


points + lines + data 

In [104]:
df_plot = pd.melt(df_e, id_vars=['dates'], value_vars = ['preds', 'preds_nn', 'preds_cluster', 'preds_nn_cluster'], 
       var_name = 'model', value_name = 'predictions')

df_plot['model'] = df_plot['model'].replace(
{'preds': 'RF', 'preds_nn': 'DL', 'preds_cluster': 'RF - cluster', 'preds_nn_cluster': 'DL - cluster'})

df_plot

Unnamed: 0,dates,model,predictions
0,2021-12-26,RF,224.030000
1,2022-01-02,RF,311.150000
2,2022-01-09,RF,989.630000
3,2022-01-16,RF,1550.550000
4,2022-01-23,RF,368.890000
...,...,...,...
315,2023-06-04,DL - cluster,304.602886
316,2023-06-11,DL - cluster,346.068766
317,2023-06-18,DL - cluster,343.693966
318,2023-06-25,DL - cluster,363.179827


In [106]:
df_e.columns[df_e.columns.str.startswith('lb')]

Index(['lb', 'lb_nn', 'lb_cluster', 'lb_nn_cluster'], dtype='object')

In [108]:
df_lower = pd.melt(df_e, id_vars=['dates'], value_vars = df_e.columns[df_e.columns.str.startswith('lb')], 
       var_name = 'model', value_name = 'lower')

df_lower['model'] = df_lower['model'].replace(
{'lb': 'RF', 'lb_nn': 'DL', 'lb_cluster': 'RF - cluster', 'lb_nn_cluster': 'DL - cluster'})

df_lower

Unnamed: 0,dates,model,lower
0,2021-12-26,RF,0.000000
1,2022-01-02,RF,0.000000
2,2022-01-09,RF,0.000000
3,2022-01-16,RF,104.709219
4,2022-01-23,RF,0.000000
...,...,...,...
315,2023-06-04,DL - cluster,158.264750
316,2023-06-11,DL - cluster,160.559812
317,2023-06-18,DL - cluster,191.552979
318,2023-06-25,DL - cluster,180.028784


In [110]:
df_upper = pd.melt(df_e, id_vars=['dates'], value_vars = df_e.columns[df_e.columns.str.startswith('ub')], 
       var_name = 'model', value_name = 'upper')

df_upper['model'] = df_upper['model'].replace(
{'ub': 'RF', 'ub_nn': 'DL', 'ub_cluster': 'RF - cluster', 'ub_nn_cluster': 'DL - cluster'})

df_upper

Unnamed: 0,dates,model,upper
0,2021-12-26,RF,657.151984
1,2022-01-02,RF,926.336532
2,2022-01-09,RF,2139.728589
3,2022-01-16,RF,2996.390781
4,2022-01-23,RF,2020.111839
...,...,...,...
315,2023-06-04,DL - cluster,426.872812
316,2023-06-11,DL - cluster,536.159720
317,2023-06-18,DL - cluster,543.737511
318,2023-06-25,DL - cluster,503.317822


In [114]:
df_plot_end = (df_plot.merge(df_lower, left_on = ['dates', 'model'], right_on = ['dates', 'model'])).merge(df_upper, left_on = ['dates', 'model'], right_on = ['dates', 'model'])

df_plot_end

Unnamed: 0,dates,model,predictions,lower,upper
0,2021-12-26,RF,224.030000,0.000000,657.151984
1,2022-01-02,RF,311.150000,0.000000,926.336532
2,2022-01-09,RF,989.630000,0.000000,2139.728589
3,2022-01-16,RF,1550.550000,104.709219,2996.390781
4,2022-01-23,RF,368.890000,0.000000,2020.111839
...,...,...,...,...,...
315,2023-06-04,DL - cluster,304.602886,158.264750,426.872812
316,2023-06-11,DL - cluster,346.068766,160.559812,536.159720
317,2023-06-18,DL - cluster,343.693966,191.552979,543.737511
318,2023-06-25,DL - cluster,363.179827,180.028784,503.317822


In [127]:
highlight = alt.selection_point(on='mouseover', fields=['model'], nearest=True)

data = alt.Chart(df_e).mark_circle(size = 60).encode(
    x='dates:T',
    y='target:Q',
    color=alt.value('black'),

    #size = alt.value(3)
    tooltip = 'target:Q'
).properties(
    width=400
)


base = alt.Chart(df_plot_end, title="Forecast of dengue new cases").encode(
   x=alt.X('dates:T').title('Dates'),
    y=alt.Y('predictions:Q').title('New cases'),
    color='model:N'
).add_params(
    highlight
).properties(
    width=400
)

points = base.mark_circle().encode(
    opacity=alt.value(0)
).add_params(
    highlight
).properties(
    width=400
)

lines = base.mark_line().encode(
    #size=alt.condition(~highlight, alt.value(1), alt.value(3))
    color=alt.condition(highlight, alt.Color('model:N'), alt.value('lightgray')),
    tooltip = ['model:N', 'predictions']
    
)


timeseries = base.mark_line().encode(
    color=alt.Color('model:N')
).transform_filter(
    highlight
)

timeseries_conf = base.mark_area(
    opacity=0.5
).encode(
    x='dates:T',
    y='lower:Q',
    y2='upper:Q'
).transform_filter(
    highlight
)


final = points + lines + data | timeseries + timeseries_conf + data

final

final.save('forecast_dengue.html')

In [147]:

highlight = alt.selection_point(on='mouseover', fields=['model'], nearest=True)

data = alt.Chart(df_e).mark_circle(size = 60).encode(
    x='dates:T',
    y='target:Q',
    color=alt.value('black'),

    #size = alt.value(3)
    tooltip = 'target:Q'
).properties(
    width=400
)


base = alt.Chart(df_plot_end, title="Forecast of dengue new cases").encode(
   x=alt.X('dates:T').title('Dates'),
    y=alt.Y('predictions:Q').title('New cases'),
    color='model:N'
).add_params(
    highlight
).properties(
    width=400
)

points = base.mark_circle().encode(
    opacity=alt.value(0)
).add_params(
    highlight
).properties(
    width=400
)

lines = base.mark_line().encode(
    #size=alt.condition(~highlight, alt.value(1), alt.value(3))
    color=alt.condition(highlight, alt.Color('model:N'), alt.value('lightgray')),
    tooltip = ['model:N', 'predictions']
    
)

texts = alt.Chart().mark_text(dy=-160, size=15).encode(
    text='model:N'
).transform_filter(
    highlight
)

timeseries = base.mark_line().encode(
    color=alt.Color('model:N')
).transform_filter(
    highlight
)

timeseries_conf = base.mark_area(
    opacity=0.5
).encode(
    x='dates:T',
    y='lower:Q',
    y2='upper:Q'
).transform_filter(
    highlight
)


final = points + lines + data | timeseries + timeseries_conf + data + texts

final

final.save('forecast_dengue.html')

In [146]:
highlight = alt.selection_point(on='mouseover', fields=['model'], nearest=True)

data = alt.Chart(df_e).mark_circle(size = 60).encode(
    x='dates:T',
    y='target:Q',
    color=alt.value('black'),

    #size = alt.value(3)
    tooltip = 'target:Q'
).properties(
    width=400
)


base = alt.Chart(df_plot_end, title="Forecast of dengue new cases").encode(
   x=alt.X('dates:T').title('Dates'),
    y=alt.Y('predictions:Q').title('New cases'),
    color='model:N'
).add_params(
    highlight
).properties(
    width=400
)

points = base.mark_circle().encode(
    opacity=alt.value(0)
).add_params(
    highlight
).properties(
    width=400
)

lines = base.mark_line().encode(
    #size=alt.condition(~highlight, alt.value(1), alt.value(3))
    color=alt.condition(highlight, alt.Color('model:N'), alt.value('lightgray')),
    tooltip = ['model:N', 'predictions']
    
)

texts = alt.Chart().mark_text(dy=-160, size=15).encode(
    text='model:N'
).transform_filter(
    highlight
)

timeseries = base.mark_line().encode(
    color=alt.Color('model:N')
).transform_filter(
    highlight
)

timeseries_conf = base.mark_area(
    opacity=0.5
).encode(
    x='dates:T',
    y='lower:Q',
    y2='upper:Q'
).transform_filter(
    highlight
)


final = points + lines + data + texts #+ timeseries_conf #+ data + texts

final
