In [1]:
import pickle
import pandas as pd
import altair as alt
from datetime import datetime

In [2]:
def load_nn(city, label, ini_date = '2021-12-26', end_date= '2023-07-02', doenca = 'dengue'):
    
    data_nn = pickle.load(open(f'./models/neuralnetworks/predictions/lstm_{city}_{doenca}_{label}.pkl', 'rb'))
    
    
    df_nn = pd.DataFrame()

    ini_index = data_nn['indice'].index(datetime.strptime(ini_date, '%Y-%m-%d'))
    end_index = data_nn['indice'].index(datetime.strptime(end_date, '%Y-%m-%d')) + 1 

    df_nn['dates'] = data_nn['indice'][ini_index:end_index]
    df_nn['target'] = data_nn['target'][ini_index - 7: end_index - 7, -1] * data_nn['factor']
    df_nn['preds']  = (data_nn['pred'].iloc[ini_index - 7: end_index - 7,-1] * data_nn['factor']).values
    df_nn['lb']  = (data_nn['lb'].iloc[ini_index - 7: end_index - 7,-1] * data_nn['factor']).values
    df_nn['ub']  = (data_nn['ub'].iloc[ini_index - 7: end_index - 7,-1] * data_nn['factor']).values
    
    df_nn.set_index('dates', inplace = True)
    
    df_nn.index = pd.to_datetime(df_nn.index)

    return df_nn 

In [3]:
def load_ml(city, label,  ini_date = '2021-12-26', end_date= '2023-07-02', doenca = 'dengue'):

    data_ml = pickle.load(open(f'./models/gbt/predictions/rf_{city}_{doenca}_{label}_predictions.pkl', 'rb'))
    
    df_ml = pd.DataFrame()

    df_ml['dates'] = data_ml['dates']
    df_ml['target'] = data_ml['target']
    df_ml['preds'] = data_ml['preds']
    df_ml['lb'] = data_ml['preds25']
    df_ml['ub'] = data_ml['preds975']

    df_ml.dates = pd.to_datetime(df_ml.dates)

    df_ml = df_ml.loc[(df_ml.dates >= ini_date) & (df_ml.dates <= end_date)]
    
    df_ml.set_index('dates', inplace = True)
    
    return df_ml


In [4]:
def join_preds(city):
    
    df_ml_single = load_ml(city, label = 'single')
    
    df_nn_single = load_nn(city, label = 'single')

    df_single = df_ml_single.join(df_nn_single, how = 'outer', rsuffix='_nn')
    
    df_ml_cluster = load_ml(city, label = 'cluster')
    
    df_nn_cluster = load_nn(city, label = 'cluster')

    df_cluster = df_ml_cluster.join(df_nn_cluster, how = 'outer', rsuffix='_nn')
    
    df_end = df_single.join(df_cluster, how = 'outer', rsuffix='_cluster')
    
    return df_end

In [5]:
df_e = join_preds(2304400)
df_e = df_e.reset_index()
df_e.head()

Unnamed: 0,dates,target,preds,lb,ub,target_nn,preds_nn,lb_nn,ub_nn,target_cluster,preds_cluster,lb_cluster,ub_cluster,target_nn_cluster,preds_nn_cluster,lb_nn_cluster,ub_nn_cluster
0,2021-12-26,54.0,224.03,0.0,657.151984,54.0,273.031378,119.709463,455.103028,54.0,174.71,0.0,483.73042,54.0,212.753681,64.833293,331.897705
1,2022-01-02,212.0,311.15,0.0,926.336532,212.0,388.237005,211.350268,684.223565,212.0,484.22,56.638172,911.801828,212.0,232.746914,43.723487,365.530858
2,2022-01-09,236.0,989.63,0.0,2139.728589,236.0,591.635949,269.032851,959.501081,236.0,711.27,334.355797,1088.184203,236.0,270.097937,90.195155,404.365923
3,2022-01-16,174.0,1550.55,104.709219,2996.390781,174.0,409.682168,214.666462,694.70395,174.0,574.15,206.383236,941.916764,174.0,270.456914,97.824572,384.178081
4,2022-01-23,115.0,368.89,0.0,2020.111839,115.0,307.878775,156.810339,486.818191,115.0,374.78,0.0,859.012449,115.0,252.741518,121.959201,394.65667


In [6]:
df_plot = pd.melt(df_e, id_vars=['dates'], value_vars = ['preds', 'preds_nn', 'preds_cluster', 'preds_nn_cluster'], 
       var_name = 'model', value_name = 'predictions')

df_plot['model'] = df_plot['model'].replace(
{'preds': 'RF', 'preds_nn': 'DL', 'preds_cluster': 'RF - cluster', 'preds_nn_cluster': 'DL - cluster'})


In [7]:
df_lower = pd.melt(df_e, id_vars=['dates'], value_vars = df_e.columns[df_e.columns.str.startswith('lb')], 
       var_name = 'model', value_name = 'lower')

df_lower['model'] = df_lower['model'].replace(
{'lb': 'RF', 'lb_nn': 'DL', 'lb_cluster': 'RF - cluster', 'lb_nn_cluster': 'DL - cluster'})

In [8]:
df_upper = pd.melt(df_e, id_vars=['dates'], value_vars = df_e.columns[df_e.columns.str.startswith('ub')], 
       var_name = 'model', value_name = 'upper')

df_upper['model'] = df_upper['model'].replace(
{'ub': 'RF', 'ub_nn': 'DL', 'ub_cluster': 'RF - cluster', 'ub_nn_cluster': 'DL - cluster'})

In [9]:
df_for = (df_plot.merge(df_lower, left_on = ['dates', 'model'], right_on = ['dates', 'model'])).merge(df_upper, left_on = ['dates', 'model'], right_on = ['dates', 'model'])

df_for

Unnamed: 0,dates,model,predictions,lower,upper
0,2021-12-26,RF,224.030000,0.000000,657.151984
1,2022-01-02,RF,311.150000,0.000000,926.336532
2,2022-01-09,RF,989.630000,0.000000,2139.728589
3,2022-01-16,RF,1550.550000,104.709219,2996.390781
4,2022-01-23,RF,368.890000,0.000000,2020.111839
...,...,...,...,...,...
315,2023-06-04,DL - cluster,304.602886,158.264750,426.872812
316,2023-06-11,DL - cluster,346.068766,160.559812,536.159720
317,2023-06-18,DL - cluster,343.693966,191.552979,543.737511
318,2023-06-25,DL - cluster,363.179827,180.028784,503.317822


In [10]:
df = df_e[['dates', 'target']]
df['legend'] = 'Data'
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['legend'] = 'Data'


Unnamed: 0,dates,target,legend
0,2021-12-26,54.0,Data
1,2022-01-02,212.0,Data
2,2022-01-09,236.0,Data
3,2022-01-16,174.0,Data
4,2022-01-23,115.0,Data


In [12]:
# here is loaded the element that allows the selection by the mouse
highlight = alt.selection_point(on='mouseover', value = 'DL', fields=['model'], nearest=True)

width = 375 # width of the plots

# here is loaded the data points (black)
data = alt.Chart(df).mark_circle(size = 60).encode(
    x='dates:T',
    y='target:Q',
    color=alt.value('black'),
    opacity=alt.Opacity('legend', legend=alt.Legend(title=None)),

    #size = alt.value(3)
    tooltip = 'target:Q'
).properties(
    width=width
)


# here is created the base element for the time series 
base = alt.Chart(df_for, title="Forecast of dengue new cases").encode(
   x=alt.X('dates:T').title('Date'),
    y=alt.Y('predictions:Q').title('New cases'),
    color='model:N'
).add_params(
    highlight
).properties(
    width=width
)

points = base.mark_circle().encode(
    opacity=alt.value(0)
).add_params(
    highlight
)


# here we create the multine plot and use the alt.condition to highlight only one curve
lines = base.mark_line().encode(
    #size=alt.condition(~highlight, alt.value(1), alt.value(3))
    color=alt.condition(highlight, alt.Color('model:N'), alt.value('lightgray')),
    tooltip = ['model:N', 'predictions']
    
)

# here we define the plot of the right figure
timeseries = base.mark_line().encode(
    color=alt.Color('model:N')
).transform_filter(
    highlight # this function transform filter will just filter the element 
    #in hightlight from the column model N of the df_for (defined in the base element)
)

# here we create the area that represent the confidence interval of the predicitions
timeseries_conf = base.mark_area(
    opacity=0.5
).encode(
    x='dates:T',
    y='lower:Q',
    y2='upper:Q'
).transform_filter(
    highlight
)

# here we concatenate the layers, the + put one layer above the other
# the | put them syde by syde (as columns), and & put them side by side as lines
final = points + lines + data | timeseries + timeseries_conf + data

final

In [13]:
df_for

Unnamed: 0,dates,model,predictions,lower,upper
0,2021-12-26,RF,224.030000,0.000000,657.151984
1,2022-01-02,RF,311.150000,0.000000,926.336532
2,2022-01-09,RF,989.630000,0.000000,2139.728589
3,2022-01-16,RF,1550.550000,104.709219,2996.390781
4,2022-01-23,RF,368.890000,0.000000,2020.111839
...,...,...,...,...,...
315,2023-06-04,DL - cluster,304.602886,158.264750,426.872812
316,2023-06-11,DL - cluster,346.068766,160.559812,536.159720
317,2023-06-18,DL - cluster,343.693966,191.552979,543.737511
318,2023-06-25,DL - cluster,363.179827,180.028784,503.317822


In [35]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_log_error as msle

In [39]:
records = []

for model in ['RF', 'RF - cluster', 'DL', 'DL - cluster']:
    
    records.append([model, mse(df.target.values, df_for.loc[df_for.model == model]["predictions"].values ), 'MSE'])
    records.append([model, mse(df.target.values, df_for.loc[df_for.model == model]["predictions"].values, squared = False ), 'RMSE'])
    records.append([model, msle(df.target.values, df_for.loc[df_for.model == model]["predictions"].values ), 'MSLE'])
    records.append([model, mae(df.target.values, df_for.loc[df_for.model == model]["predictions"].values ), 'MAE'])
    

df_erro = pd.DataFrame(records
, 
    columns=['model', 'erro', 'metric'])


In [41]:
df_erro.head()

Unnamed: 0,model,erro,metric
0,RF,228670.925613,MSE
1,RF,478.195489,RMSE
2,RF,0.488052,MSLE
3,RF,319.6305,MAE
4,RF - cluster,153889.100632,MSE


In [43]:
input_dropdown = alt.binding_select(options=['MAE','MSE', 'RMSE', 'MSLE'], name='Error metric: ')
selection = alt.selection_point(fields=['metric'], bind=input_dropdown)


alt.Chart(df_erro).mark_bar().encode(
    x=alt.X('erro:Q').title('Erro'),
    y=alt.Y('model:N').title('Model'),
    color='model:N'
).add_params(
    selection
).transform_filter(
    selection
).properties(
    width=300,
    height = 150
)

