In [1]:
from typing import Dict, List, Tuple

import arviz as az
import numpy as np
import pandas as pd
import xarray as xr
from bokeh.layouts import column
from bokeh.models import (
    Band,
    ColumnDataSource,
    DatetimeTickFormatter,
    GeoJSONDataSource,
    HoverTool,
    LabelSet,
    NumeralTickFormatter,
    Select,
    Span,
    Title,
)
from bokeh.palettes import cividis, inferno, viridis
from bokeh.plotting import figure, output_notebook, show
from scipy.special import expit as logistic

In [2]:
# output_notebook()

In [3]:
def standardize(series):
    """Standardize a pandas series"""
    return (series - series.mean()) / series.std()


def dates_to_idx(timelist):
    """Convert datetimes to numbers in reference to a given date. Useful for posterior predictions."""

    reference_time = timelist[0]
    t = (timelist - reference_time) / np.timedelta64(1, "M")

    return np.asarray(t)




def get_data_source(
    trace: az.InferenceData, post_pred_samples: xr.DataArray) -> pd.DataFrame:
    source_df = (
        post_pred_samples.stack(sample=("chain", "draw"))
        .to_pandas()
        .droplevel(0, axis=1)
    )
    
    source_df.columns = source_df.columns.astype(str)

    source_df["baseline"] = logistic(trace.predictions["baseline"]).mean().data
    source_df["baseline_lower"] = (
        logistic(az.hdi(trace.predictions)["baseline"]).sel(hdi="lower").data
    )
    source_df["baseline_upper"] = (
        logistic(az.hdi(trace.predictions)["baseline"]).sel(hdi="higher").data
    )

    source_df["median_app"] = post_pred_samples.median(dim=("chain", "draw")).data
    source_df["median_low"] = np.squeeze(
        az.hdi(post_pred_samples, hdi_prob=0.75).sel(hdi="lower").to_array().data
    )
    source_df["median_high"] = np.squeeze(
        az.hdi(post_pred_samples, hdi_prob=0.75).sel(hdi="higher").to_array().data
    )

    return source_df


def samples_subset(data_source: pd.DataFrame, frac: float = 0.1) -> Dict[str, List]:

    sub_source = data_source.filter(regex="\d", axis="columns").sample(
        frac=frac, replace=True, axis="columns"
    )

    dates = []
    draws = []
    for draw in sub_source.columns:
        dates.append(sub_source.index.values)
        draws.append(sub_source[draw].values)

    return {"dates": dates, "draws": draws}

In [17]:
pp_prop_atual = xr.open_dataset("../data/post_pred_desocupacao_positiva.nc")


pp_prop_atual.stack(sample=("chain", "draw")).to_pandas().droplevel(0, axis=1)


ValueError: cannot convert Datasets with 2 dimensions into pandas objects without changing the number of dimensions. Please use Dataset.to_dataframe() instead.

In [4]:

df = pd.read_csv("../data/complete_popularity_data.csv", index_col=0, parse_dates=True)

PREDICTION_COORDS = pd.read_csv(
    "../data/coords_desemprego_predicoes.csv", index_col=0, parse_dates=["temporal"]
)

raw_polls = pd.read_csv("../data/raw_polls.csv", index_col=0, parse_dates=True)


trace_predictions = az.from_netcdf("../data/trace_desemprego_predicoes.nc")

trace_raw_fundamental = az.from_netcdf("../data/trace_desemprego.nc")

pp_prop_atual = xr.open_dataset("../data/post_pred_desemprego_positiva.nc")
pp_prop_10 = xr.open_dataset("../data/post_pred_desemprego_positiva_10.nc")
pp_prop_19 = xr.open_dataset("../data/post_pred_desemprego_positiva_19.nc")


In [5]:
time = dates_to_idx(df.index)
time[:10]

array([0.        , 0.91993675, 1.93843816, 2.92408468, 3.94258609,
       4.92823261, 5.94673402, 6.96523543, 7.95088195, 8.96938335])

In [6]:
# change-points / Linzer model
# uncertainty in y (pollsters weights)
# economy: GRW

In [7]:
trace_raw_fundamental

In [8]:

source_annotations = ColumnDataSource(
    data=dict(
        dates=[
            pd.to_datetime("1995-01-01"),
            pd.to_datetime("1999-01-01"),
            pd.to_datetime("2003-01-01"),
            pd.to_datetime("2007-01-01"),
            pd.to_datetime("2011-01-01"),
            pd.to_datetime("2015-01-01"),
            pd.to_datetime("2019-01-01"),
            pd.to_datetime("2009-05-01"),
        ],
        ys=[0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.35],
        events=[
            "FHC I",
            "FHC II",
            "Lula I",
            "Lula II",
            "Dilma I",
            "Dilma/Temer",
            "Bolsonaro",
            "Média histórica",
        ],
    )
);


In [9]:
def make_plot(
    subtitle: str,
    palette,
    random_draws: Dict[str, List],
    data_source: pd.DataFrame,
    post_pred_samples: xr.Dataset,
):
    CDS = ColumnDataSource(data_source)

    p = figure(
        plot_width=1200,
        plot_height=425,
        sizing_mode="scale_both",
        background_fill_color="#f2f2f2",
        border_fill_color="#f2f2f2",
        x_axis_type="datetime",
        title="Evolução da popularidade dos presidentes",
        x_range=(
            pd.to_datetime("1994-01-01"),
            PREDICTION_COORDS["temporal"].iloc[-1] + pd.DateOffset(months=3),
        ),
        y_range=(0, 1),
        toolbar_location="above",
        tools="xpan, box_zoom, xwheel_zoom, reset, save",
    )
    p.xaxis[0].formatter = DatetimeTickFormatter(months="%b %Y", days="%d/%m")
    p.yaxis[0].formatter = NumeralTickFormatter(format="00%")
    p.add_layout(
        Title(
            text=f"1 trimestre de projeção, se o desemprego {subtitle}",
            align="center",
            text_font_style="italic",
            text_font_size="1.2em",
        ),
        "above",
    )
    p.title.text_font_size = "1.6em"
    p.title.align = "center"
    p.grid.grid_line_alpha = 0.5
    p.xaxis.axis_label = "Período"
    p.yaxis.axis_label = "% Popularidade"

    p.multi_line(
        xs=random_draws["dates"],
        ys=random_draws["draws"],
        color=palette[4],
        legend_label="Random samples",
    )
    p.patch(
        np.concatenate((data_source.index.values, data_source.index.values[::-1])),
        np.concatenate(
            (
                np.squeeze(az.hdi(post_pred_samples, hdi_prob=0.95).sel(hdi="lower").to_array()),
                np.squeeze(az.hdi(post_pred_samples, hdi_prob=0.95).sel(hdi="higher").to_array())[
                ::-1],
            )
        ),
        color=palette[3],
        line_alpha=0.4,
        fill_alpha=0.4,
        legend_label="95% HDI",
    )
    hdi = p.patch(
        np.concatenate((data_source.index.values, data_source.index.values[::-1])),
        np.concatenate(
            (
                np.squeeze(
                    az.hdi(post_pred_samples, hdi_prob=0.75).sel(hdi="lower").to_array()
                ),
                np.squeeze(
                    az.hdi(post_pred_samples, hdi_prob=0.75)
                    .sel(hdi="higher")
                    .to_array()
                )[::-1],
            )
        ),
        color=palette[2],
        line_alpha=0,
        fill_alpha=0.5,
        legend_label="75% HDI",
    )
    p.patch(
        np.concatenate((data_source.index.values, data_source.index.values[::-1])),
        np.concatenate(
            (
                np.squeeze(
                    az.hdi(post_pred_samples, hdi_prob=0.5).sel(hdi="lower").to_array()
                ),
                np.squeeze(
                    az.hdi(post_pred_samples, hdi_prob=0.5).sel(hdi="higher").to_array()
                )[::-1],
            )
        ),
        color=palette[1],
        line_alpha=0,
        fill_alpha=0.6,
        legend_label="50% HDI",
    )
    median_line = p.line(
        "temporal",
        "median_app",
        color=palette[0],
        line_width=2,
        legend_label="Median",
        source=CDS,
    )
    p.scatter(
        raw_polls.index.values,
        raw_polls.positiva.values,
        size=6,
        color=palette[5],
        legend_label="Pesquisas observadas",
        alpha=0.7,
    )

    labels = LabelSet(
        x="dates",
        y="ys",
        text="events",
        level="glyph",
        text_color="gray",
        text_font_style="italic",
        text_font_size="1em",
        text_align="center",
        source=source_annotations,
    )
    vline_0 = Span(
        location=source_annotations.data["dates"][0],
        dimension="height",
        line_color="gray",
        line_dash="dashed",
        line_width=1.5,
    )
    vline_1 = Span(
        location=source_annotations.data["dates"][1],
        dimension="height",
        line_color="gray",
        line_dash="dashed",
        line_width=1.5,
    )
    vline_2 = Span(
        location=source_annotations.data["dates"][2],
        dimension="height",
        line_color="gray",
        line_dash="dashed",
        line_width=1.5,
    )
    vline_3 = Span(
        location=source_annotations.data["dates"][3],
        dimension="height",
        line_color="gray",
        line_dash="dashed",
        line_width=1.5,
    )
    vline_4 = Span(
        location=source_annotations.data["dates"][4],
        dimension="height",
        line_color="gray",
        line_dash="dashed",
        line_width=1.5,
    )

    vline_5 = Span(
        location=source_annotations.data["dates"][5],
        dimension="height",
        line_color="gray",
        line_dash="dashed",
        line_width=1.5,
    )
    
    vline_6 = Span(
        location=source_annotations.data["dates"][6],
        dimension="height",
        line_color="gray",
        line_dash="dashed",
        line_width=1.5,
    )

    fifty_line = Span(
        location=0.5,
        dimension="width",
        line_color="gray",
        line_dash="dotted",
        line_width=1.5,
    )
    hist_band = Band(
        base="temporal",
        lower="baseline_lower",
        upper="baseline_upper",
        source=CDS,
        fill_color="gray",
        fill_alpha=0.2,
    )
    hist_avg_line = Span(
        location=CDS.data["baseline"][0],
        dimension="width",
        line_color="gray",
        line_dash="dashdot",
        line_width=2,
    )

    p.renderers.extend(
        [
            labels,
            vline_0,
            vline_1,
            vline_2,
            vline_3,
            vline_4,
            vline_5,
            vline_6,
            fifty_line,
            hist_band,
            hist_avg_line,
        ]
    )

    p.legend.click_policy = "hide"
    p.legend.location = "top_left"
    p.legend.orientation = "horizontal"
    p.legend.background_fill_color = "#f2f2f2"
    p.legend.background_fill_alpha = 0.6

    # Add the HoverTool to the figure
    TOOLTIPS = [
        ("Mediana", "@median_app{00%} e @temporal{%b %Y}"),
        ("75% chance entre", "@median_low{00%} e @median_high{00%}"),
        ("Média histórica entre", "@baseline_lower{00%} e @baseline_upper{00%}"),
    ]
    p.add_tools(
        HoverTool(
            tooltips=TOOLTIPS,
            formatters={"@temporal": "datetime"},
            mode="vline",
            renderers=[median_line],
        )
    )

    return p

In [10]:
source_df1 = get_data_source(trace_raw_fundamental, pp_prop_atual["post_pred_desemprego_positiva"])
source_df2 = get_data_source(trace_raw_fundamental, pp_prop_10["post_pred_desemprego_positiva_10"])
source_df3 = get_data_source(trace_raw_fundamental, pp_prop_19["post_pred_desemprego_positiva_19"])

KeyError: 'post_pred_desemprego_positiva'

In [None]:
trace_raw_fundamental

In [None]:
random_draws1 = samples_subset(source_df1)
random_draws2 = samples_subset(source_df2)
random_draws3 = samples_subset(source_df3)

In [None]:
p1 = make_plot(
    subtitle=f"stays at {df.desocupacao.iloc[-1]}%",
    palette=cividis(6),
    random_draws=random_draws1,
    data_source=source_df1,
    post_pred_samples=pp_prop_atual,
)
p2 = make_plot(
    subtitle="cair para 10%",
    palette=viridis(6),
    random_draws=random_draws2,
    data_source=source_df2,
    post_pred_samples=pp_prop_10,
)
p3 = make_plot(
    subtitle="aumentar para 18%",
    palette=inferno(6),
    random_draws=random_draws3,
    data_source=source_df3,
    post_pred_samples=pp_prop_18,
)

p2.title.text = ''
p3.title.text = ''
p2.x_range = p1.x_range
p3.x_range = p1.x_range

show(column(p1, p2, p3))

In [None]:
%load_ext watermark
%watermark -a @dmarcelinobr -n -u -v -iv