# Forecastability

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gogati/Modern-Time-Series-Forecasting-with-Python-2E/blob/main/notebooks/Chapter04/03-Forecastability.ipynb)

 

In [1]:
%cd ../..

c:\Users\tacke\OneDrive\Documents\GitHub\Modern-Time-Series-Forecasting-with-Python-2E-1


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import os
import plotly.io as pio
pio.templates.default = "plotly_white"
import pandas as pd
from pathlib import Path
from tqdm.autonotebook import tqdm
import statsmodels.api as sm
import warnings


from statsforecast.core import StatsForecast
from utilsforecast.evaluation import evaluate
from statsforecast.models import (
    Naive,
    SeasonalNaive,
    SimpleExponentialSmoothing,
    AutoETS,
    AutoTheta,
    Theta,
    OptimizedTheta,

)
from datasetsforecast.losses import *
#from darts.models import Theta
#from darts.utils.utils import ModelMode, SeasonalityMode
from IPython.display import display, HTML
# %load_ext autoreload
# %autoreload 2
np.random.seed(42)
tqdm.pandas()
pd.set_option('display.max_rows', None)


  from tqdm.autonotebook import tqdm
  "ds": pd.date_range(start="1949-01-01", periods=len(AirPassengers), freq="M"),


In [3]:
# this makes it so that the outputs of the predict methods have the id as a column 
# instead of as the index
if 'NIXTLA_ID_AS_COL' in os.environ:
    del os.environ['NIXTLA_ID_AS_COL']
os.environ['NIXTLA_ID_AS_COL'] = '1'

In [4]:
os.makedirs("imgs/chapter_4", exist_ok=True)
preprocessed = Path("data/london_smart_meters/preprocessed")
output = Path("data/london_smart_meters/output")

In [5]:
def format_plot(fig, legends = None, xlabel="Time", ylabel="Value"):
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t:  t.update(name = next(names)))
    fig.update_layout(
            autosize=False,
            width=900,
            height=500,
            title={
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
            titlefont={
                "size": 20
            },
            legend_title = None,
            yaxis=dict(
                title_text=ylabel,
                titlefont=dict(size=12),
            ),
            xaxis=dict(
                title_text=xlabel,
                titlefont=dict(size=12),
            )
        )
    return fig

# Reading and Selecting Households

In [6]:
try:
    lclid_acorn_map = pd.read_pickle("data/london_smart_meters/preprocessed/london_smart_meters_lclid_acorn_map.pkl")
except FileNotFoundError:
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 02 - Preprocessing London Smart Meter Dataset.ipynb in Chapter02
    </div>
    """))

In [7]:
affluent_households = lclid_acorn_map.loc[lclid_acorn_map.Acorn_grouped=="Affluent", ["LCLid",'file']]
adversity_households = lclid_acorn_map.loc[lclid_acorn_map.Acorn_grouped=="Adversity", ["LCLid",'file']]
comfortable_households = lclid_acorn_map.loc[lclid_acorn_map.Acorn_grouped=="Comfortable", ["LCLid",'file']]

Let's take a subset of the data because if we take everything, it will hit your RAM. Depending on how much RAM you have, we can choose larger data. But to maintain the variety in the dataset, we will do stratified sampling based on Acorn classifications. Here are a few guidelines:

* <= 50 households for 4GB RAM
* 50 - 100 households for 8GB RAM
* 100-150 households for 16GB RAM
* 250 households for 32GB RAM

Let's sample 150 households now, but feel free to reduce of increase as per your hardware constraints

150 households means 50 each from the three Acorn Groups - Affluent, Comfortable, Adversity (we are ignoring the households with unknown ACORN groups)

In [8]:
size = 50
selected_households = pd.concat(
    [
        affluent_households.sample(size, random_state=76),
        comfortable_households.sample(size, random_state=76),
        adversity_households.sample(size, random_state=76),
    ]
)
selected_households['block']=selected_households.file.str.split("_", expand=True).iloc[:,1].astype(int)

In [9]:
# extracting the paths to the different blocks and extracting the starting and ending blocks
path_blocks = [
    (p, *list(map(int, p.name.split("_")[5].split(".")[0].split("-"))))
    for p in Path("data/london_smart_meters/preprocessed").glob(
        "london_smart_meters_merged_block*"
    )
]

In [10]:
household_df_l = []
for path, start_b, end_b in tqdm(path_blocks):
    block_df = pd.read_parquet(path)
    selected_households['block'].between
    mask = selected_households['block'].between(start_b, end_b)
    lclids = selected_households.loc[mask, "LCLid"]
    household_df_l.append(block_df.loc[block_df.LCLid.isin(lclids)])

  0%|          | 0/14 [00:00<?, ?it/s]

In [11]:
block_df = pd.concat(household_df_l)
del household_df_l
block_df.head()

Unnamed: 0,LCLid,start_timestamp,frequency,energy_consumption,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,...,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
57,MAC000768,2012-04-21,30min,"[0.8440000000000001, 0.265, 0.262, 0.233999999...",32544,Std,ACORN-A,Affluent,block_1,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[251, 251, 251, 251, 246, 246, 242, 242, 244, ...","[6.42, 6.42, 6.2, 6.2, 5.68, 5.68, 5.16, 5.16,...","[3.54, 3.54, 3.61, 3.61, 3.52, 3.52, 3.11, 3.1...","[994.96, 994.96, 994.98, 994.98, 994.82, 994.8...","[3.79, 3.79, 3.67, 3.67, 3.15, 3.15, 2.61, 2.6...","[3.64, 3.64, 3.42, 3.42, 3.25, 3.25, 3.13, 3.1...","[rain, rain, rain, rain, rain, rain, rain, rai...","[partly-cloudy-night, partly-cloudy-night, par...","[0.82, 0.82, 0.83, 0.83, 0.86, 0.86, 0.87, 0.8...","[Partly Cloudy, Partly Cloudy, Partly Cloudy, ..."
63,MAC000948,2012-05-02,30min,"[0.008, 0.009, 0.008, 0.008, 0.008, 0.009, 0.0...",32016,Std,ACORN-A,Affluent,block_1,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[351, 351, 0, 0, 0, 0, 351, 351, 348, 348, 3, ...","[11.81, 11.81, 11.12, 11.12, 11.2, 11.2, 11.18...","[10.47, 10.47, 10.15, 10.15, 9.89, 9.89, 9.29,...","[1021.42, 1021.42, 1021.44, 1021.44, 1021.33, ...","[11.81, 11.81, 11.12, 11.12, 11.2, 11.2, 11.18...","[2.53, 2.53, 2.41, 2.41, 2.06, 2.06, 2.98, 2.9...","[rain, rain, rain, rain, rain, rain, rain, rai...","[partly-cloudy-night, partly-cloudy-night, par...","[0.91, 0.91, 0.94, 0.94, 0.92, 0.92, 0.88, 0.8...","[Mostly Cloudy, Mostly Cloudy, Mostly Cloudy, ..."
2827,MAC003299,2012-09-25,30min,"[0.254, 0.201, 0.183, 0.2189999999999999, 0.18...",25008,Std,ACORN-C,Affluent,block_5,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[220, 220, 218, 218, 215, 215, 211, 211, 207, ...","[10.93, 10.93, 10.81, 10.81, 10.27, 10.27, 10....","[7.76, 7.76, 8.07, 8.07, 8.04, 8.04, 7.62, 7.6...","[989.26, 989.26, 989.27, 989.27, 989.0, 989.0,...","[10.93, 10.93, 10.81, 10.81, 10.27, 10.27, 10....","[4.9, 4.9, 4.98, 4.98, 4.45, 4.45, 4.51, 4.51,...","[rain, rain, rain, rain, rain, rain, rain, rai...","[clear-night, clear-night, clear-night, clear-...","[0.81, 0.81, 0.83, 0.83, 0.86, 0.86, 0.84, 0.8...","[Clear, Clear, Clear, Clear, Clear, Clear, Cle..."
3389,MAC003157,2012-07-15,30min,"[0.181, 0.126, 0.13, 0.134, 0.18, 0.179, 0.118...",28464,ToU,ACORN-C,Affluent,block_6,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[250, 250, 259, 259, 267, 267, 284, 284, 286, ...","[12.72, 12.72, 12.72, 12.72, 12.98, 12.98, 12....","[12.01, 12.01, 12.0, 12.0, 12.32, 12.32, 11.83...","[1011.2, 1011.2, 1011.17, 1011.17, 1011.23, 10...","[12.72, 12.72, 12.72, 12.72, 12.98, 12.98, 12....","[1.73, 1.73, 2.15, 2.15, 2.31, 2.31, 2.28, 2.2...","[rain, rain, rain, rain, rain, rain, rain, rai...","[clear-night, clear-night, partly-cloudy-night...","[0.95, 0.95, 0.95, 0.95, 0.96, 0.96, 0.95, 0.9...","[Clear, Clear, Partly Cloudy, Partly Cloudy, P..."
3916,MAC000193,2012-01-01,30min,"[0.368, 0.386, 0.17, 0.021, 0.038, 0.038, 0.02...",37872,ToU,ACORN-D,Affluent,block_7,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[229, 229, 238, 238, 229, 229, 231, 231, 227, ...","[12.12, 12.12, 12.59, 12.59, 12.45, 12.45, 12....","[10.97, 10.97, 11.02, 11.02, 11.04, 11.04, 10....","[1008.1, 1008.1, 1007.88, 1007.88, 1007.95, 10...","[12.12, 12.12, 12.59, 12.59, 12.45, 12.45, 12....","[5.9, 5.9, 6.06, 6.06, 5.31, 5.31, 4.68, 4.68,...","[rain, rain, rain, rain, rain, rain, rain, rai...","[partly-cloudy-night, partly-cloudy-night, clo...","[0.93, 0.93, 0.9, 0.9, 0.91, 0.91, 0.93, 0.93,...","[Mostly Cloudy, Mostly Cloudy, Overcast, Overc..."


## Fill in missing values

Let's fill in missing values

In [12]:
from src.imputation.interpolation import SeasonalInterpolation

In [13]:
block_df.energy_consumption = block_df.energy_consumption.progress_apply(lambda x: SeasonalInterpolation(seasonal_period=48*7).fit_transform(x.reshape(-1,1)).squeeze())

  0%|          | 0/150 [00:00<?, ?it/s]

# Deseasonalize and Detrend

In [14]:
from src.decomposition.seasonal import MultiSeasonalDecomposition

In [15]:
def make_stationary(row):
#     print(row)
    # Order of row: LCLid, timestamp, frequency, energy_consumption
    ts = row[3]
    dates = pd.date_range(start=row[1], freq=row[2], periods=len(ts))
    stl = MultiSeasonalDecomposition(seasonal_model="fourier",seasonality_periods=["day_of_year", "day_of_week", "hour"], model = "additive", n_fourier_terms=10)
    res = stl.fit(pd.Series(ts, index=dates))
    return res.resid.values# + res.trend.values

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning) 
    block_df['residuals'] =[make_stationary(r) for r in tqdm(zip(*block_df.to_dict("list").values()), total=len(block_df))]

  0%|          | 0/150 [00:00<?, ?it/s]

# Forecastability Metrics

## Coefficient of Variation

In [16]:
from src.forecastability.cov import calc_cov

In [17]:
block_df["cov"] = block_df.progress_apply(lambda x: calc_cov(x['energy_consumption']), axis=1)


  0%|          | 0/150 [00:00<?, ?it/s]

In [18]:
fig = px.histogram(x=block_df["cov"], title="Distribution of Coefficient of Variation")
fig = format_plot(fig, xlabel="Coefficient of Variation", ylabel="")
# fig.write_image("imgs/chapter_4/cov.png")
fig.show()

## Residual Variability

In [19]:
from src.forecastability.cov import calc_norm_sd

In [20]:
block_df["residual_variability"] = block_df.progress_apply(lambda x: calc_norm_sd(x['residuals'],x['energy_consumption']), axis=1)

  0%|          | 0/150 [00:00<?, ?it/s]

In [21]:
fig = px.histogram(x=block_df["residual_variability"], title="Distribution of Residual Variability")
fig = format_plot(fig, xlabel="Residual Variability", ylabel="")
# fig.write_image("imgs/chapter_4/rv.png")
fig.show()

## Spectral Entropy

In [22]:
from src.forecastability.entropy import spectral_entropy

In [23]:
%%time
block_df["residual_spectral_entropy"] = block_df.residuals.progress_apply(spectral_entropy)

  0%|          | 0/150 [00:00<?, ?it/s]

CPU times: total: 1.11 s
Wall time: 4.35 s


In [24]:
%%time
block_df["spectral_entropy"] = block_df.energy_consumption.progress_apply(lambda x: spectral_entropy(x, transform_stationary=True))

  0%|          | 0/150 [00:00<?, ?it/s]

CPU times: total: 29.7 s
Wall time: 54.4 s


In [25]:
fig = px.histogram(x=block_df["spectral_entropy"], title="Distribution of Spectral Entropy")
fig = format_plot(fig, xlabel="Spectral Entropy", ylabel="")
# fig.write_image("imgs/chapter_4/spectral_entropy.png")
fig.show()

In [26]:
fig = px.histogram(x=block_df["residual_spectral_entropy"], title="Distribution of Residual Spectral Entropy")
fig = format_plot(fig, xlabel="Residual Spectral Entropy", ylabel="")
# fig.write_image("imgs/chapter_4/resid_spectral_entropy.png")
fig.show()

## Kaboudan Metric

In [27]:
#cd ../../

In [28]:
from src.forecastability.kaboudan import kaboudan_metric, modified_kaboudan_metric 

Theta method can take over an hour depending on the power of your machine.  Feel free to also try other models such as SeasonalNaive and SimpleExponentialSmoothing.

In [49]:
%%time
block_size = 5
freq = '30min'
#models = [SeasonalNaive(season_length=48*7)]
#models = [SimpleExponentialSmoothing(alpha = 0.4, alias = 'SimpleExponentialSmoothing')]
models = [Theta(season_length=48*7, decomposition_type='additive')]


  
block_df["kaboudan_metric"] = [kaboudan_metric(r[0], 
                                               model=models[0], 
                                               block_size=block_size,  
                                               backtesting_start=0.5, 
                                               n_folds=1,
                                               freq = freq) 
                                               for r in tqdm(zip(*block_df[["energy_consumption"]].to_dict("list").values()), total=len(block_df))]

block_df[['LCLid','kaboudan_metric']].head()

  0%|          | 0/150 [00:00<?, ?it/s]

CPU times: total: 52min 14s
Wall time: 1h 11min 7s


Unnamed: 0,LCLid,kaboudan_metric
57,MAC000768,0.454284
63,MAC000948,-6.683012
2827,MAC003299,0.393858
3389,MAC003157,0.374558
3916,MAC000193,-4.853901


In [50]:
%%time
block_df["modified_kaboudan_metric"] = [modified_kaboudan_metric(r[0], 
                                               model=models[0], 
                                               block_size=block_size,  
                                               backtesting_start=0.5, 
                                               n_folds=1,
                                               freq = freq) 
                                               for r in tqdm(zip(*block_df[["energy_consumption"]].to_dict("list").values()), total=len(block_df))]

block_df[['LCLid','modified_kaboudan_metric']].head()

  0%|          | 0/150 [00:00<?, ?it/s]

CPU times: total: 50min 44s
Wall time: 1h 18min 27s


Unnamed: 0,LCLid,modified_kaboudan_metric
57,MAC000768,0.043416
63,MAC000948,0.0
2827,MAC003299,0.217729
3389,MAC003157,0.123908
3916,MAC000193,0.0


In [51]:
fig = px.histogram(x=block_df["modified_kaboudan_metric"], title="Distribution of Modified Kaboudan Metric")
fig = format_plot(fig, xlabel="Modified Kaboudan Metric", ylabel="")
# fig.write_image("imgs/chapter_4/kaboudan_metric.png")
fig.show()

# Analysis

In [52]:
forecastability_df = block_df[["LCLid",'spectral_entropy', 'residual_spectral_entropy',
       'modified_kaboudan_metric', 'cov', "kaboudan_metric",
       'residual_variability']]
rename_dict = {
    "spectral_entropy": "Spectral Entropy",
    "residual_spectral_entropy" : "Residual Spectral Entropy",
    "modified_kaboudan_metric": "Modified Kaboudan Metric",
    "kaboudan_metric": "Kaboudan Metric",
    "cov": "Coefficient of Variation",
    "residual_variability": "Residual Variability",
    
}
forecastability_df.to_pickle(output/"forecastability_metrics.pkl")

In [53]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def biplot(item_metrics_df, features, title="Loading Plot"):
    X = item_metrics_df[features].dropna()
    scaler = StandardScaler()
    scaler.fit(X)
    X=scaler.transform(X)
    pca = PCA(n_components=2, whiten=True)
    components = pca.fit_transform(X)
    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    xs = components[:,0]
    ys = components[:,1]
    scalex = 1.0/(xs.max() - xs.min())
    scaley = 1.0/(ys.max() - ys.min())
    xl = loadings[:,0]
    yl = loadings[:,1]
    scalex_loading = 1.0/(xl.max() - xl.min())
    scaley_loading = 1.0/(yl.max() - yl.min())
    fig = px.scatter(x=xs * scalex, y=ys * scaley, opacity=0, template="plotly_white")
    for i, feature in enumerate(features):
        fig.add_shape(
            type='line',
            x0=0, y0=0,
            x1=xl[i]* scalex_loading,
            y1=yl[i]* scaley_loading,
            line=dict(color="indigo",
                width=2,
                dash="dot")
        )
        fig.add_annotation(
            x=xl[i]* scalex_loading,
            y=yl[i]* scaley_loading,
            ax=0, ay=0,
            xanchor="center",
            yanchor="bottom",
            text= feature, #"<b>"+feature+"</b>",
            font=dict(
            family="Open Sans, sans serif",
            size=16,
            color="MediumPurple"
            ),
        )
    fig.update_layout(title_text=title, title_x=0.5)
    
    fig.show()

## Rank Correlation of Forecastability Metrics

Rank Correlation is when we calculate the rank of each household according to these metrics and then calculate a Spearman's Correlation on these ranks. Here we do it within the different metrics to find out how similar these metrics are.

In [54]:
def calc_rank(rank_df):
    for col in ['spectral_entropy', 'residual_spectral_entropy',
       'cov',"residual_variability"]:
        rank_df[col] = rank_df[col].rank(ascending=True)

    for col in ['modified_kaboudan_metric',"kaboudan_metric"]:
        rank_df[col] = rank_df[col].rank(ascending=False)

    return rank_df

item_rankings =calc_rank(forecastability_df.drop(columns="LCLid"))
item_rankings.rename(columns=rename_dict, inplace=True)
item_rankings.rename(index=rename_dict, inplace=True)

In [55]:
corr_df = item_rankings.corr(method='spearman')
# corr_df.style.background_gradient(cmap='coolwarm')

In [56]:
import plotly.figure_factory as ff

# mask = np.triu(np.ones_like(corr_df, dtype=bool))
# df_mask = corr_df.mask(mask).round(2)
df_mask = corr_df.round(2)

fig = ff.create_annotated_heatmap(z=df_mask.to_numpy(), 
                                  x=df_mask.columns.tolist(),
                                  y=df_mask.columns.tolist(),
                                  colorscale=px.colors.diverging.RdYlGn,
                                  hoverinfo="none", #Shows hoverinfo for null values
                                  showscale=True, ygap=1, xgap=1
                                 )

fig.update_xaxes(side="bottom")
# fig.update_traces(textfont_size=14)
fig.update_layout(
    title_text='Rank Correlation Plot - Forecastability Metrics', 
    title_x=0.5, 
    width=700, 
    height=700,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_zeroline=False,
    yaxis_zeroline=False,
    yaxis_autorange='reversed',
    template='plotly_white'
)

# NaN values are not handled automatically and are displayed in the figure
# So we need to get rid of the text manually
for i in range(len(fig.layout.annotations)):
    fig.layout.annotations[i]['font']['size']=15
    if fig.layout.annotations[i].text == 'nan':
        fig.layout.annotations[i].text = ""

# fig.write_image("imgs/chapter_4/rank_correlation_forecastability.png")
fig.show()

We can see very high correlations between CoV and REsidual Variabilily because they are measuring the same thing, essentially. Residual Variability just fixes a few problems with the original CoV metric. Similarly we can see high correlations between Kaboudan and Modified Kaboudan metrics, which are also essentially the same metric. Spectral Entropy has relatively higher correlation betweem all the other metrics, the most with Residual Variability.

## Rank Correlation of Forecastability vs Forecast Metrics

We also have the metrics that we calculated using the baseline forecasts from notebook 02-Baseline Forecasts using darts.ipynb. We merge that information with the forecastability metrics and calculate a rank correlation to see which of tehse metrics really corelates with the baseline error metrics.

In [57]:
base_metric_df = pd.read_pickle(output/"baseline_test_metrics_df.pkl")

In [58]:
base_metric_df.head()

metric,LCLid,Algorithm,mae,mase,mse,rmse
0,MAC000061,AutoETS,0.058033,0.994184,0.006229,0.078926
1,MAC000061,TBATS,0.075398,1.291667,0.010697,0.103424
2,MAC000062,AutoETS,0.091553,1.02042,0.02873,0.1695
3,MAC000062,TBATS,0.075708,0.843818,0.027612,0.166167
4,MAC000066,AutoETS,0.044511,0.844105,0.004667,0.068317


In [59]:
forecastability_df = forecastability_df.merge(base_metric_df[base_metric_df.Algorithm=="AutoETS"], on="LCLid", how='inner')

In [60]:
forecastability_df.head()

Unnamed: 0,LCLid,spectral_entropy,residual_spectral_entropy,modified_kaboudan_metric,cov,kaboudan_metric,residual_variability,Algorithm,mae,mase,mse,rmse
0,MAC000768,0.80721,0.890163,0.043416,0.889308,0.454284,0.787155,AutoETS,0.155062,0.951542,0.050409,0.224519
1,MAC000948,0.824352,0.877598,0.0,0.788908,-6.683012,0.705883,AutoETS,0.125941,0.899982,0.050007,0.223623
2,MAC003299,0.738643,0.880901,0.217729,0.67712,0.393858,0.57176,AutoETS,0.131688,0.755936,0.041799,0.204449
3,MAC003157,0.655312,0.82691,0.123908,0.706605,0.374558,0.561631,AutoETS,0.156521,1.100816,0.05373,0.231797
4,MAC000193,0.72589,0.86085,0.0,1.382776,-4.853901,1.176164,AutoETS,0.20602,1.206805,0.100771,0.317445


In [61]:
def calc_rank(rank_df):
    for col in ['spectral_entropy', 'residual_spectral_entropy',
       'cov',"residual_variability"]:
        rank_df[col] = rank_df[col].rank(ascending=True)

    for col in ['modified_kaboudan_metric',"kaboudan_metric", "mae", "mse", "mase"]:
        rank_df[col] = rank_df[col].rank(ascending=False)
    
    # for col in ["Forecast Bias"]:
    #     rank_df[col] = np.abs(rank_df[col]).rank(ascending=False)

    return rank_df

item_rankings =calc_rank(forecastability_df.drop(columns=["LCLid",'Algorithm']))
item_rankings.rename(columns=rename_dict, inplace=True)
item_rankings.rename(index=rename_dict, inplace=True)
item_rankings.drop(columns=['Time Elapsed', "Model"], inplace=True)
corr_df = item_rankings.corr(method='spearman')

In [62]:
corr_df.head()

Unnamed: 0,Spectral Entropy,Residual Spectral Entropy,Modified Kaboudan Metric,Coefficient of Variation,Kaboudan Metric,Residual Variability,mae,mase,mse,rmse
Spectral Entropy,1.0,0.627187,0.419594,0.086278,0.356018,0.259583,0.164304,0.145427,0.099512,-0.099512
Residual Spectral Entropy,0.627187,1.0,0.186424,-0.042798,0.162142,0.040345,0.312681,0.333343,0.23064,-0.23064
Modified Kaboudan Metric,0.419594,0.186424,1.0,0.126712,0.634865,0.21319,-0.002456,-0.077446,-0.036175,0.036175
Coefficient of Variation,0.086278,-0.042798,0.126712,1.0,0.154903,0.972031,-0.257368,-0.292353,-0.303763,0.303763
Kaboudan Metric,0.356018,0.162142,0.634865,0.154903,1.0,0.207687,-0.085897,-0.152535,-0.12448,0.12448


In [63]:
import plotly.figure_factory as ff

mask = np.zeros_like(corr_df).astype(bool)
mask[:5,:] = True
df_mask = corr_df.mask(mask).round(2)

fig = ff.create_annotated_heatmap(z=df_mask.to_numpy(), 
                                  x=df_mask.columns.tolist(),
                                  y=df_mask.columns.tolist(),
                                  colorscale=px.colors.diverging.RdYlGn,
                                  hoverinfo="none", #Shows hoverinfo for null values
                                  showscale=True, ygap=1, xgap=1
                                 )

fig.update_xaxes(side="bottom")
# fig.update_traces(textfont_size=14)
fig.update_layout(
    title_text='Rank Correlation Plot - Forecastability vs Forecast Metrics', 
    title_x=0.5, 
    width=700, 
    height=700,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_zeroline=False,
    yaxis_zeroline=False,
    # yaxis_autorange='reversed',
    xaxis_range=[-0.5,5.5],
    yaxis_range=[5.5, 9.5]
)

# NaN values are not handled automatically and are displayed in the figure
# So we need to get rid of the text manually
for i in range(len(fig.layout.annotations)):
    fig.layout.annotations[i]['font']['size']=15
    fig.layout.annotations[i]['font']['color']="#000000"
    if fig.layout.annotations[i].text == 'nan':
        fig.layout.annotations[i].text = ""
# fig.write_image("imgs/chapter_4/rank_correlation_forecastability_vs_metrics.png")
fig.show()

In [64]:
biplot(item_rankings, features=['Spectral Entropy',"Residual Spectral Entropy",
       'Modified Kaboudan Metric', 'Coefficient of Variation',"Kaboudan Metric",
       'Residual Variability', "mae", "mse", "mase"
       #,"Forecast Bias"
       ], title="<b>Loading Plot: Forecastability Metrics vs Error Metrics</b>")