In [31]:
# 06 Combined Error Metrics
## Diebold-Mariano (DM) test

In [42]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
import itertools
import statsmodels.api as sm
import os
from joblib import Parallel, delayed
import properscoring as ps
import warnings

# Sets PYTHONWARNINGS in this process's environment only; nothing is persisted beyond the session.
# NOTE(review): presumably intended to silence warnings in worker processes spawned later — confirm.
os.environ["PYTHONWARNINGS"] = "ignore" # applies to the current session only

In [33]:
from epiweeks import Week, Year
from datetime import date
def create_epiweek(date):
    """Convert a date into an epidemiological week by delegating to ``Week.fromdate``."""
    epiweek = Week.fromdate(date)
    return epiweek
def create_epiweekplot(epiweek):
    """
    Build a plot label of the form ``Y<first four characters>W<remaining characters>``
    from the string representation of `epiweek` (e.g. 201604 -> 'Y2016W04').
    """
    text = str(epiweek)
    year_part, week_part = text[:4], text[4:]
    return f'Y{year_part}W{week_part}'
def create_epiweek_fromstr(str):
    """Parse an epiweek string by delegating to ``Week.fromstring``."""
    # The parameter name shadows the builtin `str`; it is kept so keyword callers keep working.
    epiweek = Week.fromstring(str)
    return epiweek

In [34]:
# def generate_error_metrics(dataset, target_var):
#     pred = dataset.copy()
#     model_list = list(pred.columns.values)
#     y = pred[[target_var]]
#     model_list.remove(target_var)

#     error_df = pd.DataFrame()
#     #print(model_list)

#     for model in model_list:
#         model_val = pred[[model]].dropna()
#         window_start = model_val.index[0]
#         window_end = model_val.index[-1]
#         y_val = y.loc[window_start:window_end].copy()

#         ## Diebold-Mariano against Naive
#         if model == 'naive':
#             dm_stat, pvalue = 0, 0
#         else:
#             dm_stat, pvalue = dm_test(y_val, naive_val, model_val)
#             if pvalue < 0.05:
#                 pvalue = 'R'
#             else:
#                 pvalue = 'A'

#         error_df.at[model, 'DM'], error_df.at[model, 'pval'] = dm_stat, pvalue

#     return error_df


In [35]:
from itertools import islice
from typing import Sequence, Callable, List, Tuple
from math import lgamma, fabs, isnan, nan, exp, log, log1p, sqrt


class InvalidParameterException(Exception):
    """Raised when an argument passed to the Diebold-Mariano routines is invalid."""

    def __init__(self, message: str):
        # Forward the message unchanged to the base exception.
        Exception.__init__(self, message)


class ZeroVarianceException(ArithmeticError):
    """Raised when the variance estimate of the loss differential is exactly zero."""

    def __init__(self, message: str):
        # Forward the message unchanged to the base exception.
        ArithmeticError.__init__(self, message)


def autocovariance(X: Sequence[float], k: int, mean: float) -> float:
    """
    Compute the lag-`k` autocovariance of `X` around the supplied `mean`.

    Each element from position `k` onwards is paired with the element `k`
    positions earlier; the products of their deviations from `mean` are summed
    and divided by the full length of `X`.
    """
    lagged = islice(X, k, None)
    total = 0
    for later, earlier in zip(lagged, X):
        total += (later - mean) * (earlier - mean)
    return total / len(X)


def log_beta(a: float, b: float) -> float:
    """
    Natural logarithm of the beta function B(a, b), computed from
    log-gamma values as lgamma(a) + lgamma(b) - lgamma(a + b).
    """
    log_gamma_a = lgamma(a)
    log_gamma_b = lgamma(b)
    log_gamma_sum = lgamma(a + b)
    return log_gamma_a + log_gamma_b - log_gamma_sum


def evaluate_continuous_fraction(
    fa: Callable[[int, float], float],
    fb: Callable[[int, float], float],
    x: float,
    *,
    epsilon: float = 1e-10,
    maxiter: int = 10000,
    small: float = 1e-50
) -> float:
    """
    Evaluate the continued fraction

        fa(0, x) + fb(1, x) / (fa(1, x) + fb(2, x) / (fa(2, x) + ...))

    by iteratively updating a running product of correction factors.

    Parameters
    ----------
    fa: Callable[[int, float], float]
        Returns the n-th additive term of the fraction.

    fb: Callable[[int, float], float]
        Returns the n-th partial numerator of the fraction.

    x: float
        Point at which both coefficient functions are evaluated.

    epsilon: float
        Iteration stops once a correction factor is within `epsilon` of 1.

    maxiter: int
        Upper bound on the iteration counter. If it is reached, the latest
        estimate is returned without any error being raised.

    small: float
        Replacement for intermediate values whose magnitude falls below it,
        which guards the divisions against zero denominators.

    Returns
    -------
    The latest estimate of the continued fraction.
    """
    h_prev = fa(0, x)
    # Only a leading term of (near-)zero magnitude is replaced. The previous
    # `fabs(h_prev < small)` took the absolute value of the comparison result,
    # so every negative leading term was overwritten with `small`.
    if fabs(h_prev) < small:
        h_prev = small

    n: int = 1
    d_prev: float = 0.0
    c_prev: float = h_prev
    hn: float = h_prev

    while n < maxiter:
        a = fa(n, x)
        b = fb(n, x)

        dn = a + b * d_prev
        if fabs(dn) < small:
            dn = small

        cn = a + b / c_prev
        if fabs(cn) < small:
            cn = small

        dn = 1 / dn
        delta_n = cn * dn
        hn = h_prev * delta_n

        # Converged: the latest correction factor is indistinguishable from 1.
        if fabs(delta_n - 1.0) < epsilon:
            break

        d_prev = dn
        c_prev = cn
        h_prev = hn

        n += 1

    return hn


def regularized_incomplete_beta(
    x: float, a: float, b: float, *, epsilon: float = 1e-10, maxiter: int = 10000
) -> float:
    """
    Regularized incomplete beta function I_x(a, b).

    Parameters
    ----------
    x: float
        Upper integration limit; must lie in [0, 1].

    a, b: float
        Shape parameters; both must be strictly positive.

    epsilon, maxiter:
        Forwarded to `evaluate_continuous_fraction`.

    Returns
    -------
    I_x(a, b), or NaN if any argument is NaN or outside its valid range.
    """
    if isnan(x) or isnan(a) or isnan(b) or x < 0 or x > 1 or a <= 0 or b <= 0:
        return nan

    # Exact boundary values. Without these guards x == 0 reaches log(0) and
    # x == 1 reaches it through the symmetry branch below, raising
    # "math domain error" for perfectly valid arguments.
    if x == 0:
        return 0.0
    if x == 1:
        return 1.0

    # Use the symmetry I_x(a, b) = 1 - I_{1-x}(b, a) to evaluate the fraction
    # on the side of the interval selected by this condition.
    if x > (a + 1) / (2 + b + a) and 1 - x <= (b + 1) / (2 + b + a):
        return 1 - regularized_incomplete_beta(
            1 - x, b, a, epsilon=epsilon, maxiter=maxiter
        )

    def fa(n: int, x: float) -> float:
        # Every additive term of the fraction is 1.
        return 1.0

    def fb(n: int, x: float) -> float:
        # Partial numerators alternate between two closed forms for even and odd n.
        if n % 2 == 0:
            m = n / 2.0
            return (m * (b - m) * x) / ((a + (2 * m) - 1) * (a + (2 * m)))

        m = (n - 1.0) / 2.0
        return -((a + m) * (a + b + m) * x) / ((a + (2 * m)) * (a + (2 * m) + 1.0))

    # The prefactor is assembled in log space before exponentiation.
    return exp(
        a * log(x) + b * log1p(-x) - log(a) - log_beta(a, b)
    ) / evaluate_continuous_fraction(fa, fb, x, epsilon=epsilon, maxiter=maxiter)


def dm_test(
    P1: Sequence[float],
    P2: Sequence[float],
    *,
    h: int = 1,
    one_sided: bool = False,
    harvey_correction: bool = True
) -> Tuple[float, float]:
    r"""
    Performs the Diebold-Mariano test using precomputed loss values.
    The null hypothesis is that the two forecasts (`P1`, `P2`) have the same accuracy.

    Parameters
    ----------
    P1: Sequence[float]
        First loss series. Plain Python numbers, NumPy scalars and one-element
        NumPy arrays (e.g. the rows of an ``(n, 1)`` array) are all accepted.

    P2: Sequence[float]
        Second loss series, same length as `P1`.

    h: int
        The forecast horizon. Default is 1.

    one_sided: bool
        If set to true, the two-sided p-value is returned only when the test
        statistic is positive; otherwise the p-value is set to 1. Default is false.

    harvey_correction: bool
        If set to true, uses a modified test statistic as per Harvey, Leybourne and Newbold (1997).

    Returns
    -------
    A tuple of two values. The first is the test statistic, the second is the p-value.

    Raises
    ------
    InvalidParameterException
        If the series differ in length, are empty, or `h` is not positive.

    ZeroVarianceException
        If the variance estimate of the loss differential is exactly zero.
    """
    if not (len(P1) == len(P2)):
        raise InvalidParameterException(
            "Prediction series must have the same length."
        )

    if h <= 0:
        raise InvalidParameterException(
            "Invalid parameter for horizon length. Must be a positive integer."
        )

    n = len(P1)
    if n == 0:
        # Previously surfaced as an unexplained ZeroDivisionError.
        raise InvalidParameterException("Loss series must not be empty.")

    def _scalar(value) -> float:
        # NumPy scalars and one-element arrays expose .item(); plain numbers do not.
        return value.item() if hasattr(value, "item") else float(value)

    # Work on plain Python numbers so the statistic and the p-value are ordinary
    # floats regardless of the container type the losses arrived in.
    D = [_scalar(l1) - _scalar(l2) for l1, l2 in zip(P1, P2)]
    mean = sum(D) / n

    # Variance estimate of the mean loss differential: lag-0 autocovariance plus
    # twice each autocovariance up to lag h - 1, divided by n.
    V_d = 0.0
    for i in range(h):
        cov = autocovariance(D, i, mean)
        if i != 0:
            cov *= 2
        V_d += cov

    V_d /= n

    if V_d == 0:
        raise ZeroVarianceException(
            "Variance of the DM statistic is zero. Maybe the loss series are identical?"
        )

    if harvey_correction:
        harvey_adj = sqrt((n + 1 - 2 * h + h * (h - 1) / n) / n)
        dmstat = harvey_adj * mean / sqrt(V_d)
    else:
        dmstat = mean / sqrt(V_d)

    # Two-sided p-value, expressed through the regularized incomplete beta
    # function with n - 1 in the role of degrees of freedom.
    pvalue = regularized_incomplete_beta(
        (n - 1) / ((n - 1) + dmstat ** 2), 0.5 * (n - 1), 0.5
    )

    # NOTE(review): a conventional one-sided p-value would halve the two-sided
    # value in the tested direction. The rule below is kept exactly as it was —
    # confirm it is intended before relying on the reported significance.
    if one_sided and not dmstat > 0:
        pvalue = 1

    # `dmstat` is already a plain float here; the former `dmstat.item()` raised
    # AttributeError whenever the inputs were not NumPy-backed.
    return dmstat, pvalue

In [36]:
def crps(y_val, y_pred, model, target_var):
    """
    Score each row label of `y_val` with ``ps.crps_ensemble``.

    For every index label, the observation ``y_val.loc[label, target_var]`` is
    scored against the row ``y_pred.loc[label]`` converted to a float64 array.
    The scores are collected in a DataFrame with one column named `model`.
    The global NumPy random seed is reset to 0 on every call.
    """
    np.random.seed(0)
    scores = pd.DataFrame()

    for week in y_val.index:
        observed = y_val.loc[week, target_var]
        ensemble = np.array(y_pred.loc[week], dtype='float64')
        scores.at[week, model] = ps.crps_ensemble(observed, ensemble)

    return scores

In [37]:
def prepare_diebold_mariano(pred_models_path, pred_combis_path, target_var, model_1, model_2, model_list, combi_list):
    """
    Load the prediction files of two models and return their per-epiweek CRPS frames.

    Parameters
    ----------
    pred_models_path, pred_combis_path:
        Directories holding ``<name>.csv`` files for individual models and for
        combinations respectively.

    target_var:
        Name of the column holding the observed values in each prediction file.

    model_1, model_2:
        Names of the two models to load. A name found in `model_list` is read
        from `pred_models_path`; any other name is read from `pred_combis_path`.

    model_list:
        Names of the individual models.

    combi_list:
        Unused; kept so existing callers do not need to change.

    Returns
    -------
    A tuple ``(model_1_val_crps, model_2_val_crps)`` as produced by `crps`.

    Raises
    ------
    FileNotFoundError
        If either prediction file does not exist. This previously surfaced as an
        UnboundLocalError at the return statement.
    """
    def _prediction_file(model):
        # Individual models and combinations live in different directories.
        base_path = pred_models_path if model in model_list else pred_combis_path
        return os.path.join(base_path, f'{model}.csv')

    def _load_crps(model, pred_file):
        # Read one prediction file, index it by epiweek and score it.
        model_val = pd.read_csv(pred_file, parse_dates = [0], dayfirst = True)
        model_val['epiweek'] = model_val['epiweek'].apply(create_epiweek_fromstr)
        model_val = model_val.set_index('epiweek')
        y_val = model_val[[target_var]]
        # Every column after the first is passed on as the prediction columns.
        return crps(y_val.copy(), model_val.iloc[:,1:].copy(), model, target_var)

    pred_model1_file = _prediction_file(model_1)
    pred_model2_file = _prediction_file(model_2)

    missing_files = [
        pred_file
        for pred_file in (pred_model1_file, pred_model2_file)
        if not os.path.isfile(pred_file)
    ]
    if missing_files:
        raise FileNotFoundError(
            f"Prediction file(s) not found: {', '.join(missing_files)}"
        )

    model_1_val_crps = _load_crps(model_1, pred_model1_file)
    model_2_val_crps = _load_crps(model_2, pred_model2_file)

    return model_1_val_crps, model_2_val_crps


In [38]:
def evaluate_pvalue(pvalue):
    """
    Encode a p-value as a significance flag at the 0.05 level.

    Returns -1 when `pvalue` is below 0.05: the null hypothesis of equal
    predictive capability is rejected (non-equivalence, shown in red).
    Returns 1 otherwise: there is not enough evidence that one model predicts
    better than the other.
    """
    significance_level = 0.05
    rejects_null = pvalue < significance_level
    return -1 if rejects_null else 1

In [39]:
# def generate_diebold_mariano(dataset, target_var, step_name):
    
#     pred_models_path = os.path.join(pred_directory_path,step_name)
#     if os.path.isdir(pred_models_path):
#         for model_name in os.listdir(pred_models_path): # 'model_name' here includes the '.csv'
#             pred_file = os.path.join(pred_models_path, model_name)
#             model = model_name[0:-4]
#             if os.path.isfile(pred_file):
#                 y_pred = pd.read_csv(pred_file, parse_dates = [0], dayfirst = True)  
#                 y_pred['epiweek'] = y_pred['epiweek'].apply(create_epiweek_fromstr)
#                 y_pred = y_pred.set_index('epiweek') 
    
#     model_list = list(pred.columns.values)
#     y = pred[[target_var]]
#     model_list.remove(target_var)

#     diebold_mariano_dmstat_df = pd.DataFrame(index=model_list, columns=model_list)
#     diebold_mariano_pvalue_df = pd.DataFrame(index=model_list, columns=model_list)
    
#     for model_1 in model_list:
#         for model_2 in model_list:
#             if model_1 == model_2:
#                 dm_stat, pvalue = 0, 0
#             else:
#                 if pd.isna(diebold_mariano_pvalue_df.loc[model_2, model_1]):
#                     model_1_val, model_2_val, y_val = prepare_diebold_mariano(pred, target_var, model_1, model_2)
#                     dm_stat, pvalue = dm_test(y_val, model_1_val, model_2_val, one_sided=True)
#                     pvalue = evaluate_pvalue(pvalue)
#                 else:
#                     dm_stat, pvalue = 0, 0
#             diebold_mariano_dmstat_df.at[model_1, model_2], diebold_mariano_pvalue_df.at[model_1, model_2] = dm_stat, pvalue
#     return diebold_mariano_dmstat_df, diebold_mariano_pvalue_df

In [40]:
def generate_diebold_mariano(target_var, pred_directory, pred_combi_directory):
    """
    Run pairwise Diebold-Mariano tests on CRPS losses for every step of one target variable.

    For each step sub-directory of ``<target_var>/<pred_directory>``, all models
    (CSV files in that directory) and all combinations (CSV files in the matching
    sub-directory of ``<target_var>/<pred_combi_directory>``) are compared pairwise.
    Only pairs where the first name precedes the second in the combined list are
    tested; the diagonal and the mirrored cells are filled with 0.

    Two CSV files are written per step: ``<target_var>/dmstat/<step>.csv`` with
    the test statistics and ``<target_var>/pvalue/<step>.csv`` with the
    significance flags returned by `evaluate_pvalue`.

    Parameters
    ----------
    target_var:
        Name of the target variable; also the root directory of its files.

    pred_directory, pred_combi_directory:
        Sub-directory names (relative to `target_var`) holding the per-step
        model and combination prediction files.
    """
    def _csv_stems(directory):
        # Names of the CSV files in `directory`, without extension, in sorted order.
        # Non-CSV entries (e.g. `.ipynb_checkpoints`, `.DS_Store`) are ignored; blindly
        # slicing off the last four characters turned them into bogus model names.
        stems = []
        for entry in sorted(os.listdir(directory)):
            stem, extension = os.path.splitext(entry)
            if extension.lower() == '.csv' and os.path.isfile(os.path.join(directory, entry)):
                stems.append(stem)
        return stems

    pred_directory_path = os.path.join(target_var, pred_directory)
    pred_combi_directory_path = os.path.join(target_var, pred_combi_directory)

    for step_name in os.listdir(pred_directory_path):
        pred_models_path = os.path.join(pred_directory_path, step_name)
        pred_combis_path = os.path.join(pred_combi_directory_path, step_name)
        if not os.path.isdir(pred_models_path):
            continue

        # Sorted listings make the result reproducible: the list order decides which
        # of the two mirrored cells is tested, and the one-sided test is not symmetric.
        model_list = _csv_stems(pred_models_path)
        combi_list = _csv_stems(pred_combis_path)
        model_combi_list = model_list + combi_list

        diebold_mariano_dmstat_df = pd.DataFrame(index=model_combi_list, columns=model_combi_list)
        diebold_mariano_pvalue_df = pd.DataFrame(index=model_combi_list, columns=model_combi_list)

        for row, model_1 in enumerate(model_combi_list):
            for col, model_2 in enumerate(model_combi_list):
                if col <= row:
                    # Diagonal, or a pair already tested in mirrored position.
                    dm_stat, pvalue = 0, 0
                else:
                    model_1_val_crps, model_2_val_crps = prepare_diebold_mariano(
                        pred_models_path, pred_combis_path, target_var,
                        model_1, model_2, model_list, combi_list
                    )
                    # Filter both DataFrames to keep only common epiweeks
                    common_epiweeks = model_1_val_crps.index.intersection(model_2_val_crps.index)
                    model_1_val_crps = model_1_val_crps.loc[common_epiweeks]
                    model_2_val_crps = model_2_val_crps.loc[common_epiweeks]

                    # Pass flat 1-D loss arrays so the test works on scalars rather
                    # than on one-element arrays.
                    dm_stat, pvalue = dm_test(
                        model_1_val_crps.to_numpy(dtype=float).ravel(),
                        model_2_val_crps.to_numpy(dtype=float).ravel(),
                        one_sided=True,
                    )
                    pvalue = evaluate_pvalue(pvalue)
                diebold_mariano_dmstat_df.at[model_1, model_2] = dm_stat
                diebold_mariano_pvalue_df.at[model_1, model_2] = pvalue

        dmstat_path = os.path.join(target_var, 'dmstat')
        os.makedirs(dmstat_path, exist_ok=True)
        diebold_mariano_dmstat_df.to_csv(os.path.join(dmstat_path, f'{step_name}.csv'))

        pvalue_path = os.path.join(target_var, 'pvalue')
        os.makedirs(pvalue_path, exist_ok=True)
        diebold_mariano_pvalue_df.to_csv(os.path.join(pvalue_path, f'{step_name}.csv'))



In [43]:
def full_generate_diebold_mariano(target_variables_file, pred_directory, pred_combi_directory):
    """
    Run `generate_diebold_mariano` in parallel for every target variable listed in a file.

    Parameters
    ----------
    target_variables_file:
        Path to a text file with one target variable name per line.

    pred_directory, pred_combi_directory:
        Passed through unchanged to `generate_diebold_mariano`.
    """
    target_variables = []
    with open(target_variables_file, 'r') as file:
        for line in file:
            # Strip only the line terminator. Slicing off the last character
            # truncated the final name when the file had no trailing newline.
            target_variable = line.rstrip('\r\n')
            # Skip blank lines so they do not become an empty target name.
            if target_variable.strip():
                target_variables.append(target_variable)
    print(target_variables)

    # One task per target variable, using all available workers.
    Parallel(n_jobs=-1, verbose=51)(delayed(generate_diebold_mariano)(target_var,
                                                                    pred_directory,
                                                                    pred_combi_directory) for target_var in target_variables)
    
# Run the evaluation for every target variable listed in 'target_variables_new.txt',
# reading models from the 'pred' sub-directory and combinations from 'combi_samples'.
full_generate_diebold_mariano('target_variables_new.txt', 'pred','combi_samples')

['Cardiovascular disease', 'Chronic respiratory disease', 'Factors influencing health status and contact with health services', 'Digestive disease', 'Endocrine disorders', 'Malignant neoplasms', 'Diabetes mellitus', 'Genitourinary disorders', 'Musculoskeletal disease', 'Infectious and Parasitic Diseases', 'Neurological and sense disorders', 'Oral Diseases', 'Other neoplasms', 'Respiratory Infection', 'Skin diseases']
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:  6.0min remaining: 38.9min
[Parallel(n_jobs=-1)]: Done   3 out of  15 | elapsed:  6.0min remaining: 24.1min
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  6.0min remaining: 16.6min
[Parallel(n_jobs=-1)]: Done   5 out of  15 | elapsed:  6.0min remaining: 12.1min
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:  6.1min remaining:  9.1min
[Parallel(n_jobs=-1)]: Done   7 out

### Test

In [25]:
# Spot check: load the naive model's prediction file for one step,
# convert the 'epiweek' column with create_epiweek_fromstr and index the frame by it.
sb = pd.read_csv('Cardiovascular disease/pred/L8_S1/naive.csv', parse_dates = [0], dayfirst = True)
sb['epiweek'] = sb['epiweek'].apply(create_epiweek_fromstr)
sb = sb.set_index('epiweek') 
# Display the frame for visual inspection.
sb

  sb = pd.read_csv('Cardiovascular disease/pred/L8_S1/naive.csv', parse_dates = [0], dayfirst = True)


Unnamed: 0_level_0,Cardiovascular disease,0,1,2,3,4,5,6,7,8,...,990,991,992,993,994,995,996,997,998,999
epiweek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201604,885.0,903.0,919.0,952.0,942.0,835.0,892.0,1024.0,923.0,956.0,...,889.0,872.0,922.0,974.0,917.0,903.0,880.0,932.0,921.0,923.0
201605,906.0,938.0,985.0,934.0,847.0,1014.0,908.0,967.0,896.0,891.0,...,978.0,926.0,967.0,927.0,928.0,986.0,966.0,916.0,919.0,999.0
201606,861.0,864.0,927.0,890.0,937.0,910.0,1015.0,854.0,906.0,889.0,...,884.0,961.0,899.0,849.0,899.0,858.0,862.0,905.0,919.0,886.0
201607,948.0,843.0,941.0,894.0,974.0,847.0,897.0,879.0,879.0,938.0,...,904.0,900.0,820.0,872.0,884.0,812.0,923.0,976.0,952.0,841.0
201608,1042.0,829.0,742.0,827.0,839.0,840.0,885.0,870.0,936.0,867.0,...,898.0,800.0,865.0,865.0,833.0,838.0,808.0,841.0,846.0,837.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201848,955.0,979.0,867.0,878.0,991.0,862.0,774.0,877.0,939.0,934.0,...,852.0,1006.0,947.0,846.0,833.0,758.0,917.0,805.0,848.0,986.0
201849,775.0,1017.0,1071.0,1043.0,974.0,952.0,1066.0,873.0,974.0,975.0,...,834.0,1005.0,914.0,1097.0,879.0,1165.0,1038.0,1079.0,1069.0,979.0
201850,862.0,868.0,962.0,939.0,951.0,1019.0,1023.0,1025.0,1003.0,925.0,...,1130.0,979.0,1047.0,931.0,1014.0,755.0,1063.0,980.0,1000.0,976.0
201851,921.0,761.0,710.0,805.0,804.0,732.0,753.0,793.0,709.0,743.0,...,774.0,737.0,690.0,728.0,754.0,753.0,716.0,716.0,799.0,774.0


In [27]:
# Spot check: load one combination's prediction file for the same step,
# convert the 'epiweek' column with create_epiweek_fromstr and index the frame by it.
sb1 = pd.read_csv('Cardiovascular disease/combi_samples/L8_S1/linearpool_P2.csv', parse_dates = [0], dayfirst = True)
sb1['epiweek'] = sb1['epiweek'].apply(create_epiweek_fromstr)
sb1 = sb1.set_index('epiweek') 
# Display the frame for visual inspection.
sb1

  sb1 = pd.read_csv('Cardiovascular disease/combi_samples/L8_S1/linearpool_P2.csv', parse_dates = [0], dayfirst = True)


Unnamed: 0_level_0,Cardiovascular disease,0,1,2,3,4,5,6,7,8,...,990,991,992,993,994,995,996,997,998,999
epiweek,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201605,906.0,890.271201,900.947473,894.023787,889.977870,882.609434,896.599311,883.904768,916.382316,946.797419,...,803.770398,870.451214,857.823094,887.398006,902.278990,836.001037,887.744299,928.189052,857.667572,898.611807
201606,861.0,901.377585,907.438450,903.326476,901.246522,896.558819,904.843723,897.202119,917.963113,930.533524,...,826.438806,892.122666,886.622573,899.919509,908.282625,874.681180,900.203079,924.172829,886.611108,905.988479
201607,948.0,899.338799,907.025535,903.324500,898.978762,889.089429,904.751690,890.260092,926.085273,949.430884,...,806.948637,880.249616,868.016737,895.744180,908.890520,843.378783,896.575733,937.202846,867.867582,904.963017
201608,1042.0,910.928606,921.750750,914.070472,910.721813,904.370696,916.672929,905.021221,947.508602,972.837222,...,793.788321,898.013501,896.109754,908.754913,923.396838,875.884720,909.151599,961.931216,896.104321,918.781204
201609,924.0,953.272857,970.944862,958.608055,952.914056,938.507069,962.883869,940.199556,1002.045052,1026.166158,...,870.205418,924.437756,911.073945,948.771076,973.960990,893.803539,949.709916,1015.199119,910.943437,966.052664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201848,955.0,914.619286,924.589886,917.973502,914.409129,906.405064,920.386714,907.475699,940.693819,959.916279,...,823.408326,898.674075,888.157970,912.046163,925.837232,857.027997,912.572484,950.981964,888.051429,922.345164
201849,775.0,913.863061,923.443397,916.697447,913.647732,906.641347,919.052833,907.468324,943.080409,976.638073,...,823.567063,899.586543,890.492391,911.534932,925.022486,870.646902,912.001978,956.405431,890.422311,920.891599
201850,862.0,869.023188,883.542235,873.772319,868.570380,856.971135,877.591402,858.471902,901.887549,930.578856,...,799.485085,844.164247,833.064628,865.235705,885.339593,829.755153,865.833402,914.425175,832.930322,880.291229
201851,921.0,900.681332,912.789845,904.267881,900.381209,892.599363,907.388151,893.456391,936.424141,960.262612,...,825.257572,887.787964,881.447615,897.994220,914.679381,870.664477,898.427221,948.676188,881.381697,909.675229
