In [1]:
## 04 (1)Evaluating individual models using metrics and (2) calcualte most recent errors for combinatnions weights

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
import itertools
import statsmodels.api as sm
import os
from joblib import Parallel, delayed
import properscoring as ps
from scipy.stats import norm
from scipy.stats import gaussian_kde

In [19]:
from epiweeks import Week, Year
from datetime import date
def create_epiweek(date):
    return Week.fromdate(date)
def create_epiweekplot(epiweek):
    epiweek = str(epiweek)
    return F'Y{epiweek[:4]}W{epiweek[4:]}'
def filename_to_epiweek(filename):
    return Week.fromstring(F'{filename[:4]}W{filename[4:6]}')
def create_epiweek_fromstr(str):
    return Week.fromstring(str)
def create_epiweek_fromint(int):
    return Week.fromstring(str(int))

In [11]:
import os
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
import properscoring as ps

def outofsamples_crps(target_var, pred_directory, crps_directory, comparison_operator):
    model_names_order = ['naive', 'historymean', 'ar_pure', 'ar_env', 'ridge', 'lasso', 'alasso', 'sgl',
                 'elasticnet', 'purefactor', 'knn', 'xgboost']
    # Paths setup
    pred_directory_path = os.path.join(target_var, pred_directory)
    crps_directory_path = os.path.join(target_var, crps_directory)
    
    if not os.path.exists(crps_directory_path):
        os.makedirs(crps_directory_path)
    
    for step_name in os.listdir(pred_directory_path):
        pred_models_path = os.path.join(pred_directory_path, step_name)
        if os.path.isdir(pred_models_path):
            full_crps_df = pd.DataFrame()
            
            for model_name in os.listdir(pred_models_path):
                pred_file = os.path.join(pred_models_path, model_name)
                if os.path.isfile(pred_file):
                    y_pred = pd.read_csv(pred_file, parse_dates=[0], dayfirst=True) # Ill-disease L8_S9 lasso.csv has some unseen format isuue
                    model_crps_df = pd.DataFrame()

                    for filename in y_pred['epiweek']:
                        # Apply the comparison operator
                        if comparison_operator == '<=':
                            y_pred_new = y_pred[y_pred['epiweek'] <= filename]
                        elif comparison_operator == '==':
                            y_pred_new = y_pred[y_pred['epiweek'] == filename]
                        else:
                            raise ValueError("Invalid comparison_operator: must be '<=' or '=='")
                        
                        y_pred_outofsamples = pd.DataFrame(y_pred_new.iloc[:, 1:], dtype='float64')
                        
                        # Calculate CRPS for this epiweek
                        crps_values = []
                        for i in range(len(y_pred_outofsamples)):
                            crps = ps.crps_ensemble(
                                y_pred_outofsamples.iloc[i, 0], y_pred_outofsamples.iloc[i, 1:]
                            )
                            crps_values.append(crps)
                        
                        mean_crps = np.mean(crps_values)
                        model_crps_df.at[filename, model_name[:-4]] = mean_crps
                    
                    full_crps_df = pd.concat([full_crps_df, model_crps_df], axis=1)
            full_crps_df = full_crps_df[model_names_order]
            full_crps_df.sort_index(inplace=True)
            full_crps_df.to_csv(os.path.join(crps_directory_path, f'{step_name}.csv'))

def generate_full_crps_P1(target_var, pred_directory, crps_directory_P1):
    model_names_order = ['naive', 'historymean', 'ar_pure', 'ar_env', 'ridge', 'lasso', 'alasso', 'sgl',
                 'elasticnet', 'purefactor', 'knn', 'xgboost']
    # Paths setup
    pred_directory_path = os.path.join(target_var, pred_directory)
    crps_directory_path = os.path.join(target_var, crps_directory_P1)
    
    if not os.path.exists(crps_directory_path):
        os.makedirs(crps_directory_path)
    
    for step_name in os.listdir(pred_directory_path):
        pred_models_path = os.path.join(pred_directory_path, step_name)
        if os.path.isdir(pred_models_path):
            full_crps_df = pd.DataFrame()
            for model_name in os.listdir(pred_models_path):
                pred_file = os.path.join(pred_models_path, model_name)
                if os.path.isfile(pred_file):
                    y_pred = pd.read_csv(pred_file, parse_dates=[0], dayfirst=True)
                    index = y_pred['epiweek'].unique()
                    
                    full_crps_df = pd.DataFrame(1, index=index, columns=model_names_order)
                    full_crps_df.sort_index(inplace=True)
                    
                    full_crps_df.to_csv(os.path.join(crps_directory_path, f'{step_name}.csv'))
                    break  # We only need to do this once per step_name


In [12]:
def run_crps(target_variables_file, pred_directory, crps_directory_P3, crps_directory_P2, crps_directory_P1):
    target_variables = []
    with open(target_variables_file, 'r') as file:
        for line in file:
            target_variable = line.strip()
            target_variables.append(target_variable)
    
    print(target_variables)
    
    Parallel(n_jobs=-1, verbose=51)(
        delayed(outofsamples_crps)(target_var, pred_directory, crps_directory_P3, '<=') 
        for target_var in target_variables
    )
    
    Parallel(n_jobs=-1, verbose=51)(
        delayed(outofsamples_crps)(target_var, pred_directory, crps_directory_P2, '==') 
        for target_var in target_variables
    )
    
    Parallel(n_jobs=-1, verbose=51)(
        delayed(generate_full_crps_P1)(target_var, pred_directory, crps_directory_P1) 
        for target_var in target_variables
    )

# Execute the function
run_crps('target_variables_new.txt', 'pred', 'full_crps_P3', 'full_crps_P2', 'full_crps_P1')

['Cardiovascular disease', 'Chronic respiratory disease', 'Factors influencing health status and contact with health services', 'Digestive disease', 'Endocrine disorders', 'Malignant neoplasms', 'Diabetes mellitus', 'Genitourinary disorders', 'Musculoskeletal disease', 'Infectious and Parasitic Diseases', 'Neurological and sense disorders', 'Oral Diseases', 'Other neoplasms', 'Respiratory Infection', 'Skin diseases']
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:  2.6min remaining: 16.9min
[Parallel(n_jobs=-1)]: Done   3 out of  15 | elapsed:  2.6min remaining: 10.4min
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  2.9min remaining:  7.9min
[Parallel(n_jobs=-1)]: Done   5 out of  15 | elapsed:  2.9min remaining:  5.8min
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:  2.9min remaining:  4.3min
[Parallel(n_jobs=-1)]: Done   7 out

## calculate crps/log score for submodels

In [13]:
def crps(y_val, y_pred, model, target_var):
    np.random.seed(0)
    crps_df = pd.DataFrame()
    
    for epiweek in y_val.index:
        
        crps_df.at[epiweek, model] = ps.crps_ensemble(y_val.loc[epiweek, target_var], 
                                                      np.array(y_pred.loc[epiweek], dtype='float64'))
    
    return crps_df

In [14]:
# def dss(y_val, y_pred, model, target_var):
#     dss_df = pd.DataFrame()

#     for epiweek in y_val.index:
#         mean = np.mean(y_pred.loc[epiweek])

#         # Calculate the variance (sample variance)
#         variance = np.var(y_pred.loc[epiweek], ddof=1)
#         variance = np.maximum(variance, 1e-6)
#         # Calculate DSS for the current epiweek and model
#         dss = ((y_val.loc[epiweek, target_var] - mean)**2 / variance) + np.log(variance)
#         dss_df.at[epiweek, model] = dss

#     return dss_df

In [15]:
def log(y_val, y_pred, model, target_var):
    log_df = pd.DataFrame()
    
    for epiweek in y_val.index:
        samples = np.array(y_pred.loc[epiweek], dtype='float64')
        kde = gaussian_kde(samples)
        prob_density = kde(y_val.loc[epiweek, target_var])
        prob_density = max(prob_density, 1e-9)  # To avoid log(0)

        log_score = -np.log(float(prob_density))
        log_df.at[epiweek, model] = log_score        
    
    return log_df

In [16]:
def disease_crps(target_var, pred_directory, density_forecast_directory):
    model_names_order = ['naive', 'historymean', 'ar_pure', 'ar_env', 'ridge', 'lasso', 'alasso', 'sgl',
                 'elasticnet', 'purefactor', 'knn', 'xgboost']
    pred_directory_path = os.path.join(target_var, pred_directory)
    density_forecast_directory_path = os.path.join(target_var, density_forecast_directory)
    if not os.path.exists(density_forecast_directory_path):
        os.makedirs(density_forecast_directory_path)
    
    for step_name in os.listdir(pred_directory_path):
        pred_models_path = os.path.join(pred_directory_path,step_name)
        
        if os.path.isdir(pred_models_path):
            model_list = []
            crps_density_forecast_df = pd.DataFrame()
            log_density_forecast_df = pd.DataFrame()
            for model_name in os.listdir(pred_models_path): # 'model_name' here includes the '.csv'
                pred_file = os.path.join(pred_models_path, model_name)
                model = model_name[0:-4]
                
                if os.path.isfile(pred_file):
                    model_list.append(model) # to store the models' names
                    y_pred = pd.read_csv(pred_file, parse_dates = [0], dayfirst = True)  
                    y_pred['epiweek'] = y_pred['epiweek'].apply(create_epiweek_fromstr)
                    y_pred = y_pred.set_index('epiweek')

                    crps_col = crps(y_pred[[target_var]].copy(), y_pred.iloc[:,1:].copy(), model, target_var)
                    crps_density_forecast_df = pd.concat([crps_density_forecast_df, crps_col], axis=1)
                    log_col = log(y_pred[[target_var]].copy(), y_pred.iloc[:,1:].copy(), model, target_var)
                    log_density_forecast_df = pd.concat([log_density_forecast_df, log_col], axis=1)


            crps_density_forecast_df.columns = model_list
            log_density_forecast_df.columns = model_list
            
            crps_density_forecast_df = crps_density_forecast_df[model_names_order]
            log_density_forecast_df = log_density_forecast_df[model_names_order]

            density_forecast_output = pd.DataFrame()
            for col in crps_density_forecast_df.columns:
                density_forecast_output.at[col, 'crps_DENSITY_FORECAST'] = crps_density_forecast_df[col].mean()
                density_forecast_output.at[col, 'log_DENSITY_FORECAST'] = log_density_forecast_df[col].mean()
            density_forecast_output.to_csv(os.path.join(density_forecast_directory_path, F'{step_name}.csv'))
            
#disease_crps('Cardiovascular disease', 'pred', 'variance', 'density_forecast')

In [17]:
## This function calculates the density forecast based on the output prediction forecast and calculated variance

def run_full_crps(target_variables_file, pred_directory, density_forecast_directory):
    target_variables = []
    with open(target_variables_file, 'r') as file:
        for line in file:
            # Remove linebreak which is the last character of the string
            target_variable = line[:-1]
            # Add item to the list
            target_variables.append(target_variable)
    print(target_variables)
    Parallel(n_jobs=-1, verbose=51)(delayed(disease_crps)(target_var, pred_directory, density_forecast_directory) for target_var in target_variables)
    
run_full_crps('target_variables_new.txt', 'pred', 'density_forecast_metrics')

['Cardiovascular disease', 'Chronic respiratory disease', 'Factors influencing health status and contact with health services', 'Digestive disease', 'Endocrine disorders', 'Malignant neoplasms', 'Diabetes mellitus', 'Genitourinary disorders', 'Musculoskeletal disease', 'Infectious and Parasitic Diseases', 'Neurological and sense disorders', 'Oral Diseases', 'Other neoplasms', 'Respiratory Infection', 'Skin diseases']
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:   13.4s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   3 out of  15 | elapsed:   13.5s remaining:   54.1s
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:   16.5s remaining:   45.4s
[Parallel(n_jobs=-1)]: Done   5 out of  15 | elapsed:   16.6s remaining:   33.2s
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:   16.6s remaining:   24.9s
[Parallel(n_jobs=-1)]: Done   7 out