In [None]:
import os
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Put the parent directory on the import path (presumably where the `utils`
# package imported by later cells lives -- confirm against the repo layout).
parent_dir = os.path.abspath("../")
sys.path.append(parent_dir)

In [None]:
# --- load data
# Locations of the series table and of the series-type mapping, both given
# relative to `data_dir`.
data_dir = "../data/raw_in/"
file_name = "Risques/dataset_final_scenario_4.csv"
mapping_name = "Risques 2/final_mapping_candidat.csv"

dataset_path = os.path.join(data_dir, file_name)
mapping_path = os.path.join(data_dir, mapping_name)

# The first CSV column of the series table is used as its index; the mapping
# keeps the default positional index.
df = pd.read_csv(dataset_path, index_col=0)
mapping = pd.read_csv(mapping_path)

In [None]:
# Quick overview: table dimensions, then the number of mapping rows per type.
n_entries, n_series = df.shape
print(f"the dataframe consists of {n_entries} entries over {n_series} series")
print("-" * 55)
print(mapping["Type"].value_counts())

In [None]:
#### ADD TO PREPROCESSING FUNCTION
# --- identify the different types of series
# Rename every column to "<type>_<column>". Columns and mapping rows are paired
# purely by position, so the mapping is assumed to list the series in the same
# order as the table's columns -- TODO confirm.
# NOTE(review): re-running this cell prefixes the names a second time.
typed_names = []
for col, typ in zip(df.columns, mapping.Type):
    typed_names.append(f"{typ!s}_{col!s}")
df.columns = typed_names

In [None]:
from utils.utils_preprocessing import *
from utils.utils_visualization import *

# Hand get_evaluation_set the series table without its "Date" index (the index
# is reset into a column and that column is dropped).
df_without_date = df.reset_index().drop(columns="Date")
df_full, df_miss = get_evaluation_set(df_without_date, method="linear")

# Only the BOND category of df_miss is plotted; the commented calls are
# alternative views kept for manual inspection.
# plot_data(dataframe=df_full, category="STOCK", show_corr=False)
plot_data(dataframe=df_miss, category="BOND", show_corr=False)
# plot_data(dataframe=df_pred, category="STOCK", show_corr=False)

In [None]:
# This is the first cell that calls the correlation helpers, so the import must
# be live here for a fresh-kernel "Run All" to succeed (the helpers are
# presumably defined in utils_correlations -- the later comparison cell imports
# that module right before calling them).
from utils.utils_correlations import *

# --- baseline prediction
# Linear interpolation along each column, filling forward only.
df_pred_0 = df_miss.interpolate(method="linear", limit=None, limit_direction="forward")
# --- correlation-based prediction
# The imputation receives df_miss re-indexed with the index of the original
# series table; the result therefore carries that index as well (it is reset
# and the "Date" column dropped again before evaluation).
df_pred_corr = impute_df_with_correlations(
    df_miss.set_index(df.index), mixed_truncate_inverse_distance
)

In [None]:
# --- sanity checks
# --- sanity checks
# Total number of missing cells in the reference frame and in the baseline
# prediction.
n_missing_full = df_full.isna().sum().sum()
n_missing_pred = df_pred_0.isna().sum().sum()
print("df_full:\t", n_missing_full)
print("df_pred:\t", n_missing_pred)

In [None]:
from utils.utils_evaluation import *

# Score both predictions against df_full, passing df_miss as third argument.
# The correlation-based frame first loses its "Date" index (reset to a column,
# then dropped) before being evaluated.
df_pred_corr_without_date = df_pred_corr.reset_index().drop(columns="Date")

results_0 = eval_imputation(df_full, df_pred_0, df_miss)
results_corr = eval_imputation(df_full, df_pred_corr_without_date, df_miss)

In [None]:
# --- correlations evaluation
def _scores_to_frame(scores, method):
    """Turn one set of evaluation scores into a table with one row per series.

    Parameters
    ----------
    scores
        Data accepted by ``pd.DataFrame`` with exactly two rows, labelled
        "nrmse" and "nan" here (presumably a mapping from series name to a
        pair of values -- verify against ``eval_imputation``).
    method : str
        Label written into the "method" column of every row.

    Returns
    -------
    pd.DataFrame
        The transposed scores with two extra columns: "type" (the part of
        each row label before the first underscore) and "method".
    """
    table = pd.DataFrame(scores, index=["nrmse", "nan"]).transpose()
    table["type"] = [parts[0] for parts in table.index.str.split("_")]
    table["method"] = method
    return table


# Only the first element of each evaluation result is tabulated.
df_results_0 = _scores_to_frame(results_0[0], "linear")
df_results_corr = _scores_to_frame(results_corr[0], "correlations")

# Stack both methods on top of each other for the grouped box plot.
df_results = pd.concat([df_results_0, df_results_corr], axis=0)

In [None]:
# Box plot of the "nrmse" column per series type, one box per method.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 15))
sns.boxplot(data=df_results, x="type", y="nrmse", hue="method", ax=ax)
# ax.set(yscale='log')

# The y-axis is limited to [0, 5]; anything above 5 is outside the view.
ax.set_ylim(0, 5)
ax.set_ylabel("NRMSE", fontsize=30)
ax.set_xlabel("Series Type", fontsize=30)

ax.legend(loc="upper left", fontsize=25)

In [None]:
# --- xgboost vs correlation-based vs baseline
from utils.utils_correlations import *

# Evaluation pair stored on disk; this rebinds df_miss / df_full from the
# earlier cells.
df_miss = pd.read_csv("../data/df_miss_07_mean.csv", index_col=0)
df_full = pd.read_csv("../data/df_full_07_mean.csv", index_col=0)

# --- xgboost
# Stored predictions, with remaining gaps filled by forward linear
# interpolation.
df_pred_xgb = pd.read_csv(
    "../data/xgboost_preds_eval_meanlag-0.7.csv", index_col=0
).interpolate(method="linear", limit=None, limit_direction="forward")
# --- correlations
# df_miss is reused instead of re-reading the same CSV: set_index returns a new
# frame, so df_miss itself keeps its own index.
# NOTE(review): df.index comes from the series table loaded at the top of the
# notebook; this assumes both tables have the same number of rows -- confirm.
df_pred_corr = impute_df_with_correlations(
    df_miss.set_index(df.index),
    mixed_truncate_inverse_distance,
)
# --- baseline
# interpolate returns a new frame as well, so df_miss is left untouched.
df_pred_0 = df_miss.interpolate(
    method="linear", limit=None, limit_direction="forward"
)

In [None]:
# --- sanity checks
# --- sanity checks
# Total number of missing cells in the reference frame and in each prediction.
n_missing_full = df_full.isna().sum().sum()
n_missing_xgb = df_pred_xgb.isna().sum().sum()
n_missing_corr = df_pred_corr.isna().sum().sum()
n_missing_baseline = df_pred_0.isna().sum().sum()

print("df_full:\t\t", n_missing_full)
print("df_pred (xgb):\t\t", n_missing_xgb)
print("df_pred (correlations):\t", n_missing_corr)
print("df_pred (baseline):\t", n_missing_baseline)

In [None]:
from utils.utils_evaluation import *
# Score the three predictions against df_full, passing df_miss as third
# argument. The correlation-based frame first loses its "Date" index (reset to
# a column, then dropped).
df_pred_corr_without_date = df_pred_corr.reset_index().drop(columns="Date")

results_xgb = eval_imputation(df_full, df_pred_xgb, df_miss)
results_corr = eval_imputation(df_full, df_pred_corr_without_date, df_miss)
results_baseline = eval_imputation(df_full, df_pred_0, df_miss)

In [None]:
# Build one long table with the scores of all three methods.
# The per-method frames are collected in a list and concatenated once, instead
# of growing a DataFrame with pd.concat on every iteration (which re-copies the
# accumulated rows each time and concatenates onto an empty seed frame).
method_frames = []
for res, method in zip(
    [results_xgb, results_corr, results_baseline],
    ["xgboost", "correlations", "baseline"],
):
    # Only the first element of each evaluation result is tabulated; after the
    # transpose there is one row per key with the columns "nrmse" and "nan".
    df_temp = pd.DataFrame(res[0], index=["nrmse", "nan"]).transpose()
    # The series type is the part of the row label before the first underscore.
    df_temp["type"] = [s[0] for s in df_temp.index.str.split("_")]
    df_temp["method"] = method

    method_frames.append(df_temp)

df_res = pd.concat(method_frames, axis=0)

In [None]:
# Box plot of the "nrmse" column per series type, one box per method.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 15))
sns.boxplot(data=df_res, x="type", y="nrmse", hue="method", ax=ax)
# ax.set(yscale='log')

# The y-axis is limited to [0, 5]; anything above 5 is outside the view.
ax.set_ylim(0, 5)
ax.set_xlabel("Series Type", fontsize=30)
# Re-apply the existing tick labels only to enlarge their font.
current_tick_labels = ax.get_xmajorticklabels()
ax.set_xticklabels(current_tick_labels, fontsize=30)
ax.set_ylabel("NRMSE", fontsize=30)

ax.legend(loc="upper left", fontsize=25)