In [None]:
import os

from math import isnan
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm
from datetime import timedelta
from typing import Union, Callable

In [None]:
# Locations of the raw challenge inputs, relative to this notebook
data_dir = "../data/raw_in/"
file_name = "Risques 2/data_set_challenge.csv"
mapping_name = "Risques 2/final_mapping_candidat.csv"

dataset_path = os.path.join(data_dir, file_name)
mapping_path = os.path.join(data_dir, mapping_name)

# The first CSV column of the data set is used as the row index
df = pd.read_csv(dataset_path, index_col=0)
mapping = pd.read_csv(mapping_path)

In [None]:
# Preview the first rows of the loaded data frame
df.head()

In [None]:
# Size of the data set, then the number of series per type in the mapping
n_entries, n_series = df.shape
print(f"The data frame consists of {n_entries} entries over {n_series} series")
print("-" * 55)
print(mapping.Type.value_counts())

In [None]:
#### ADD TO PREPROCESSING FUNCTION


# --- step 1: identify the different types of series
# Each column is paired by position with a row of the mapping and prefixed with its type.
# The rename mutates `df` in place, so it is guarded: re-running this cell must not
# stack the prefix a second time (e.g. "BOND_BOND_0").
column_types = [str(typ) for typ in mapping.Type]
already_prefixed = all(
    str(col).startswith(typ + "_") for col, typ in zip(df.columns, column_types)
)
if not already_prefixed:
    df.columns = [typ + "_" + str(col) for col, typ in zip(df.columns, column_types)]
# --- Share prices & Stock indexes
df_stock = df.loc[:, df.columns.str.contains("STOCK")]
# --- OAT bond (obligation assimilables au trésor) prices
df_bond = df.loc[:, df.columns.str.contains("BOND")]
# --- Exchange rate
df_xchang = df.loc[:, df.columns.str.contains("FXRATE")]
# --- Interests rate
df_yieldc = df.loc[:, df.columns.str.contains("YIELD_CURVE")]
# --- Commodity price
df_commod = df.loc[:, df.columns.str.contains("COMMO_CURVE_FO")]
# --- CDS Spread
df_cdsb = df.loc[:, df.columns.str.contains("CDS_BASKET_ZC")]

In [None]:
# Preview the first rows of the stock subset
df_stock.head()

In [None]:
def plot_data(data_frame: pd.DataFrame, category: str, show_corr: bool) -> None:
    """
    Plot a missing-value map, and optionally a correlation heatmap, for one category of series
    :param data_frame: data frame whose column names embed the series category
    :param category: pattern selecting the columns to plot (matched with `str.contains`)
    :param show_corr: when True, also draw the correlation heatmap next to the missing-value map
    :return: None, the figure is drawn on the current matplotlib backend
    """
    data_frame = data_frame.loc[:, data_frame.columns.str.contains(category)]

    if show_corr:
        _, (ax_missing, ax_corr) = plt.subplots(1, 2, figsize=(30, 15))
    else:
        # A single panel is enough here: no empty placeholder axis is created
        _, ax_missing = plt.subplots(1, 1, figsize=(15, 15))
        ax_corr = None

    # --- display missing values
    sns.heatmap(data_frame.isnull(),
                cbar=False,
                ax=ax_missing)
    ax_missing.set_title(f"Missing values ({category})")

    if ax_corr is not None:
        # --- display correlation heatmap (upper triangle and diagonal are masked)
        corr = data_frame.corr()
        sns.heatmap(corr,
                    mask=np.triu(np.ones_like(corr, dtype=bool)),
                    ax=ax_corr,
                    center=0)
        ax_corr.set_title(f"Correlations ({category})")

# Missing-value map and correlation heatmap for the bond series
plot_data(data_frame=df, category='BOND', show_corr=True)

# Imputing with correlations

Suppose the value of time series **i** is missing at timestamp **t**:

- We look at the **growth rate** between time **t-1** and **t** for all available time series.

- To **weight** the relevance of the growth rate observed on each time series, we use its overall correlation with the original time series **i**.

- We then infer the growth rate of series **i** at time **t**: <br> <br>
$ImputedGrowthRate_i(t) = \frac{\sum_{j \neq i} GrowthRate_j(t) \cdot Corr(i,j)}{\sum_{j \neq i} Corr(i,j)}$
<br><br> where $Corr(i,j)$ is the correlation of **returns** (not absolute values) of series i and j across the whole period, and the sums run over the series **j** whose growth rate is observed at time **t**.


- And thus the value of series **i** at time **t** (the growth rate being a relative change): <br> <br>
$TimeSeries_i(t) = \left(1 + ImputedGrowthRate_i(t)\right) \cdot TimeSeries_i(t-1)$

Finally, instead of using all correlations raw, we can pre-process them before using them as weights (cf. below)

### Computing growth rate and correlation matrices from original dataframe

In [None]:
def get_growth_rates_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the relative change of every series from one row to the next
    :param df: the original data frame, with missing values
    :return: a data frame of growth rates with the same shape as df; entries are nan where the
             rate is infinite or cannot be computed
    """
    infinite_values = [np.inf, -np.inf]
    # fill_method=None: gaps are not forward-filled before the change is computed
    return df.pct_change(fill_method=None).replace(infinite_values, np.nan)

In [None]:
def get_correlations_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the correlation matrix of the series' growth rates
    :param df: the original data frame, with missing values
    :return: correlations between the period-to-period pct changes of the columns (not their
             absolute values), with 0 wherever pandas could not compute a correlation
    """
    returns_corr = get_growth_rates_df(df).corr()
    # Undefined correlations (ex. constant time series) are treated as "no correlation"
    return returns_corr.fillna(0)

### Various 'activation' functions for correlations

In [None]:
def truncate_below_threshold(num: float, threshold: float = 0.5) -> float:
    """
    Keep a correlation only when its magnitude reaches the threshold
    :param num: a correlation float
    :param threshold: magnitude under which correlations are disregarded
    :return: num unchanged when abs(num) is not below threshold, otherwise zero
    """
    return 0 if abs(num) < threshold else num

In [None]:
# Machine epsilon of the float type; the activation functions below add it to `1 - corr`
# so that the denominator is never exactly zero when a correlation equals 1
PRECISION_EPSILON = np.finfo(float).eps

In [None]:
def inverse_distance_to_1(num: float) -> float:
    """
    Weight a correlation by the inverse of its distance to 1
    :param num: a correlation float
    :return: 1 / (1 - num), regularised by PRECISION_EPSILON; grows very large as num approaches 1
    """
    distance_to_one = 1 - num + PRECISION_EPSILON
    return 1 / distance_to_one

In [None]:
def mixed_truncate_inverse_distance(num: float, threshold: float = 0.9) -> float:
    """
    Discard weak correlations, and weight the remaining ones by num / (1 - num)
    :param num: a correlation float
    :param threshold: magnitude under which correlations are disregarded
    :return: zero when abs(num) is below threshold; otherwise num / (1 - num), regularised by
             PRECISION_EPSILON. The weight grows very large as num approaches 1 and keeps the
             sign of num, so it is negative for correlations at or below -threshold.
    """
    is_too_weak = abs(num) < threshold
    if is_too_weak:
        return 0

    distance_to_one = 1 - num + PRECISION_EPSILON
    return num / distance_to_one

In [None]:
def plot_f(f: Callable[[float], float]) -> None:
    """
    Plot an activation function over correlations ranging from 0.01 to 0.99
    :param f: a correlation pre processing function, called with a single float
    :return: None, the curve is shown with matplotlib
    """
    points = 10000  # Number of intervals on the x axis
    xmin, xmax = 0.01, 0.99
    step = (xmax - xmin) / points
    # The grid starts at xmin: without this offset it would span [0, xmax - xmin]
    xlist = [xmin + step * i for i in range(points + 1)]
    ylist = [f(x) for x in xlist]
    plt.plot(xlist, ylist)
    plt.xlabel("correlation")
    plt.ylabel("weight")
    plt.show()

In [None]:
# Hard threshold activation, with its default threshold
plot_f(truncate_below_threshold)

In [None]:
# Inverse-distance-to-1 activation
plot_f(inverse_distance_to_1)

In [None]:
# Thresholded inverse-distance activation, with its default threshold
plot_f(mixed_truncate_inverse_distance)

In [None]:
def activate_correlations(corr_df: pd.DataFrame, activation_f: Callable[[float], float]) -> pd.DataFrame:
    """
    Turn a correlation matrix into a matrix of weights by applying activation_f to every entry
    :param corr_df: a correlation matrix
    :param activation_f: a correlation pre processing function, called once per value
    :return: a data frame of weights with the same index and columns as corr_df
    """
    def _activate_column(corr_series: pd.Series) -> pd.Series:
        # Element-wise application on one column of the matrix
        return corr_series.apply(activation_f)

    return corr_df.apply(_activate_column, axis=0)

### Two highly correlated series (>0.99)

In [None]:
# Overlay two bond series on the same axes to compare them visually
name0, name1 = "BOND_0", "BOND_1"
for series_name in (name0, name1):
    plt.plot(df[series_name], label=series_name)
plt.xticks([])  # hide the date tick labels
plt.legend()
plt.show()

### Computing missing values out of correlations and growth rates:

In [None]:
def prev_day(timestamp_str: str, date_format: str = '%d/%m/%Y') -> str:
    """
    Return the calendar day before a given date, as a string
    :param timestamp_str: a date string written in date_format
    :param date_format: strftime/strptime format used both to parse the input and to format the output
    :return: the date string of the previous day, in the same format
    """
    one_day = timedelta(days=1)
    day_before = pd.to_datetime(timestamp_str, format=date_format) - one_day
    return day_before.strftime(date_format)

In [None]:
def get_previous_available_day(timestamp_str: str, available_days: pd.Index) -> str:
    """
    Walk back one calendar day at a time until a date string present in the index is found
    :param timestamp_str: the current date string (in the default format of prev_day)
    :param available_days: the index of all date-strings with observations
    :return: the closest earlier date string found in available_days

    NOTE(review): the loop only ends once a match is found, so callers must guarantee that an
    earlier day, formatted exactly as prev_day formats it, exists in available_days.
    """
    candidate = timestamp_str
    while True:
        candidate = prev_day(candidate)
        if candidate in available_days:
            return candidate

In [None]:
def get_imputed_value(time_series: pd.Series, timestamp_str: str, imputed_growth_rate: Union[float, None]) -> float:
    """
    Impute the value at a given date from the previous day found in the index and a growth rate
    :param time_series: the series being filled, indexed by date strings
    :param timestamp_str: the date string at which a value must be imputed
    :param imputed_growth_rate: the relative change to apply, or None when it could not be inferred
    :return: previous value * (1 + imputed_growth_rate), or nan when no growth rate is given
    """
    if imputed_growth_rate is None:
        return np.nan

    last_known_day = get_previous_available_day(timestamp_str, time_series.index)
    # The growth rate is a relative change, hence the (1 + rate) factor
    return time_series[last_known_day] * (1 + imputed_growth_rate)

In [None]:
def get_imputed_growth_rate(corr_series_activated: pd.Series, growth_rates: pd.Series) -> Union[float, None]:
    """
    Returns an average of observed growth rates, weighted by pre processed correlations of the corresponding series
    :param corr_series_activated: weights of every series, in the same order as growth_rates
    :param growth_rates: growth rates of every series at one date, nan where unavailable
    :return: the weighted average over the series with an available growth rate, or None when
             their weights sum to zero (including when no growth rate is available)
    """
    # Work on plain arrays: both inputs are paired by position, and indexing a label-indexed
    # Series with integer positions through [] is deprecated in recent pandas versions
    rates = np.asarray(growth_rates, dtype=float)
    weights = np.asarray(corr_series_activated, dtype=float)

    # Keep only the series whose growth rate is available
    is_observed = ~np.isnan(rates)
    observed_weights = weights[is_observed]

    # A zero total weight makes the weighted average undefined
    if np.nansum(observed_weights) == 0:
        return None

    return np.average(rates[is_observed], weights=observed_weights)

In [None]:
def fill_in_time_series(df_time_series: pd.DataFrame, series_id: str,
                        corr_df_activated: pd.DataFrame, growth_rates_df: pd.DataFrame) -> pd.Series:
    """
    Impute missing value on a time series, using correlations and growth rates
    :param df_time_series: the original data frame, with missing values
    :param series_id: name of the column to fill in
    :param corr_df_activated: matrix of weights, one column per series
    :param growth_rates_df: observed growth rates, indexed like df_time_series
    :return: a float series with the index of the original column; values before the first
             observation stay nan, as do values for which no growth rate could be inferred
    """
    original_series = df_time_series[series_id]
    # Explicit float dtype: the default dtype of a Series built without data is not relied upon
    time_series_filled = pd.Series(index=original_series.index, name=series_id, dtype=float)
    # The weights of this series do not depend on the date: look them up once
    corr_series_activated = corr_df_activated[series_id]

    not_started = True
    for timestamp_str, value in original_series.items():

        if not isnan(value):
            # Observed value: copy it as is
            not_started = False
            time_series_filled[timestamp_str] = value
            continue

        if not_started:
            # data not yet available for this time series
            continue

        growth_rate_series = growth_rates_df.loc[timestamp_str]
        imputed_growth_rate = get_imputed_growth_rate(corr_series_activated, growth_rate_series)
        # Imputation relies on the values already written to the filled series
        imputed_value = get_imputed_value(time_series_filled, timestamp_str, imputed_growth_rate)
        time_series_filled[timestamp_str] = imputed_value

    return time_series_filled

In [None]:
def fill_in_data_frame(df_time_series: pd.DataFrame, corr_df_activated: pd.DataFrame,
                       growth_rates_df: pd.DataFrame) -> pd.DataFrame:
    """
    Impute missing values on every column of the data frame, then interpolate what is left
    :param df_time_series: the original data frame, with missing values
    :param corr_df_activated: matrix of weights, one column per series
    :param growth_rates_df: observed growth rates of the original data frame
    :return: the filled data frame, after correlation-based imputation and forward linear interpolation
    """

    tqdm.pandas()  # registers progress_apply, which shows a progress bar

    def _fill_column(time_series: pd.Series) -> pd.Series:
        # Only the column name is used: the filling routine reads the original frame itself
        return fill_in_time_series(df_time_series, time_series.name, corr_df_activated, growth_rates_df)

    df_time_series_filled = df_time_series.progress_apply(_fill_column, axis=0)

    # Simply interpolate when correlation method fails to impute (ex. no series sufficiently correlated)
    df_final = df_time_series_filled.interpolate(method='linear', limit=None, limit_direction='forward')

    # Metrics: nans remaining in the final frame are considered accepted and are excluded from the counts
    num_accepted_nans = df_final.isna().sum().sum()
    num_values = df_final.count().sum()
    num_nans_originally = df_time_series.isna().sum().sum()
    num_nans_after_corr_method = df_time_series_filled.isna().sum().sum()

    pct_originally_missing = (num_nans_originally - num_accepted_nans) / num_values
    pct_missing_after_corr_method = (num_nans_after_corr_method - num_accepted_nans) / num_values

    print(f"Originally missing: {round(pct_originally_missing*100,1)}%")
    print(f"Still missing after correlation imputation method: {round(pct_missing_after_corr_method*100,1)}%")
    print("Missing after final interpolation: 0.0%")

    return df_final

In [None]:
def impute_df_with_correlations(df: pd.DataFrame, corr_activation_f: Callable[[float], float]) -> pd.DataFrame:
    """
    Run the whole imputation: growth rates, correlation weights, then filling of the data frame
    :param df: the original data frame, with missing values
    :param corr_activation_f: a correlation pre processing function to compute weights
    :return: the original data frame with imputed missing values, using correlations and growth rates
    """
    print("Performing preliminary calculations...")
    # Weights are the correlations of growth rates, passed through the activation function
    corr_df_activated = activate_correlations(get_correlations_df(df), corr_activation_f)
    growth_rates_df = get_growth_rates_df(df)

    print("Imputing missing values...")
    return fill_in_data_frame(df, corr_df_activated, growth_rates_df)

In [None]:
# Impute the whole data frame, using the thresholded inverse-distance activation for the weights
df_imputed = impute_df_with_correlations(df, mixed_truncate_inverse_distance)

In [None]:
# Same view as before imputation, this time on the imputed bond series
# (the keyword is `data_frame`, as declared by plot_data)
plot_data(data_frame=df_imputed, category='BOND', show_corr=True)