In [None]:
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

from scipy.linalg import svd
from scipy.stats import norm, chi2, binomtest
from IPython.core.display import display, HTML

In [None]:
def save_matrix(mat, path, ages=range(35,65+1), years=range(2015,2022+1)):
    """
    Write a year-by-age matrix to a comma-separated file with labelled axes.

    Args:
        mat: 2-D array-like, one row per entry of ``years`` and one column per
            entry of ``ages``.
        path: Destination of the CSV file.
        ages: Ages used to build the column labels ("Age <age>").
        years: Years used to build the row labels ("Year <year>").
    """
    column_labels = ["Age {}".format(age) for age in ages]
    row_labels = ["Year {}".format(year) for year in years]
    labelled = pd.DataFrame(mat, index=row_labels, columns=column_labels)
    # The row labels are written as the first (unnamed) column.
    labelled.to_csv(path, sep=",", index=True, encoding="utf-8")

def load_matrix(filepath, ages=range(35,65+1), years=range(2015,2022+1)):
    """
    Read a year-by-age CSV and return the requested sub-matrix as an array.

    Row labels may be bare integers or "Year <year>"; column labels may be
    bare integers or "Age <age>". Only the years and ages that are present
    both in the file and in the requested ranges are kept.

    Args:
        filepath: CSV file whose first column holds the row labels.
        ages: Ages to keep (columns).
        years: Years to keep (rows).

    Returns:
        np.ndarray: The selected values, rows = years, columns = ages.
    """
    def _as_int(label, prefix_len):
        # Bare numbers are converted directly; otherwise the textual prefix
        # ("Age " = 4 characters, "Year " = 5 characters) is stripped first.
        if isinstance(label, int) or label.isdigit():
            return int(label)
        return int(label[prefix_len:])

    table = pd.read_csv(filepath, index_col=0)
    table.columns = [_as_int(label, 4) for label in table.columns]
    table.index = [_as_int(label, 5) for label in table.index]

    kept_years = table.index.intersection(years)
    kept_ages = table.columns.intersection(ages)
    return table.loc[kept_years, kept_ages].to_numpy()

def plot_matrix(X, Y, Z, title="Taux de mortalité", save=None):
    """
    Draw a 3-D surface of Z over the grid spanned by X and Y and show it.

    Args:
        X: Values along the first horizontal axis (labelled 'Age').
        Y: Values along the second horizontal axis (labelled 'Année').
        Z: Heights, shaped like the meshgrid of X and Y (len(Y) rows, len(X) columns).
        title: Title displayed above the plot.
        save: Optional output path; when given, the figure is saved there
            before being shown.
    """
    grid_x, grid_y = np.meshgrid(X, Y)

    figure = plt.figure(figsize=(10, 7))
    axes3d = figure.add_subplot(111, projection='3d')

    # Surface plus a colour bar mapping heights to colours.
    surf = axes3d.plot_surface(grid_x, grid_y, Z, cmap='plasma', edgecolor='none')
    figure.colorbar(surf, shrink=0.5, aspect=10, pad=0.1)

    axes3d.set_title(title)
    axis_labels = (
        (axes3d.set_xlabel, 'Age'),
        (axes3d.set_ylabel, 'Année'),
        (axes3d.set_zlabel, 'Taux'),
    )
    for set_label, text in axis_labels:
        set_label(text, labelpad=15)

    axes3d.tick_params(axis='z', labelsize=8, pad=5)

    # Fixed camera position so all saved figures share the same viewpoint.
    axes3d.view_init(elev=40, azim=120)

    if save is not None:
        plt.savefig(save, bbox_inches='tight')

    plt.show()

In [None]:
# Raw multiplicative factors, one per line, written with a decimal comma.
# There is one value for each age in `factor_ages` (12 to 120 inclusive).
# NOTE(review): presumably smoker loading factors, since they are used to
# produce the "*_fumeurs" matrices — confirm the source of the table.
txt_values = """1,846774194
1,846774194
1,846774194
1,846774194
1,846774194
1,846774194
1,846774194
1,846774194
1,846774194
1,84
1,828947368
1,822916667
1,81025641
1,800505051
1,791563275
1,781021898
1,767220903
1,761020882
1,751693002
1,745614035
1,740976645
1,734151329
1,734251969
1,732580038
1,737410072
1,741880342
1,752025932
1,75975039
1,778735632
1,795698925
1,815789474
1,838372093
1,864370291
1,891865079
1,920765027
1,95
1,980988593
2,010366275
2,04137931
2,070294785
2,098721228
2,124769797
2,148883375
2,170930664
2,190175904
2,206466924
2,218898888
2,228881548
2,234663866
2,237151934
2,235841222
2,230349506
2,221727749
2,209078245
2,192754966
2,172913817
2,149539506
2,122773127
2,092740403
2,059668013
2,023727972
1,985104794
1,943956129
1,900475076
1,854905068
1,807555316
1,758536644
1,70818076
1,65670774
1,604342261
1,551339319
1,497985717
1,445357159
1,411676036
1,385715643
1,359381472
1,332762817
1,305947833
1,279020422
1,252054353
1,225128258
1,198348474
1,17177948
1,145514863
1,119614944
1,094175017
1,069262102
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
1,044943367
"""

# Swap the decimal commas for points, then parse every whitespace-separated token.
factors = np.array(list(map(float, txt_values.replace(",", ".").split())))
factor_ages = range(12, 121)

# Exactly one factor per age.
assert factors.size == len(factor_ages)

In [None]:
def factorize_matrix(name, model, factors, factor_ages):
    """
    Scale a model's mortality matrix by age-specific factors, plot and save it.

    Reads ``./matrices/{model}_{name}_all.csv`` (rows labelled "Year <year>",
    columns labelled "Age <age>"), multiplies each age column by the factor
    given for that age, plots the unscaled and scaled surfaces (all common
    ages, and ages 35-65 only) and writes the scaled matrix to
    ``./matrices/{model}_{name}_all_fumeurs.csv``.

    Args:
        name (str): Portfolio identifier used in file names and titles; a
            trailing "F" selects the feminine wording in plot titles.
        model (str): Model identifier used in file names and titles. For
            "BRASS" only the years >= 2022 are kept.
        factors: Multiplicative factors, one per entry of ``factor_ages``.
        factor_ages: Ages the factors refer to, in the same order as ``factors``.
    """
    feminine = name[-1] == "F"
    input_filepath = f"./matrices/{model}_{name}_all.csv"
    output_filepath = f"./matrices/{model}_{name}_all_fumeurs.csv"

    df = pd.read_csv(input_filepath, index_col=0)
    df.columns = [int(col[4:]) for col in df.columns]
    df.index = [int(idx[5:]) for idx in df.index]

    # Look factors up by age instead of relying on position: the factor for a
    # column is then correct even if the CSV columns are not in the same
    # (ascending) order as factor_ages.
    factor_by_age = dict(zip(factor_ages, factors))
    common_ages = df.columns.intersection(factor_ages)
    restricted_ages = [age for age in common_ages if 35 <= age <= 65]
    common_factors = np.array([factor_by_age[age] for age in common_ages])
    restricted_factors = np.array([factor_by_age[age] for age in restricted_ages])

    if model == "BRASS":
        years = [idx for idx in df.index if idx >= 2022]
        matrix = df.loc[years, common_ages].to_numpy()
        restricted_matrix = df.loc[years, restricted_ages].to_numpy()
    else:
        years = df.index
        matrix = df.loc[:, common_ages].to_numpy()
        restricted_matrix = df.loc[:, restricted_ages].to_numpy()

    # Broadcasting multiplies every row (year) by the per-age factors.
    adjusted_matrix = matrix * common_factors
    adjusted_restricted_matrix = restricted_matrix * restricted_factors

    ending = 'uses' if feminine else 'urs'
    non_smoker_title = f"{model}: {name} (non fume{ending})"
    smoker_title = f"{model}: {name} (fume{ending})"

    # Unscaled matrices (non-smokers)
    plot_matrix(df.columns, years, df.loc[years,:].to_numpy(), title=non_smoker_title, save=f"images/{model}_{name}_non_fumeurs_all.png")
    plot_matrix(restricted_ages, years, restricted_matrix, title=non_smoker_title, save=f"images/{model}_{name}_non_fumeurs.png")

    # Scaled matrices (smokers)
    plot_matrix(common_ages, years, adjusted_matrix, title=smoker_title, save=f"images/{model}_{name}_fumeurs_all.png")
    plot_matrix(restricted_ages, years, adjusted_restricted_matrix, title=smoker_title, save=f"images/{model}_{name}_fumeurs.png")

    save_matrix(adjusted_matrix, output_filepath, ages=common_ages, years=years)

# Portfolios to process.
names = ["IRL_F", "IRL_M", "UK_F", "UK_M"]

for name in names:
    # Factor-adjusted matrices and plots for both projection models.
    for model in ("BONGAARTS", "BRASS"):
        factorize_matrix(name, model, factors, factor_ages)

    # Additional surfaces to plot: the raw rates and the HMD projection.
    # "transpose" is set when the CSV has to be flipped before label parsing;
    # "output_restricted_png" requests an extra plot limited to ages 35-65.
    save_dicts = [
        dict(
            input_csv=f"./matrices/BRUT_{name}.csv",
            output_png=f"images/BRUT_{name}.png",
            title=f"BRUTS: {name}",
            transpose=False,
        ),
        dict(
            input_csv=f"./HMD_inputs/matrices/projected_HMD_{name}.csv",
            output_png=f"images/HMD_LC_{name}_all.png",
            output_restricted_png=f"images/HMD_LC_{name}.png",
            title=f"HMD LEE CARTER: {name}",
            transpose=True,
        ),
    ]

    for save_dict in save_dicts:
        df = pd.read_csv(save_dict["input_csv"], index_col=0)
        if save_dict["transpose"]:
            df = df.T

        # Labels are either bare integers or prefixed with "Age " / "Year ".
        df.columns = [
            int(col[4:]) if not (isinstance(col, int) or col.isdigit()) else int(col)
            for col in df.columns
        ]
        df.index = [
            int(idx[5:]) if not (isinstance(idx, int) or idx.isdigit()) else int(idx)
            for idx in df.index
        ]

        plot_matrix(df.columns, df.index, df.to_numpy(), title=save_dict["title"], save=save_dict["output_png"])

        if "output_restricted_png" not in save_dict:
            continue

        restricted_ages = [age for age in df.columns if 35 <= age <= 65]
        plot_matrix(
            restricted_ages,
            df.index,
            df.loc[:, restricted_ages].to_numpy(),
            title=save_dict["title"],
            save=save_dict["output_restricted_png"],
        )

In [None]:
def SMR(mat_model, mat_brut, mat_expo):
    """
    Element-wise ratio of the modelled values to the observed values.

    A small constant (10e-9, i.e. 1e-8) is added to the observed values so
    that zero entries do not cause a division by zero.

    Args:
        mat_model: Modelled matrix (numerator).
        mat_brut: Observed matrix (denominator).
        mat_expo: Accepted for signature compatibility; not used in the computation.

    Returns:
        The element-wise ratio, same shape as the inputs.
    """
    guarded_observed = mat_brut + 10e-9
    return np.divide(mat_model, guarded_observed)

def compute_average_smr(input_mat, model_mat, expo_mat, age_ranges, ages):
    """
    Average the element-wise SMR over several age bands, year by year.

    Args:
        input_mat: Observed matrix (rows = years, columns aligned with ``ages``).
        model_mat: Modelled matrix, same layout.
        expo_mat: Exposure matrix, same layout (forwarded to ``SMR``).
        age_ranges: Iterable of (start_age, end_age) pairs, bounds inclusive.
        ages: Age associated with each matrix column.

    Returns:
        dict: Maps ``str((start_age, end_age))`` to a 1-D array holding, for
        each row, the mean SMR over the columns of that age band.
    """
    averages_by_band = {}

    for lower, upper in age_ranges:
        # Column positions whose age falls inside the (inclusive) band.
        band_columns = [position for position, age in enumerate(ages) if lower <= age <= upper]

        observed_band = input_mat[:, band_columns]
        modelled_band = model_mat[:, band_columns]
        exposure_band = expo_mat[:, band_columns]

        band_ratios = SMR(modelled_band, observed_band, exposure_band)

        # One average per row (year).
        averages_by_band[str((lower, upper))] = np.mean(band_ratios, axis=1)

    return averages_by_band

def chisquared(mat_model, mat_brut, mat_expo, nb_params=0):
    """
    Computes the Chi-squared statistic and the associated p-value between the model and the observed data.

    The statistic is ``sum(expo * (model - brut)**2 / (brut + 10e-9))``; the
    small constant (1e-8) guards against observed values equal to zero.

    Args:
        mat_model (np.ndarray): The modeled matrix.
        mat_brut (np.ndarray): The observed (brut) matrix.
        mat_expo (np.ndarray): The exposure matrix used as weights.
        nb_params (int): Number of fitted model parameters, subtracted from
            the degrees of freedom.

    Returns:
        tuple: (chi_squared, p_value), where p_value is the upper-tail
        probability of a chi-squared law with ``size - nb_params - 1``
        degrees of freedom.

    Raises:
        ValueError: If the three matrices do not share the same shape.
    """
    # Ensure the matrices have the same shape
    if mat_model.shape != mat_brut.shape or mat_brut.shape != mat_expo.shape:
        raise ValueError("The input matrices must have the same shape.")

    # Exposure-weighted squared gap between model and observation. The gap is
    # a difference (not a product), so the statistic is 0 for a perfect fit.
    # NOTE(review): Pearson's statistic conventionally divides by the expected
    # (model) value; the observed value is kept as the divisor here — confirm.
    num = mat_expo * ((mat_model - mat_brut) ** 2)
    denom = mat_brut + 10e-9
    chi_squared = np.sum(num / denom)

    # Degrees of freedom: number of cells minus fitted parameters minus one
    degrees_of_freedom = mat_model.size - nb_params - 1

    # The p-value is the area to the right of the statistic; the survival
    # function gives it directly without the precision loss of 1 - cdf.
    p_value = chi2.sf(chi_squared, degrees_of_freedom)

    return chi_squared, p_value

def MAPE(mat_model, mat_brut):
    """
    Mean Absolute Percentage Error of the model against the observations.

    Cells whose observed value is not strictly positive are ignored, since
    the relative error is undefined there.

    Args:
        mat_model (np.ndarray): The modeled matrix.
        mat_brut (np.ndarray): The observed (brut) matrix.

    Returns:
        float: The MAPE, expressed in percent.
    """
    usable = mat_brut > 0

    observed = mat_brut[usable]
    modelled = mat_model[usable]

    relative_errors = np.abs((observed - modelled) / observed)
    return np.mean(relative_errors) * 100

def R_squared(mat_model, mat_brut):
    """
    Coefficient of determination (R^2) of the model against the observations.

    Computed as ``1 - SS_res / SS_tot`` where SS_res is the sum of squared
    differences between observed and modelled values and SS_tot is the sum of
    squared deviations of the observed values from their overall mean.
    """
    residuals = mat_brut - mat_model
    deviations = mat_brut - np.mean(mat_brut)

    ss_res = np.sum(np.square(residuals))
    ss_tot = np.sum(np.square(deviations))
    return 1 - (ss_res / ss_tot)

def sign_test(mat_model, mat_brut):
    """
    Two-sided sign test on the differences between mat_model and mat_brut.

    Counts the cells where the model is above / below the observation (ties
    are excluded) and uses the continuity-corrected normal approximation
    ``xi = (|n_pos - n_neg| - 1) / sqrt(n)``, with ``n = n_pos + n_neg``.

    The proportions of positive and negative differences are printed as
    "<positive share>,<negative share>".

    Args:
        mat_model (np.ndarray): The modeled matrix.
        mat_brut (np.ndarray): The observed (brut) matrix.

    Returns:
        float: The two-sided p-value, or NaN when every difference is zero
        (the test is then undefined).
    """
    diff = mat_model - mat_brut

    # Count positive and negative signs; zero differences (ties) carry no
    # sign information and are left out of the sample size.
    n_pos = int(np.sum(diff > 0))
    n_neg = int(np.sum(diff < 0))
    n = n_pos + n_neg

    if n == 0:
        return float("nan")

    print(f"{n_pos/n},{n_neg/n}")

    # The continuity correction must not push the statistic below zero
    # (perfectly balanced signs give a p-value of exactly 1).
    xi = max(abs(n_pos - n_neg) - 1, 0) / np.sqrt(n)
    p_value = 2 * norm.sf(xi)

    return p_value

def run_test(mat_model, mat_brut):
    """
    Two-sided runs test on the signs of (mat_model - mat_brut).

    The differences are flattened in row-major order, ties (zero differences)
    are dropped, and the number of runs of identical signs is compared with
    its expectation under randomness using the normal approximation

        mu       = 2 * n_pos * n_neg / n + 1
        variance = 2 * n_pos * n_neg * (2 * n_pos * n_neg - n) / ((n - 1) * n**2)
        xi       = (runs - mu) / sqrt(variance)

    with ``n = n_pos + n_neg``.

    Args:
        mat_model (np.ndarray): The modeled matrix.
        mat_brut (np.ndarray): The observed (brut) matrix.

    Returns:
        float: The two-sided p-value, or NaN when the test is undefined
        (all differences share one sign, or the variance is zero).
    """
    diff = np.ravel(mat_model - mat_brut)
    positive = diff > 0
    negative = diff < 0

    # Count positive and negative signs, excluding zeros. Plain Python ints
    # keep the products below exact whatever the sample size.
    n_pos = int(np.sum(positive))
    n_neg = int(np.sum(negative))
    n = n_pos + n_neg

    # With a single sign present there is always exactly one run.
    if n_pos == 0 or n_neg == 0:
        return float("nan")

    mu = 2 * n_pos * n_neg / n + 1
    variance = 2 * n_pos * n_neg * (2 * n_pos * n_neg - n) / ((n - 1) * n ** 2)
    if variance <= 0:
        return float("nan")

    # Sign sequence without ties; a new run starts at every sign change.
    signs = positive[positive | negative]
    run_count = 1 + int(np.sum(signs[1:] != signs[:-1]))

    # Standardise with the standard deviation (square root of the variance).
    xi = (run_count - mu) / np.sqrt(variance)
    p_value = 2 * norm.sf(abs(xi))

    return p_value


def calculate_metrics(mat_model, mat_brut, mat_expo, age_ranges, ages, nb_params=0):
    """
    Gathers the goodness-of-fit metrics of a model matrix against the observed matrix.

    Args:
        mat_model (np.ndarray): The modeled matrix.
        mat_brut (np.ndarray): The observed (brut) matrix.
        mat_expo (np.ndarray): The exposure matrix.
        age_ranges (list of tuples): Inclusive (start_age, end_age) bands used
            for the per-band SMR averages.
        ages: Age associated with each matrix column.
        nb_params (int): Number of model parameters, forwarded to ``chisquared``.

    Returns:
        dict: A dictionary containing the calculated metrics:
            - 'SMR': dict with the mean element-wise ratio "per_age" (column
              means), "per_year" (row means), "per_age_range" (result of
              ``compute_average_smr``) and "global" (sum of the model matrix
              divided by the sum of the observed matrix).
            - 'Chi-Squared': Chi-squared statistic.
            - 'Chi-Squared (p-value)': Its p-value.
            - 'R^2': Coefficient of determination.
            - 'Sign Statistic (p-value)': p-value returned by ``sign_test``.
            - 'Run Statistic (p-value)': p-value returned by ``run_test``.

    Raises:
        ValueError: If the brut or expo matrix does not have the model's shape.
    """
    for other, label in ((mat_brut, "brut"), (mat_expo, "expo")):
        if mat_model.shape != other.shape:
            raise ValueError(f"The model and {label} matrices must have the same shape.")

    ratios = SMR(mat_model, mat_brut, mat_expo)
    ratio_by_age = np.mean(ratios, axis=0)
    ratio_by_year = np.mean(ratios, axis=1)
    ratio_by_band = compute_average_smr(mat_brut, mat_model, mat_expo, age_ranges, ages)

    chi2_stat, chi2_p_value = chisquared(mat_model, mat_brut, mat_expo, nb_params=nb_params)
    sign_p_value = sign_test(mat_model, mat_brut)
    run_p_value = run_test(mat_model, mat_brut)

    smr_summary = {
        "per_age": list(ratio_by_age),
        "per_year": list(ratio_by_year),
        "per_age_range": ratio_by_band,
        "global": np.sum(mat_model) / np.sum(mat_brut),
    }

    # MAPE and the raw sign statistic are deliberately not reported.
    return {
        'SMR': smr_summary,
        'Chi-Squared': float(chi2_stat),
        'Chi-Squared (p-value)': float(chi2_p_value),
        'R^2': float(R_squared(mat_model, mat_brut)),
        'Sign Statistic (p-value)': float(sign_p_value),
        'Run Statistic (p-value)': float(run_p_value),
    }

def set_y_limits(ax, metric):
    """
    Apply the y-axis range appropriate for the given metric to ``ax``.

    - "R^2" and "Sign Statistic (p-value)": range 0 to 1.
    - "MAPE": range 0 to 100.
    - "SMR": range 0.5 to 1.5, with a dashed reference line at 1.0.
    - anything else: the current limits are re-applied unchanged.
    """
    if metric in ("R^2", "Sign Statistic (p-value)"):
        ax.set_ylim(0, 1)
        return

    if metric in ("MAPE",):
        ax.set_ylim(0, 100)
        return

    if metric in ("SMR",):
        ax.axhline(1.0, linestyle="--", linewidth=0.8)
        ax.set_ylim(0.5, 1.5)
        return

    # Unknown metric: keep whatever limits the axis currently has.
    ax.set_ylim(ax.get_ylim())

In [None]:
# Portfolios, age/year window and age bands used for the model comparison.
names = ["IRL_F", "IRL_M", "UK_F", "UK_M"]
ages = range(35, 66)
age_ranges = [(35, 45), (46, 65)]
years = range(2015, 2023)

all_metrics = {}
for name in names:
    all_metrics[name] = {}

    # Smoothed observed rates are the reference; both models are compared to them.
    input_mat = load_matrix(f"./matrices/BRUT_LISSES_{name}.csv", ages=ages, years=years)
    expo_mat = load_matrix(f"./matrices/BRUT_EXPO_{name}.csv", ages=ages, years=years)
    brass_mat = load_matrix(f"./matrices/BRASS_{name}_all.csv", ages=ages, years=years)
    bongaarts_mat = load_matrix(f"./matrices/BONGAARTS_{name}_all.csv", ages=ages, years=years)
    # Constant matrix of ones; only useful as a sanity-check baseline (currently not evaluated).
    test_mat = np.ones(input_mat.shape)

    for model_name, model_mat, n_params in (("BONGAARTS", bongaarts_mat, 4), ("BRASS", brass_mat, 2)):
        all_metrics[name][model_name] = calculate_metrics(
            model_mat, input_mat, expo_mat, age_ranges, ages, nb_params=n_params
        )

    # One table row per (model, age band), one column per year.
    data = [
        {"Model": model_name, "Tranche d'âges": age_range, **dict(zip(years, smr_values))}
        for model_name, metrics in all_metrics[name].items()
        for age_range, smr_values in metrics["SMR"]["per_age_range"].items()
    ]
    df = pd.DataFrame(data).set_index(["Model", "Tranche d'âges"])

    display(df)

    # Two side-by-side scatter plots: mean SMR per age (left) and per year (right),
    # each with a red reference line at SMR = 1.
    plt.figure(figsize=(12, 6))
    panels = (
        (ages, "per_age", f'SMR par âge {name}', 'Age'),
        (years, "per_year", f'SMR par année {name}', 'Année'),
    )
    for panel_number, (x_values, smr_key, panel_title, x_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, panel_number)
        for model_name, metrics in all_metrics[name].items():
            plt.scatter(x_values, metrics["SMR"][smr_key], label=model_name)
        plt.title(panel_title)
        plt.xlabel(x_label)
        plt.ylabel('SMR')
        plt.axhline(1, color="red")

    plt.tight_layout()

    plt.legend()
    plt.show()


# Flatten the nested metrics into one record per (portfolio, model, metric).
metrics_records = []

for name, models_metrics in all_metrics.items():
    for model, metrics in models_metrics.items():
        # Work on a copy in which the SMR entry is reduced to its global value.
        # Assigning into `metrics` directly would overwrite the per-age and
        # per-year SMR stored in all_metrics and make this code fail when run
        # a second time.
        record = {**metrics, "SMR": metrics["SMR"]["global"]}

        for metric_name, metric_value in record.items():
            metrics_records.append(
                {
                    "metric": metric_name,
                    "value": metric_value,
                    "portfolio": name,
                    "model": model
                }
            )

metrics_df = pd.DataFrame(metrics_records)


# # Set the style for the plot
# sns.set(style="whitegrid")

# # Initialize a grid of plots with an Axes for each walk
# grid = sns.FacetGrid(metrics_df, col="metric", row="portfolio", hue="metric", palette="tab20c", height=3, aspect=1.5, sharey=False)

# # Draw a line plot to connect the points for each model
# grid.map_dataframe(sns.lineplot, x="model", y="value", marker="o")

# # Rotate x labels for better readability
# # grid.set_xticklabels(rotation=45, ha="right")

# # Adjust the layout to ensure everything fits
# plt.tight_layout()

# # Add axis labels and titles
# # grid.set_axis_labels("Model", "Value")
# grid.set_titles(row_template="{row_name}", col_template="{col_name}")

# # Adjust top space to fit the main title
# plt.subplots_adjust(top=0.9)
# grid.fig.suptitle('Model Performance Comparison Across Portfolios', fontsize=16)

# # Ensure x-axis labels are not being cut off
# for ax in grid.axes.flat:
#     for label in ax.get_xticklabels():
#         label.set_rotation(45)
#         label.set_horizontalalignment('right')

# # Loop through each axis in the grid and apply custom y-limits
# for ax, (row_val, col_val) in zip(grid.axes.flat, grid.axes_dict.keys()):
#     metric_name = col_val  # Get the metric name
#     set_y_limits(ax, metric_name)

# plt.show()

# Create a pivot table with (portfolio, metric) as index, one column per model, and the metric value as content
pivot_table = metrics_df.pivot_table(index=["portfolio", "metric"], columns="model", values="value")

# Format every cell with three decimals. Column-wise Series.map is used
# because DataFrame.applymap is deprecated in recent pandas versions.
pivot_table = pivot_table.apply(lambda column: column.map(lambda x: f"{x:.3f}"))

# Convert the pivot table to an HTML table
html_table = pivot_table.to_html()

# Convert the pivot table to a LaTeX table
latex_table = pivot_table.to_latex(escape=False)

# Save the LaTeX table to a file (optional)
with open("./tables/comparaison_modeles.tex", "w", encoding="utf-8") as file:
    file.write(latex_table)

display(HTML(html_table))

In [None]:
# Portfolios, age/year window and age bands used to assess the smoothing step.
names = ["IRL_F", "IRL_M", "UK_F", "UK_M"]
ages = range(35, 66)
years = range(2015, 2023)
age_ranges = [(35, 45), (46, 65)]

all_metrics = {}
for name in names:
    all_metrics[name] = {}

    # Here the raw rates are the reference and the smoothed rates play the role of the model.
    input_mat = load_matrix(f"./matrices/BRUT_{name}.csv", ages=ages, years=years)
    expo_mat = load_matrix(f"./matrices/BRUT_EXPO_{name}.csv", ages=ages, years=years)
    smoothed_mat = load_matrix(f"./matrices/BRUT_LISSES_{name}.csv", ages=ages, years=years)

    all_metrics[name]["smoothing"] = calculate_metrics(
        smoothed_mat, input_mat, expo_mat, age_ranges, ages, nb_params=0
    )
    print(all_metrics)

    # One table row per (model, age band), one column per year.
    data = [
        {"Model": model_name, "Tranche d'âges": age_range, **dict(zip(years, smr_values))}
        for model_name, metrics in all_metrics[name].items()
        for age_range, smr_values in metrics["SMR"]["per_age_range"].items()
    ]
    df = pd.DataFrame(data).set_index(["Model", "Tranche d'âges"])

    display(df)

    # Two side-by-side scatter plots: mean SMR per age (left) and per year (right),
    # each with a red reference line at SMR = 1.
    plt.figure(figsize=(12, 6))
    panels = (
        (ages, "per_age", f'SMR par âge {name}', 'Age'),
        (years, "per_year", f'SMR par année {name}', 'Année'),
    )
    for panel_number, (x_values, smr_key, panel_title, x_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, panel_number)
        for model_name, metrics in all_metrics[name].items():
            plt.scatter(x_values, metrics["SMR"][smr_key], label=model_name)
        plt.title(panel_title)
        plt.xlabel(x_label)
        plt.ylabel('SMR')
        plt.axhline(1, color="red")

    plt.tight_layout()

    plt.legend()
    plt.show()


# Flatten the nested metrics into one record per (portfolio, model, metric).
metrics_records = []

for name, models_metrics in all_metrics.items():
    for model, metrics in models_metrics.items():
        # Work on a copy in which the SMR entry is reduced to its global value.
        # Assigning into `metrics` directly would overwrite the per-age and
        # per-year SMR stored in all_metrics and make this code fail when run
        # a second time.
        record = {**metrics, "SMR": metrics["SMR"]["global"]}

        for metric_name, metric_value in record.items():
            metrics_records.append(
                {
                    "metric": metric_name,
                    "value": metric_value,
                    "portfolio": name,
                    "model": model
                }
            )

metrics_df = pd.DataFrame(metrics_records)

# Create a pivot table with (portfolio, metric) as index, one column per model, and the metric value as content
pivot_table = metrics_df.pivot_table(index=["portfolio", "metric"], columns="model", values="value")

# Format every cell with three decimals. Column-wise Series.map is used
# because DataFrame.applymap is deprecated in recent pandas versions.
pivot_table = pivot_table.apply(lambda column: column.map(lambda x: f"{x:.3f}"))

# Convert the pivot table to an HTML table
html_table = pivot_table.to_html()

# Convert the pivot table to a LaTeX table
latex_table = pivot_table.to_latex(escape=False)

# Save the LaTeX table to a file (optional)
with open("./tables/smoothing_stats.tex", "w", encoding="utf-8") as file:
    file.write(latex_table)

display(HTML(html_table))