In [53]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import ks_2samp, wasserstein_distance

def hellinger(p, q):
    """Calculate the Hellinger distance between two probability distributions."""
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q))**2)) / np.sqrt(2)

# Load the original dataset
original_path = '/Users/guillermodominguez/Documents/Imperial College London/Dissertation/rPSMF/ExperimentImpute/data/sp500_closing_prices_masked_20percent.csv'
df_original = pd.read_csv(original_path, header=None)

# Load the imputed dataset from PSMF
psmf_path = '/Users/guillermodominguez/Documents/Imperial College London/Dissertation/rPSMF/Y_imputed_20_PSMF.csv'
df_imputed = pd.read_csv(psmf_path, header=None)

# Only replace missing values in the original dataset with values from the imputed dataset
df_final = df_original.copy()
mask = df_original.isna()  # Create a boolean mask for where NaN values are in the original dataframe
df_final[mask] = df_imputed[mask]  # Replace only the NaN values

# Impute missing values with the mean of each row
df_mean_imputed = df_original.copy()

# Apply row-wise mean imputation
for index, row in df_mean_imputed.iterrows():
    # Calculate the mean of the current row excluding NaNs
    row_mean = row.mean()
    # Fill NaNs in this row with the calculated mean
    df_mean_imputed.iloc[index] = row.fillna(row_mean)

# Find rows that contain at least one NaN
rows_with_nan = mask.any(axis=1)

# Get indices of these rows
rows_with_nan_indices = rows_with_nan[rows_with_nan].index.tolist()

# Initialize lists to store results for plotting
ks_stats_psmf = []
ks_pvalues_psmf = []
ks_stats_mean = []
ks_pvalues_mean = []
wasserstein_psmf_list = []
wasserstein_mean_list = []
hellinger_psmf_list = []
hellinger_mean_list = []

for idx, stock_row in enumerate(rows_with_nan_indices, start=1):
    
    # Drop NA values for the original for a fair comparison
    original_clean = df_original.iloc[stock_row].dropna()

    # Normalize the data for Hellinger distance calculation
    original_hist, _ = np.histogram(original_clean, bins=50, density=True)
    psmf_hist, _ = np.histogram(df_final.iloc[stock_row].dropna(), bins=50, density=True)
    mean_hist, _ = np.histogram(df_mean_imputed.iloc[stock_row].dropna(), bins=50, density=True)
    
    # Calculate the Hellinger distance
    hellinger_psmf = hellinger(original_hist, psmf_hist)
    hellinger_mean = hellinger(original_hist, mean_hist)
    
    # Append Hellinger distances to lists
    hellinger_psmf_list.append(hellinger_psmf)
    hellinger_mean_list.append(hellinger_mean)

    # KS Test between original and PSMF imputed
    ks_stat_psmf, ks_pvalue_psmf = ks_2samp(original_clean, df_final.iloc[stock_row].dropna())
    ks_stats_psmf.append(ks_stat_psmf)
    ks_pvalues_psmf.append(ks_pvalue_psmf)

    # KS Test between original and Mean imputed
    ks_stat_mean, ks_pvalue_mean = ks_2samp(original_clean, df_mean_imputed.iloc[stock_row].dropna())
    ks_stats_mean.append(ks_stat_mean)
    ks_pvalues_mean.append(ks_pvalue_mean)

    # Wasserstein Distance between original and PSMF imputed
    wasserstein_psmf = wasserstein_distance(original_clean, df_final.iloc[stock_row].dropna())
    wasserstein_psmf_list.append(wasserstein_psmf)

    # Wasserstein Distance between original and Mean imputed
    wasserstein_mean = wasserstein_distance(original_clean, df_mean_imputed.iloc[stock_row].dropna())
    wasserstein_mean_list.append(wasserstein_mean)
    
    # Plotting individual distributions
    plt.figure(figsize=(18, 6))

    # Original data distribution
    plt.subplot(1, 3, 1)
    df_original.iloc[stock_row].dropna().hist(bins=50, alpha=0.7, label='Original')
    plt.title(f'Original Data Distribution for Stock {idx}')
    plt.legend()

    # Distribution after PSMF imputation
    plt.subplot(1, 3, 2)
    df_final.iloc[stock_row].hist(bins=50, alpha=0.7, color='orange', label='PSMF Imputed')
    plt.title(f'Imputed Data Distribution for Stock {idx}')
    plt.legend()

    # Distribution after mean imputation
    plt.subplot(1, 3, 3)
    df_mean_imputed.iloc[stock_row].hist(bins=50, alpha=0.7, color='green', label='Mean Imputed')
    plt.title(f'Mean Imputed Data Distribution for Stock {idx}')
    plt.legend()

    #plt.show()

    # Print KS, Wasserstein, and Hellinger results for this stock
    print(f"Stock {idx} - KS Statistic PSMF: {ks_stat_psmf}, P-value PSMF: {ks_pvalue_psmf}")
    print(f"Stock {idx} - KS Statistic Mean: {ks_stat_mean}, P-value Mean: {ks_pvalue_mean}")
    print(f"Stock {idx} - Wasserstein Distance PSMF: {wasserstein_psmf}")
    print(f"Stock {idx} - Wasserstein Distance Mean: {wasserstein_mean}")
    print(f"Stock {idx} - Hellinger Distance PSMF: {hellinger_psmf}")
    print(f"Stock {idx} - Hellinger Distance Mean: {hellinger_mean}")
    print("\n" + "="*80 + "\n")

# Calculate means for the Wasserstein distances, KS statistics, and Hellinger distances
mean_wasserstein_psmf = sum(wasserstein_psmf_list) / len(wasserstein_psmf_list)
mean_wasserstein_mean = sum(wasserstein_mean_list) / len(wasserstein_mean_list)

mean_ks_stat_psmf = sum(ks_stats_psmf) / len(ks_stats_psmf)
mean_ks_stat_mean = sum(ks_stats_mean) / len(ks_stats_mean)

mean_hellinger_psmf = sum(hellinger_psmf_list) / len(hellinger_psmf_list)
mean_hellinger_mean = sum(hellinger_mean_list) / len(hellinger_mean_list)

# Plot Wasserstein distances with ordinal stock number and mean lines
plt.figure(figsize=(12, 6))
plt.plot(range(1, len(wasserstein_psmf_list) + 1), wasserstein_psmf_list, label='PSMF Imputed', alpha=0.7, color='orange')
plt.plot(range(1, len(wasserstein_mean_list) + 1), wasserstein_mean_list, label='Mean Imputed', alpha=0.7, color='green')
plt.axhline(mean_wasserstein_psmf, color='orange', linestyle='--', label='Mean PSMF')
plt.axhline(mean_wasserstein_mean, color='green', linestyle='--', label='Mean Imputed')
plt.xlabel('Stock Number (Ordinal)')
plt.ylabel('Wasserstein Distance')
plt.title('Wasserstein Distance Comparison for Mean and PSMF Imputed Data')
plt.legend()
#plt.show()

# Plot KS statistics with ordinal stock number and mean lines
plt.figure(figsize=(12, 6))
plt.subplot(2, 1, 1)
plt.plot(range(1, len(ks_stats_psmf) + 1), ks_stats_psmf, label='KS Statistic PSMF', alpha=0.7, color='orange')
plt.plot(range(1, len(ks_stats_mean) + 1), ks_stats_mean, label='KS Statistic Mean', alpha=0.7, color='green')
plt.axhline(mean_ks_stat_psmf, color='orange', linestyle='--', label='Mean KS Statistic PSMF')
plt.axhline(mean_ks_stat_mean, color='green', linestyle='--', label='Mean KS Statistic Mean')
plt.xlabel('Stock Number (Ordinal)')
plt.ylabel('KS Statistic')
plt.title('KS Statistic Comparison for Mean and PSMF Imputed Data')
plt.legend()

# Plot P-values for KS tests with ordinal stock number and mean lines
plt.subplot(2, 1, 2)
plt.plot(range(1, len(ks_pvalues_psmf) + 1), ks_pvalues_psmf, label='P-value PSMF', alpha=0.7, color='orange')
plt.plot(range(1, len(ks_pvalues_mean) + 1), ks_pvalues_mean, label='P-value Mean', alpha=0.7, color='green')
plt.axhline(0.05, color='red', linestyle='--', label='Significance Level (0.05)')
plt.xlabel('Stock Number (Ordinal)')
plt.ylabel('P-value')
plt.title('P-value Comparison for Mean and PSMF Imputed Data')
plt.legend()
plt.tight_layout()
#plt.show()

# Plot Hellinger distances with ordinal stock number and mean lines
plt.figure(figsize=(12, 6))
plt.plot(range(1, len(hellinger_psmf_list) + 1), hellinger_psmf_list, label='PSMF Imputed', alpha=0.7, color='orange')
plt.plot(range(1, len(hellinger_mean_list) + 1), hellinger_mean_list, label='Mean Imputed', alpha=0.7, color='green')
plt.axhline(mean_hellinger_psmf, color='orange', linestyle='--', label='Mean PSMF')
plt.axhline(mean_hellinger_mean, color='green', linestyle='--', label='Mean Imputed')
plt.xlabel('Stock Number (Ordinal)')
plt.ylabel('Hellinger Distance')
plt.title('Hellinger Distance Comparison for Mean and PSMF Imputed Data')
plt.legend()
#plt.show()



Stock 1 - KS Statistic PSMF: 0.0600745587305929, P-value PSMF: 0.029960961197342838
Stock 1 - KS Statistic Mean: 0.09745086089709437, P-value Mean: 3.301516589056364e-05
Stock 1 - Wasserstein Distance PSMF: 1.233841302963075
Stock 1 - Wasserstein Distance Mean: 1.4609529017179708
Stock 1 - Hellinger Distance PSMF: 0.06402207206804095
Stock 1 - Hellinger Distance Mean: 0.19776024478209545


Stock 2 - KS Statistic PSMF: 0.036623252524625366, P-value PSMF: 0.41352540563544743
Stock 2 - KS Statistic Mean: 0.08522811164761478, P-value Mean: 0.00045332320275818535
Stock 2 - Wasserstein Distance PSMF: 1.5752919523206765
Stock 2 - Wasserstein Distance Mean: 4.16556576636635
Stock 2 - Hellinger Distance PSMF: 0.0722992262965
Stock 2 - Hellinger Distance Mean: 0.11465919429416821


Stock 3 - KS Statistic PSMF: 0.03539091070707811, P-value PSMF: 0.4594592167259047
Stock 3 - KS Statistic Mean: 0.09614083116451082, P-value Mean: 4.8362856288817036e-05
Stock 3 - Wasserstein Distance PSMF: 1.50610607

  plt.figure(figsize=(18, 6))


Stock 23 - KS Statistic PSMF: 0.025322146564155556, P-value PSMF: 0.848213411194974
Stock 23 - KS Statistic Mean: 0.1227318645557809, P-value Mean: 6.658913481621955e-08
Stock 23 - Wasserstein Distance PSMF: 0.25954323111480293
Stock 23 - Wasserstein Distance Mean: 0.8637778102291649
Stock 23 - Hellinger Distance PSMF: 0.12552574117633197
Stock 23 - Hellinger Distance Mean: 0.2142903902589818


Stock 24 - KS Statistic PSMF: 0.023242853571704327, P-value PSMF: 0.9085959898950895
Stock 24 - KS Statistic Mean: 0.09350959902943275, P-value Mean: 8.663527323136035e-05
Stock 24 - Wasserstein Distance PSMF: 0.5484508831705693
Stock 24 - Wasserstein Distance Mean: 1.4848874620464134
Stock 24 - Hellinger Distance PSMF: 0.06668521405993724
Stock 24 - Hellinger Distance Mean: 0.2472992311723849


Stock 25 - KS Statistic PSMF: 0.05337442794729394, P-value PSMF: 0.07931555619050552
Stock 25 - KS Statistic Mean: 0.1267048734653862, P-value Mean: 2.824631801796528e-08
Stock 25 - Wasserstein Distance 

<matplotlib.legend.Legend at 0x3e4d000d0>

Error in callback <function _draw_all_if_interactive at 0x144526340> (for post_execute):


KeyboardInterrupt: 

In [66]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp, wasserstein_distance

def hellinger(p, q):
    """Calculate the Hellinger distance between two probability distributions."""
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q))**2)) / np.sqrt(2)

# Load the original dataset
original_path = '/Users/guillermodominguez/Documents/Imperial College London/Dissertation/rPSMF/ExperimentImpute/data/sp500_closing_prices_masked_0percent.csv'
df_original = pd.read_csv(original_path, header=None)

# Load the imputed dataset from PSMF
psmf_path = '/Users/guillermodominguez/Documents/Imperial College London/Dissertation/rPSMF/Y_imputed_0_PSMF.csv'
df_imputed = pd.read_csv(psmf_path, header=None)

# Only replace missing values in the original dataset with values from the imputed dataset
df_final = df_original.copy()
mask = df_original.isna()  # Create a boolean mask for where NaN values are in the original dataframe
df_final[mask] = df_imputed[mask]  # Replace only the NaN values

# Impute missing values with the mean of each row
df_mean_imputed = df_original.copy()

# Apply row-wise mean imputation
for index, row in df_mean_imputed.iterrows():
    # Calculate the mean of the current row excluding NaNs
    row_mean = row.mean()
    # Fill NaNs in this row with the calculated mean
    df_mean_imputed.iloc[index] = row.fillna(row_mean)

# Find rows that contain at least one NaN
rows_with_nan = mask.any(axis=1)

# Get indices of these rows
rows_with_nan_indices = rows_with_nan[rows_with_nan].index.tolist()

# Initialize lists to store results
ks_stats_psmf = []
ks_pvalues_psmf = []
ks_stats_mean = []
ks_pvalues_mean = []
wasserstein_psmf_list = []
wasserstein_mean_list = []
hellinger_psmf_list = []
hellinger_mean_list = []

for idx, stock_row in enumerate(rows_with_nan_indices, start=1):
    
    # Drop NA values for the original for a fair comparison
    original_clean = df_original.iloc[stock_row].dropna()

    # Normalize the data for Hellinger distance calculation
    original_hist, _ = np.histogram(original_clean, bins=50, density=True)
    psmf_hist, _ = np.histogram(df_final.iloc[stock_row].dropna(), bins=50, density=True)
    mean_hist, _ = np.histogram(df_mean_imputed.iloc[stock_row].dropna(), bins=50, density=True)
    
    # Calculate the Hellinger distance
    hellinger_psmf = hellinger(original_hist, psmf_hist)
    hellinger_mean = hellinger(original_hist, mean_hist)
    
    # Append Hellinger distances to lists
    hellinger_psmf_list.append(hellinger_psmf)
    hellinger_mean_list.append(hellinger_mean)

    # KS Test between original and PSMF imputed
    ks_stat_psmf, ks_pvalue_psmf = ks_2samp(original_clean, df_final.iloc[stock_row].dropna())
    ks_stats_psmf.append(ks_stat_psmf)
    ks_pvalues_psmf.append(ks_pvalue_psmf)

    # KS Test between original and Mean imputed
    ks_stat_mean, ks_pvalue_mean = ks_2samp(original_clean, df_mean_imputed.iloc[stock_row].dropna())
    ks_stats_mean.append(ks_stat_mean)
    ks_pvalues_mean.append(ks_pvalue_mean)

    # Wasserstein Distance between original and PSMF imputed
    wasserstein_psmf = wasserstein_distance(original_clean, df_final.iloc[stock_row].dropna())
    wasserstein_psmf_list.append(wasserstein_psmf)

    # Wasserstein Distance between original and Mean imputed
    wasserstein_mean = wasserstein_distance(original_clean, df_mean_imputed.iloc[stock_row].dropna())
    wasserstein_mean_list.append(wasserstein_mean)
    
    # Print KS, Wasserstein, and Hellinger results for this stock
    print(f"Stock {idx} - KS Statistic PSMF: {ks_stat_psmf}, P-value PSMF: {ks_pvalue_psmf}")
    print(f"Stock {idx} - KS Statistic Mean: {ks_stat_mean}, P-value Mean: {ks_pvalue_mean}")
    print(f"Stock {idx} - Wasserstein Distance PSMF: {wasserstein_psmf}")
    print(f"Stock {idx} - Wasserstein Distance Mean: {wasserstein_mean}")
    print(f"Stock {idx} - Hellinger Distance PSMF: {hellinger_psmf}")
    print(f"Stock {idx} - Hellinger Distance Mean: {hellinger_mean}")


Stock 1 - KS Statistic PSMF: 0.1556791104050834, P-value PSMF: 1.1575335509452793e-12
Stock 1 - KS Statistic Mean: 0.08186700161471461, P-value Mean: 0.0008168855377024989
Stock 1 - Wasserstein Distance PSMF: 5.524835336394652
Stock 1 - Wasserstein Distance Mean: 1.413084662895336
Stock 1 - Hellinger Distance PSMF: 0.342301304078365
Stock 1 - Hellinger Distance Mean: 0.1761931487459466
Stock 2 - KS Statistic PSMF: 0.9650516282764099, P-value PSMF: 2.547142761140573e-57
Stock 2 - KS Statistic Mean: 0.5044588056899415, P-value Mean: 1.5901663749380204e-10
Stock 2 - Wasserstein Distance PSMF: 52.21255989350877
Stock 2 - Wasserstein Distance Mean: 3.7290970713343268
Stock 2 - Hellinger Distance PSMF: 1.2173987570534823
Stock 2 - Hellinger Distance Mean: 1.770414754750674
Stock 3 - KS Statistic PSMF: 0.39537983858873454, P-value PSMF: 1.4906780115091178e-18
Stock 3 - KS Statistic Mean: 0.5020967912151391, P-value Mean: 2.652383191858639e-30
Stock 3 - Wasserstein Distance PSMF: 9.47380230560

In [67]:
# Calculate overall means for the Wasserstein distances, KS statistics, and Hellinger distances
mean_wasserstein_psmf = np.mean(wasserstein_psmf_list)
mean_wasserstein_mean = np.mean(wasserstein_mean_list)

mean_ks_stat_psmf = np.mean(ks_stats_psmf)
mean_ks_stat_mean = np.mean(ks_stats_mean)

mean_hellinger_psmf = np.mean(hellinger_psmf_list)
mean_hellinger_mean = np.mean(hellinger_mean_list)

# Print the overall means
print(f"Overall Mean Wasserstein Distance for PSMF Imputation: {mean_wasserstein_psmf}")
print(f"Overall Mean Wasserstein Distance for Mean Imputation: {mean_wasserstein_mean}")

print(f"Overall Mean KS Statistic for PSMF Imputation: {mean_ks_stat_psmf}")
print(f"Overall Mean KS Statistic for Mean Imputation: {mean_ks_stat_mean}")

print(f"Overall Mean Hellinger Distance for PSMF Imputation: {mean_hellinger_psmf}")
print(f"Overall Mean Hellinger Distance for Mean Imputation: {mean_hellinger_mean}")


Overall Mean Wasserstein Distance for PSMF Imputation: 15.088536305789454
Overall Mean Wasserstein Distance for Mean Imputation: 3.0838310815502257
Overall Mean KS Statistic for PSMF Imputation: 0.2941856826220132
Overall Mean KS Statistic for Mean Imputation: 0.21420625522291223
Overall Mean Hellinger Distance for PSMF Imputation: 0.47220652988965356
Overall Mean Hellinger Distance for Mean Imputation: 0.5091002034776553


In [45]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr

# Paths to the CSV files
original_path = '/Users/guillermodominguez/Documents/Imperial College London/Dissertation/rPSMF/ExperimentImpute/data/cleaned_sp500_stocks_matrix.csv'
psmf_path = '/Users/guillermodominguez/Documents/Imperial College London/Dissertation/rPSMF/Y_imputed_CLEANED_PSMF.csv'
masked_path = '/Users/guillermodominguez/Documents/Imperial College London/Dissertation/rPSMF/ExperimentImpute/data/cleaned_sp500_stocks_matrix_masked_20percent.csv'

# Load the datasets
df_original = pd.read_csv(original_path, header=None)
df_psmf = pd.read_csv(psmf_path, header=None)
df_masked = pd.read_csv(masked_path, header=None)

# Convert all data to numeric, setting errors='coerce' to handle non-numeric values
df_original = df_original.apply(pd.to_numeric, errors='coerce')
df_psmf = df_psmf.apply(pd.to_numeric, errors='coerce')
df_masked = df_masked.apply(pd.to_numeric, errors='coerce')

# Create a mask of the missing values
mask = df_masked.isna()

# Impute missing values using mean imputation
df_mean_imputed = df_masked.copy()
for index, row in df_mean_imputed.iterrows():
    row_mean = row.mean()
    df_mean_imputed.iloc[index] = row.fillna(row_mean)

# Extract only the values that were originally missing
original_missing_values = df_original[mask].values
psmf_imputed_values = df_psmf[mask].values
mean_imputed_values = df_mean_imputed[mask].values

# Ensure no NaN values are present in the extracted values
original_missing_values = original_missing_values[~np.isnan(original_missing_values)]
psmf_imputed_values = psmf_imputed_values[~np.isnan(psmf_imputed_values)]
mean_imputed_values = mean_imputed_values[~np.isnan(mean_imputed_values)]

# Calculate metrics for PSMF Imputation
rmse_psmf = np.sqrt(mean_squared_error(original_missing_values, psmf_imputed_values))
mae_psmf = mean_absolute_error(original_missing_values, psmf_imputed_values)
mape_psmf = np.mean(np.abs((original_missing_values - psmf_imputed_values) / original_missing_values)) * 100
pearson_corr_psmf, _ = pearsonr(original_missing_values, psmf_imputed_values)
spearman_corr_psmf, _ = spearmanr(original_missing_values, psmf_imputed_values)

# Calculate metrics for Mean Imputation
rmse_mean = np.sqrt(mean_squared_error(original_missing_values, mean_imputed_values))
mae_mean = mean_absolute_error(original_missing_values, mean_imputed_values)
mape_mean = np.mean(np.abs((original_missing_values - mean_imputed_values) / original_missing_values)) * 100
pearson_corr_mean, _ = pearsonr(original_missing_values, mean_imputed_values)
spearman_corr_mean, _ = spearmanr(original_missing_values, mean_imputed_values)

# Print results
print("Metrics for PSMF Imputation:")
print(f"RMSE: {rmse_psmf}")
print(f"MAE: {mae_psmf}")
print(f"MAPE: {mape_psmf}%")
print(f"Pearson Correlation Coefficient: {pearson_corr_psmf}")
print(f"Spearman Correlation Coefficient: {spearman_corr_psmf}")
print("\nMetrics for Mean Imputation:")
print(f"RMSE: {rmse_mean}")
print(f"MAE: {mae_mean}")
print(f"MAPE: {mape_mean}%")
print(f"Pearson Correlation Coefficient: {pearson_corr_mean}")
print(f"Spearman Correlation Coefficient: {spearman_corr_mean}")


Metrics for PSMF Imputation:
RMSE: 10.204271559122764
MAE: 4.060511238907783
MAPE: 5.076612599382808%
Pearson Correlation Coefficient: 0.995066196064281
Spearman Correlation Coefficient: 0.9938551697528454

Metrics for Mean Imputation:
RMSE: 32.582225845093646
MAE: 15.63017312831459
MAPE: 20.188872270930148%
Pearson Correlation Coefficient: 0.9460345200036698
Spearman Correlation Coefficient: 0.9237285820830474


In [None]:
RMSE, MAE, MAPE, Correlation Coefficient (Pearson and Spearman), Normalized Mutual information