In [2]:
from scipy.stats import ttest_rel, ttest_ind, wilcoxon

In [3]:
import numpy as np
# Sample data - RMSE scores for original and replica models
# Each array holds eight values. Per-element results derived from these arrays
# are later tabulated with `major_oxides` as the index (rendered there as SiO2,
# TiO2, Al2O3, FeOT, MgO, CaO, Na2O, K2O), so the entries presumably follow
# that oxide order -- TODO confirm against the data source.
# PLS (labelled "PLS1-SM" in some of the later printouts)
pls_original = np.array([4.33, 0.94, 2.85, 2.01, 1.06, 2.65, 0.62, 0.72])
pls_replica = np.array([5.81, 0.47, 1.94, 4.35, 1.17, 1.43, 0.66, 0.72])

# ICA
ica_original = np.array([8.31, 1.44, 4.77, 5.17, 4.08, 3.07, 2.29, 0.98])
ica_replica = np.array([10.68, 0.63, 5.55, 8.30, 2.90, 3.52, 1.72, 1.37])

# MOC
moc_original = np.array([5.30, 1.03, 3.47, 2.31, 2.21, 2.72, 0.62, 0.82])
moc_replica = np.array([7.29, 0.49, 2.39, 5.21, 1.67, 1.81, 1.10, 1.09])

In [4]:
def perform_student_t_test(original, replica):
    """Run a paired (dependent-samples) t-test on two score sequences.

    Delegates to ``scipy.stats.ttest_rel`` with its default settings, so the
    two inputs are treated as matched observations of equal length.

    Args:
        original: Scores of the original model.
        replica: Scores of the replica model, aligned element-wise with
            ``original``.

    Returns:
        tuple: ``(t_statistic, p_value)``.
    """
    paired_result = ttest_rel(original, replica)
    return paired_result.statistic, paired_result.pvalue


# Paired t-test of original vs. replica scores, one printed line per model.
for model_label, original_scores, replica_scores in (
    ("PLS", pls_original, pls_replica),
    ("ICA", ica_original, ica_replica),
    ("MOC", moc_original, moc_replica),
):
    student_result = perform_student_t_test(original_scores, replica_scores)
    print(f"{model_label} Student's t-test: {student_result}")

PLS Student's t-test: (-0.4056067802087878, 0.6971426466917101)
ICA Student's t-test: (-1.0613757495947185, 0.32376000700705115)
MOC Student's t-test: (-0.6329536381030276, 0.5468768019604969)


In [5]:
def perform_welchs_test(original, replica):
    """Run Welch's unequal-variance t-test on two independent score sequences.

    Delegates to ``scipy.stats.ttest_ind`` with ``equal_var=False``.

    Args:
        original: Scores of the original model.
        replica: Scores of the replica model.

    Returns:
        tuple: ``(t_statistic, p_value)``.
    """
    welch_result = ttest_ind(original, replica, equal_var=False)
    return welch_result.statistic, welch_result.pvalue


# Welch's t-test of original vs. replica scores, one printed line per model.
for model_label, original_scores, replica_scores in (
    ("PLS1-SM", pls_original, pls_replica),
    ("ICA", ica_original, ica_replica),
    ("MOC", moc_original, moc_replica),
):
    welch_result = perform_welchs_test(original_scores, replica_scores)
    print(f"{model_label} Welch's: {welch_result}")

PLS1-SM Welch's: (-0.2056477264884844, 0.8404483397001741)
ICA Welch's: (-0.3754140305999682, 0.7138111860718926)
MOC Welch's: (-0.3198421165467493, 0.7545270443016014)


In [6]:
def perform_wilcoxon_test(original, replica):
    """Run the Wilcoxon signed-rank test on two paired score sequences.

    Delegates to ``scipy.stats.wilcoxon`` with its default settings.

    Args:
        original: Scores of the original model.
        replica: Scores of the replica model, aligned element-wise with
            ``original``.

    Returns:
        tuple: ``(w_statistic, p_value)``.
    """
    signed_rank_result = wilcoxon(original, replica)
    return signed_rank_result.statistic, signed_rank_result.pvalue


# Wilcoxon signed-rank test of original vs. replica scores, one line per model.
for model_label, original_scores, replica_scores in (
    ("PLS1-SM", pls_original, pls_replica),
    ("ICA", ica_original, ica_replica),
    ("MOC", moc_original, moc_replica),
):
    wilcoxon_result = perform_wilcoxon_test(original_scores, replica_scores)
    print(f"{model_label} Wilcoxon: {wilcoxon_result}")

PLS1-SM Wilcoxon: (12.0, 0.7353166906373405)
ICA Wilcoxon: (14.0, 0.640625)
MOC Wilcoxon: (18.0, 1.0)


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


In [7]:
import numpy as np
import scipy.stats as stats
from scipy.special import gamma, gammaln


def rmse(y_actual, y_predicted):
    """Return the root-mean-square error between two sets of values.

    Args:
        y_actual: Reference values (must support element-wise subtraction
            with ``y_predicted``, e.g. NumPy arrays or scalars).
        y_predicted: Predicted values.

    Returns:
        The square root of the mean squared difference.
    """
    residuals = y_actual - y_predicted
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)


def variance_of_rmse(rmse_value, n):
    """Estimate the variance of an RMSE value obtained from ``n`` samples.

    Computes ``(rmse_value**2 / n) * (n - 1 - 2 * G(n/2)**2 / G((n-1)/2)**2)``
    where ``G`` is the gamma function.

    The gamma ratio is evaluated in log space with ``gammaln``. Evaluating
    ``gamma(n / 2)`` directly overflows to ``inf`` once ``n`` exceeds roughly
    343, which turned the ratio into ``inf / inf`` and the result into NaN.

    Args:
        rmse_value: RMSE value, or an array of RMSE values (the computation is
            element-wise).
        n: Number of samples the RMSE is based on.

    Returns:
        The variance estimate, with the same shape as ``rmse_value``.
    """
    # log(G(n/2) / G((n-1)/2)); finite even when both gammas would overflow.
    log_gamma_ratio = gammaln(n / 2) - gammaln((n - 1) / 2)
    # 2 * G(n/2)**2 / G((n-1)/2)**2
    squared_gamma_ratio = 2 * np.exp(2 * log_gamma_ratio)

    correction_factor = n - 1 - squared_gamma_ratio
    variance = ((rmse_value**2) / n) * correction_factor

    return variance

def t_test(rmse_a, rmse_b, variance_a, variance_b):
    """Return the t statistic for the difference between two RMSE values.

    The statistic is the RMSE difference divided by the square root of the
    summed variances.

    Args:
        rmse_a: First RMSE value (scalar or array).
        rmse_b: Second RMSE value (scalar or array).
        variance_a: Variance associated with ``rmse_a``.
        variance_b: Variance associated with ``rmse_b``.

    Returns:
        ``(rmse_a - rmse_b) / sqrt(variance_a + variance_b)``.
    """
    rmse_difference = rmse_a - rmse_b
    combined_std = np.sqrt(variance_a + variance_b)
    return rmse_difference / combined_std


def calculate_degrees_of_freedom(S2_RMSEA, S2_RMSEB, n_A, n_B):
    """Return the effective degrees of freedom for two variance estimates.

    Computes
    ``(S2_RMSEA + S2_RMSEB)**2 / (S2_RMSEA**2/(n_A-1) + S2_RMSEB**2/(n_B-1))``.

    Args:
        S2_RMSEA: Variance of the first RMSE (scalar or array).
        S2_RMSEB: Variance of the second RMSE (scalar or array).
        n_A: Sample size behind the first variance.
        n_B: Sample size behind the second variance.

    Returns:
        The degrees-of-freedom value.
    """
    summed_variance = S2_RMSEA + S2_RMSEB
    weighted_squares = (S2_RMSEA**2 / (n_A - 1)) + (S2_RMSEB**2 / (n_B - 1))
    return (summed_variance**2) / weighted_squares

In [8]:
def analyze(original, replica, n_original=None, n_replica=None):
    """Compare two sets of RMSE values with a Welch-style t-test.

    For each pair of RMSE values the function estimates their variances,
    forms the t statistic, derives the effective degrees of freedom and
    returns the two-tailed p-value from the t distribution.

    Args:
        original: RMSE value(s) of the original model (array or scalar; the
            computation is element-wise).
        replica: RMSE value(s) of the model being compared.
        n_original: Sample size used for the variance and degrees-of-freedom
            estimates of ``original``. Defaults to ``len(original)``.
        n_replica: Same for ``replica``. Defaults to ``len(replica)``.

    Returns:
        dict: ``"dof"``, ``"t_value"`` and ``"p_value"`` (two-tailed).

    NOTE(review): the defaults use the number of RMSE entries as the sample
    size. If each entry is an RMSE computed on a test set, that test-set size
    is presumably the right value to pass here -- confirm with the data owner.
    """
    # Default to the number of entries, which is what this function always used.
    if n_original is None:
        n_original = len(original)
    if n_replica is None:
        n_replica = len(replica)

    # Variance of the RMSE for both inputs.
    variance_original = variance_of_rmse(original, n_original)
    variance_replica = variance_of_rmse(replica, n_replica)

    t_value = t_test(original, replica, variance_original, variance_replica)

    degrees_of_freedom = calculate_degrees_of_freedom(
        variance_original, variance_replica, n_original, n_replica
    )

    # Two-tailed p-value: twice the upper-tail probability of |t|.
    p_value = stats.t.sf(np.abs(t_value), degrees_of_freedom) * 2

    return {
        "dof": degrees_of_freedom,
        "t_value": t_value,
        "p_value": p_value,
    }

# Two-tailed p-values (original vs. replica) for PLS1-SM, ICA and MOC, in that order.
PLS_t_test, ICA_t_test, MOC_t_test = (
    analyze(original_rmse, replica_rmse)["p_value"]
    for original_rmse, replica_rmse in (
        (pls_original, pls_replica),
        (ica_original, ica_replica),
        (moc_original, moc_replica),
    )
)

In [9]:
# Display the p-value arrays for PLS1-SM, ICA and MOC as a tuple (one entry per RMSE value).
PLS_t_test, ICA_t_test, MOC_t_test

(array([0.41996941, 0.09730727, 0.30232697, 0.07484827, 0.78049768,
        0.12733076, 0.85963113, 1.        ]),
 array([0.48751103, 0.06308855, 0.67057514, 0.21680435, 0.35440612,
        0.70040829, 0.43161384, 0.36270038]),
 array([0.38453993, 0.08229403, 0.3157863 , 0.06572211, 0.44084576,
        0.27785651, 0.1492513 , 0.4340357 ]))

In [10]:
# Express the two-tailed p-values as percentages rounded to 2 decimal places.
# The p-values are recomputed from the RMSE arrays rather than rescaling the
# existing *_t_test variables in place, so re-running this cell cannot multiply
# them by 100 a second time.
PLS_t_test = np.around(analyze(pls_original, pls_replica)["p_value"] * 100, 2)
ICA_t_test = np.around(analyze(ica_original, ica_replica)["p_value"] * 100, 2)
MOC_t_test = np.around(analyze(moc_original, moc_replica)["p_value"] * 100, 2)

print(f"PLS1-SM t-test: {PLS_t_test}%")
print(f"ICA t-test: {ICA_t_test}%")
print(f"MOC t-test: {MOC_t_test}%")

PLS1-SM t-test: [ 42.     9.73  30.23   7.48  78.05  12.73  85.96 100.  ]%
ICA t-test: [48.75  6.31 67.06 21.68 35.44 70.04 43.16 36.27]%
MOC t-test: [38.45  8.23 31.58  6.57 44.08 27.79 14.93 43.4 ]%


In [26]:
# Values for the "MAD" variants of ICA and MOC (presumably RMSE scores in the
# same order as the original arrays -- TODO confirm).
ICA_MAD = np.array([8.64, 0.53, 3.69, 7.07, 2.10, 4.00, 1.45, 1.15])
MOC_MAD = np.array([6.51, 0.44, 2.06, 4.93, 1.26, 2.12, 0.91, 0.97])

# Two-tailed p-values, in percent, of each MAD variant against its original.
ICA_MAD_t_test, MOC_MAD_t_test = (
    analyze(original_rmse, mad_rmse)["p_value"] * 100
    for original_rmse, mad_rmse in (
        (ica_original, ICA_MAD),
        (moc_original, MOC_MAD),
    )
)

# put in dataframe against original
import pandas as pd
from lib.reproduction import major_oxides

def percentify(arr):
    """Format each number in ``arr`` as a percentage string with 2 decimals.

    Args:
        arr: Iterable of numbers already expressed in percent.

    Returns:
        list[str]: One string per input value, e.g. ``"48.75%"``.
    """
    formatted = []
    for val in arr:
        formatted.append(f"{val:.2f}%")
    return formatted

# Side-by-side table of replica vs. MAD p-values (in percent) and their
# difference, for ICA and MOC, indexed by `major_oxides`.
pd.DataFrame(
    {
        f"{model_name} {column_suffix}": percentify(column_values)
        for model_name, replica_pct, mad_pct in (
            ("ICA", ICA_t_test, ICA_MAD_t_test),
            ("MOC", MOC_t_test, MOC_MAD_t_test),
        )
        for column_suffix, column_values in (
            ("(replica)", replica_pct),
            ("(MAD)", mad_pct),
            ("difference", replica_pct - mad_pct),
        )
    },
    index=major_oxides,
)

Unnamed: 0,ICA (replica),ICA (MAD),ICA difference,MOC (replica),MOC (MAD),MOC difference
SiO2,48.75%,91.22%,-42.47%,38.45%,56.64%,-18.19%
TiO2,6.31%,3.91%,2.40%,8.23%,5.87%,2.36%
Al2O3,67.06%,47.80%,19.26%,31.58%,18.10%,13.48%
FeOT,21.68%,39.26%,-17.58%,6.57%,7.83%,-1.26%
MgO,35.44%,10.76%,24.68%,44.08%,15.57%,28.51%
CaO,70.04%,46.52%,23.52%,27.79%,49.03%,-21.24%
Na2O,43.16%,23.06%,20.10%,14.93%,30.33%,-15.40%
K2O,36.27%,65.36%,-29.09%,43.40%,63.77%,-20.37%
