In [1]:
import pandas as pd
import numpy as np
import hvplot.pandas
from scipy.spatial import procrustes

In [2]:
# resampling + axis norm

common_grid = np.linspace(0, 1, 100)
df_dr = pd.read_hdf("output/meta-llama_Meta-Llama-3.1-8B-Instruct/d_r.h5")
layers = len(df_dr)
original_depths = np.linspace(0, 1, layers)
# display(df_dr)

dr = df_dr[['Simple Addition (1)']]
display(dr.hvplot.scatter())

dr_np = dr.to_numpy().flatten()
resampled_dr = pd.DataFrame(np.interp(common_grid, original_depths, dr_np))
display(resampled_dr.hvplot.scatter())



In [3]:
# display(df_dr)
dr2 = df_dr[['Advanced Logic Grid (9)']]
display(dr2.hvplot.scatter())

dr2_np = dr2.to_numpy().flatten()

resampled_dr2 = pd.DataFrame(np.interp(common_grid, original_depths, dr2_np))
display(resampled_dr2.hvplot.scatter())

In [4]:
m1, m2, observed_disparity = procrustes(resampled_dr.to_numpy(), resampled_dr2.to_numpy())
display(m1, m2, observed_disparity)

array([[-0.21660027],
       [-0.17385818],
       [-0.13111609],
       [-0.088374  ],
       [-0.07754804],
       [-0.07438196],
       [-0.07121588],
       [-0.06707955],
       [-0.06233043],
       [-0.05758131],
       [-0.0541599 ],
       [-0.05257686],
       [-0.05099382],
       [-0.04833839],
       [-0.04200623],
       [-0.03567407],
       [-0.02883125],
       [-0.00666869],
       [ 0.01549388],
       [ 0.03765645],
       [ 0.04919732],
       [ 0.05869556],
       [ 0.06819381],
       [ 0.07360678],
       [ 0.07677286],
       [ 0.07993895],
       [ 0.07810058],
       [ 0.07018537],
       [ 0.06227017],
       [ 0.05966581],
       [ 0.07233014],
       [ 0.08499446],
       [ 0.09816944],
       [ 0.11874897],
       [ 0.13932849],
       [ 0.15990802],
       [ 0.14877567],
       [ 0.13294527],
       [ 0.11711486],
       [ 0.11200828],
       [ 0.11200828],
       [ 0.11200828],
       [ 0.11200828],
       [ 0.11200828],
       [ 0.11200828],
       [ 0

array([[-0.1488952 ],
       [-0.12243215],
       [-0.0959691 ],
       [-0.06950605],
       [-0.06438417],
       [-0.06438417],
       [-0.06438417],
       [-0.05931565],
       [-0.05104594],
       [-0.04277624],
       [-0.03520012],
       [-0.02858436],
       [-0.0219686 ],
       [-0.01572631],
       [-0.01076449],
       [-0.00580266],
       [-0.00057408],
       [ 0.01265745],
       [ 0.02588897],
       [ 0.0391205 ],
       [ 0.04402897],
       [ 0.04733685],
       [ 0.05064473],
       [ 0.05822084],
       [ 0.06814449],
       [ 0.07806813],
       [ 0.0775346 ],
       [ 0.06430308],
       [ 0.05107155],
       [ 0.04509602],
       [ 0.05998149],
       [ 0.07486696],
       [ 0.08964572],
       [ 0.10287724],
       [ 0.11610877],
       [ 0.12934029],
       [ 0.12384494],
       [ 0.11557524],
       [ 0.10730553],
       [ 0.10015624],
       [ 0.09354048],
       [ 0.08692472],
       [ 0.0867113 ],
       [ 0.09332707],
       [ 0.09994283],
       [ 0

np.float64(0.1233102803339485)

In [6]:
# generating null distribution
num_shuffles = 1000
shuffled_disparities = []
dr_to_shuffle = resampled_dr.to_numpy() # get numpy copy of df

for _ in range(num_shuffles):
    # 3. Shuffle the layer order of one trajectory
    np.random.shuffle(dr_to_shuffle)
    
    # 4. Calculate disparity against the shuffled version
    _, _, shuffled_disp = procrustes(resampled_dr.to_numpy(), dr_to_shuffle)
    shuffled_disparities.append(shuffled_disp)

shuffled_disparities = np.array(shuffled_disparities)
# display(shuffled_disparities)
print("Null Distribution (from shuffles):")
print(f"  Mean: {np.mean(shuffled_disparities):.4f}")
print(f"  Std. Dev.: {np.std(shuffled_disparities):.4f}")


p_value = np.sum(shuffled_disparities <= observed_disparity) / num_shuffles
print(f"\nP-value: {p_value}")

Null Distribution (from shuffles):
  Mean: 0.0000
  Std. Dev.: 0.0000

P-value: 1.0
