In [1]:
# Import necessary libraries
import os
import sys

# Enable automatic module reloading (great for development)
%load_ext autoreload
%autoreload 2

# Add the directory with your visualization module to the path.
# NOTE(review): this is an absolute, machine-specific path, so the notebook presumably only
# runs on the machine that has it — consider deriving it from a configurable base directory.
sys.path.append("/Users/liam/quests/lsoc-psych/datasets")

# Import the visualization functions
from joint_pca import load_trajectory_data, TrajectoryPCA, load_token_mapping

In [2]:
from joint_pca import load_trajectory_data

# Experiment / dataset selection reused by the later cells.
experiment_name = "EXP000"
dataset_name = "github"
num_contexts = 50

# No explicit model-size filter is passed (the recorded run loaded nine sizes).
# A candidate explicit list: ['6.9b', '160m', '31m', '2.8b', '1.4b', '70m', '410m', '1b']
model_sizes = None

trajectory_data = load_trajectory_data(
    experiment_name,
    dataset_name,
    model_sizes=model_sizes,
    num_contexts=num_contexts,
)

# Token mapping handed to the per-component feature plots further down.
tokens_dict = load_token_mapping(experiment_name, dataset_name)

['14m', '6.9b', '160m', '1.4b', '2.8b', '410m', '70m', '31m', '1b']
Loading columns for 50 contexts
Selected context indices: [2, 15, 28, 41, 54]... (total: 50)


Loading trajectory data: 100%|██████████| 9/9 [01:06<00:00,  7.34s/it]


In [3]:
from joint_pca import TrajectoryPCA

# Configuration for this fit: ten regular components, no sparse components,
# and both ends of the step range left as None.
step_range = [None, None]
n_components = 10
n_sparse_components = 0
scale = False
run_at_init = True

pca_handler = TrajectoryPCA(
    trajectory_data,
    step_range=step_range,
    n_components=n_components,
    n_sparse_components=n_sparse_components,
    scale=scale,
    run_at_init=run_at_init,
    sparse_pca_params=None,
    dataset_name=dataset_name,
    num_contexts=num_contexts,
)

Found 25600 common columns across all 9 model sizes
Concatenated matrix shape: (1386, 25600)
Regular PCA: Top 10 components explain 95.37% of variance
Individual explained variance: [0.7710864  0.11300656 0.02802772 0.0168818  0.00718533 0.00634188
 0.00415607 0.003117   0.00200962 0.0018531 ]


In [4]:
from joint_pca import print_enhanced_pc_loadings

# Print the loadings report for the current `pca_handler`; the recorded output shows an
# explained-variance summary followed by the top-loaded features of each component.
print_enhanced_pc_loadings(pca_handler)

REGULAR PCA COMPONENTS (Top 10 Features per Component)

EXPLAINED VARIANCE SUMMARY:
Component Explained Variance (%) Cumulative Variance (%)
      PC1                 77.11%                  77.11%
      PC2                 11.30%                  88.41%
      PC3                  2.80%                  91.21%
      PC4                  1.69%                  92.90%
      PC5                  0.72%                  93.62%
      PC6                  0.63%                  94.25%
      PC7                  0.42%                  94.67%
      PC8                  0.31%                  94.98%
      PC9                  0.20%                  95.18%
     PC10                  0.19%                  95.37%



PRINCIPAL COMPONENT 1 (77.11% variance)
----------------------------------------------------------------------------------------------------
            Feature  Loading
 context_171_pos_97   0.0092
context_314_pos_500   0.0091
context_457_pos_300   0.0088
context_535_pos_403   0.0088




In [5]:
from joint_pca import TrajectoryPlotter

plotter = TrajectoryPlotter(pca_handler)

# Overview figure of the components over time; inline display stays disabled in this cell.
fig = plotter.plot_pcs_over_time()
# fig.show()

# NOTE(review): `_get_filename` is a private method of the plotter; it is used here as the
# common prefix of every exported file.
raw_filename = plotter._get_filename()
filename = raw_filename + "_pcs_over_time"
fig.write_image(filename + ".png", scale=2)
fig.write_image(filename + ".pdf")
fig.write_html(filename + ".html")

# One figure per fitted component, numbered from 1. The upper bound is derived from the
# configuration used to build `pca_handler` instead of a hard-coded 10, so changing the
# number of components above does not silently skip or over-run components here.
for comp in range(1, n_components + n_sparse_components + 1):
    # Component number is zero-padded to two digits so the files sort correctly.
    filename = raw_filename + f"_loaded_features_comp{comp:02d}"
    fig = plotter.plot_top_loaded_features_for_component(component=comp, token_mapping=tokens_dict)
    fig.write_image(filename + ".png", scale=2)
    fig.write_image(filename + ".pdf")
    fig.write_html(filename + ".html")

In [6]:
from joint_pca import TrajectoryPCA

# Configuration for this fit: sparse components only (n_components is 0), with the
# step range starting at 1000 and no upper bound given.
step_range = [1000, None]
n_components = 0
n_sparse_components = 10
scale = False
run_at_init = True

pca_handler = TrajectoryPCA(
    trajectory_data,
    step_range=step_range,
    n_components=n_components,
    n_sparse_components=n_sparse_components,
    scale=scale,
    run_at_init=run_at_init,
    sparse_pca_params={'alpha': 5},
    dataset_name=dataset_name,
    num_contexts=num_contexts,
)

Found 25600 common columns across all 9 model sizes
Concatenated matrix shape: (1287, 25600)
Using full matrix for sparse PCA (no regular PCA performed): (1287, 25600)
Sparse PCA: 10 components extracted
Sparsity of components (fraction of zero values): [0.706640625, 0.75609375, 0.8569921875, 0.7283984375, 0.808046875, 0.7083203124999999, 0.820390625, 0.7345312500000001, 0.8752734375, 0.7445703125]


In [7]:
from joint_pca import TrajectoryPlotter

# Plot the current handler's components over time and show the figure inline.
plotter = TrajectoryPlotter(pca_handler)
fig = plotter.plot_pcs_over_time()
fig.show()

# Export the same figure as PNG (2x scale), PDF and HTML under a shared prefix;
# `raw_filename` is kept because the per-component export cell reuses it.
raw_filename = plotter._get_filename()
filename = f"{raw_filename}_pcs_over_time"
fig.write_image(f"{filename}.png", scale=2)
fig.write_image(f"{filename}.pdf")
fig.write_html(f"{filename}.html")

In [8]:
# One figure per fitted component, numbered from 1. The upper bound comes from the
# configuration used to build `pca_handler` rather than a hard-coded 10, so it follows
# any change to the number of components requested above.
for comp in range(1, n_components + n_sparse_components + 1):
    # Component number is zero-padded to two digits so the files sort correctly.
    filename = raw_filename + f"_loaded_features_comp{comp:02d}"
    fig = plotter.plot_top_loaded_features_for_component(component=comp, token_mapping=tokens_dict)
    fig.write_image(filename + ".png", scale=2)
    fig.write_image(filename + ".pdf")
    fig.write_html(filename + ".html")

In [9]:
from joint_pca import TrajectoryPCA

# Configuration for this fit: sparse components only (n_components is 0), with the
# step range given as [2000, 30000].
step_range = [2000, 30000]
n_components = 0
n_sparse_components = 10
scale = False
run_at_init = True

pca_handler = TrajectoryPCA(
    trajectory_data,
    step_range=step_range,
    n_components=n_components,
    n_sparse_components=n_sparse_components,
    scale=scale,
    run_at_init=run_at_init,
    sparse_pca_params={'alpha': 5},
    dataset_name=dataset_name,
    num_contexts=num_contexts,
)



Found 25600 common columns across all 9 model sizes
Concatenated matrix shape: (261, 25600)
Using full matrix for sparse PCA (no regular PCA performed): (261, 25600)
Sparse PCA: 10 components extracted
Sparsity of components (fraction of zero values): [0.86140625, 0.8790625, 0.857578125, 0.8347265625, 0.8387890625, 0.9305859375, 0.8671484375, 0.9070703125, 0.8512109375, 0.9169921875]


In [10]:
import pandas as pd

# Hand-picked feature columns whose loadings on the current components are inspected.
columns_of_interest = [
    "context_2_pos_210",
    "context_2_pos_287",
    "context_54_pos_71",
    "context_54_pos_125",
    "context_54_pos_467",
    "context_119_pos_60",
    "context_119_pos_70",
    "context_119_pos_71",
    "context_119_pos_74",
    "context_119_pos_110",
    "context_119_pos_138",
    "context_119_pos_174",
    "context_119_pos_234",
    "context_119_pos_283",
    "context_119_pos_289",
    "context_119_pos_291",
    "context_132_pos_223",
    "context_132_pos_242",
    "context_262_pos_93",
    "context_288_pos_138",
    "context_392_pos_56",
    "context_444_pos_279",
    "context_483_pos_83",
    "context_509_pos_254",
    "context_561_pos_111",
    "context_587_pos_288",
    "context_600_pos_307",
    "context_600_pos_419",
    "context_600_pos_459",
    "context_600_pos_461"
]
loadings_on_columns = pca_handler.get_specific_column_loadings(columns_of_interest)

# Print the full table with every column on the same row. A plain print() of the frame
# wraps wide tables onto continuation blocks (the recorded output ended in "\"), whereas
# to_string() renders all rows and columns without wrapping.
print(loadings_on_columns.to_string())

# Cosine similarity computed with the handler's default component selection.
similarity = pca_handler.compute_cosine_with_spc(columns_of_interest)
print(similarity)


                Feature   SPC1   SPC2    SPC3    SPC4    SPC5   SPC6    SPC7    SPC8    SPC9  \
0     context_2_pos_210 0.0000 0.0000  0.0003  0.0000 -0.0000 0.0622  0.0000  0.0882 -0.0000   
1     context_2_pos_287 0.0000 0.0000  0.0000  0.0000 -0.0000 0.0000 -0.0025  0.0000 -0.0000   
2     context_54_pos_71 0.0000 0.0017  0.0000  0.0000 -0.0000 0.0481  0.0000  0.0639 -0.0000   
3    context_54_pos_125 0.0153 0.0000  0.0000 -0.0146 -0.0030 0.0000  0.0336  0.0295 -0.0000   
4    context_54_pos_467 0.0000 0.0058  0.0000  0.0000 -0.0000 0.0780  0.0000  0.0000 -0.0000   
5    context_119_pos_60 0.0000 0.0000  0.0000  0.0000 -0.0000 0.0238  0.0230  0.0595 -0.0034   
6    context_119_pos_70 0.0000 0.0100  0.0000  0.0000 -0.0000 0.0374  0.0000 -0.0000 -0.0284   
7    context_119_pos_71 0.0000 0.0000  0.0016  0.0039  0.0012 0.0622  0.0000  0.0000 -0.0000   
8    context_119_pos_74 0.0000 0.0000  0.0000  0.0000 -0.0000 0.0676  0.0273  0.0000 -0.0000   
9   context_119_pos_110 0.0000 0.0000  0

In [11]:
# Cosine similarity of the selected columns with each sparse component, numbered from 1.
# The bound follows `n_sparse_components` from the handler configuration instead of a
# hard-coded 11, so it stays correct if the number of sparse components changes.
for index in range(1, n_sparse_components + 1):
    similarity = pca_handler.compute_cosine_with_spc(columns_of_interest, pc_idx=('sparse', index))
    # Reported with five decimals.
    print(f'Similarity with SPC{index}: {similarity:.5f}')



Similarity with SPC1: 0.02728
Similarity with SPC2: 0.02016
Similarity with SPC3: -0.00389
Similarity with SPC4: -0.00630
Similarity with SPC5: -0.00151
Similarity with SPC6: 0.20872
Similarity with SPC7: 0.02620
Similarity with SPC8: 0.13262
Similarity with SPC9: -0.00914
Similarity with SPC10: 0.03304


In [12]:
from joint_pca import TrajectoryPlotter

plotter = TrajectoryPlotter(pca_handler)

# Persist the loadings table computed for the selected columns next to the figures.
loadings_on_columns.to_csv(f"{plotter._get_filename()}_loadings_on_Dan_columns.csv")

# Components-over-time figure: shown inline, then exported as PNG (2x scale), PDF and HTML.
fig = plotter.plot_pcs_over_time()
fig.show()

# `raw_filename` is kept because the per-component export cell reuses it.
raw_filename = plotter._get_filename()
filename = f"{raw_filename}_pcs_over_time"
fig.write_image(f"{filename}.png", scale=2)
fig.write_image(f"{filename}.pdf")
fig.write_html(f"{filename}.html")

In [13]:
# One figure per fitted component, numbered from 1. The upper bound comes from the
# configuration used to build `pca_handler` rather than a hard-coded 10, so it follows
# any change to the number of components requested above.
for comp in range(1, n_components + n_sparse_components + 1):
    # Component number is zero-padded to two digits so the files sort correctly.
    filename = raw_filename + f"_loaded_features_comp{comp:02d}"
    fig = plotter.plot_top_loaded_features_for_component(component=comp, token_mapping=tokens_dict)
    fig.write_image(filename + ".png", scale=2)
    fig.write_image(filename + ".pdf")
    fig.write_html(filename + ".html")

In [14]:
from joint_pca import TrajectoryPCA

# Configuration for this fit: five regular plus five sparse components, with the step
# range starting at 1000 and no upper bound given. The recorded output reports the
# sparse fit running on the residual matrix left after the regular PCA.
step_range = [1000, None]
n_components = 5
n_sparse_components = 5
scale = False
run_at_init = True

pca_handler = TrajectoryPCA(
    trajectory_data,
    step_range=step_range,
    n_components=n_components,
    n_sparse_components=n_sparse_components,
    scale=scale,
    run_at_init=run_at_init,
    sparse_pca_params={'alpha': 5},
    dataset_name=dataset_name,
    num_contexts=num_contexts,
)

Found 25600 common columns across all 9 model sizes
Concatenated matrix shape: (1287, 25600)
Regular PCA: Top 5 components explain 80.90% of variance
Individual explained variance: [0.66704518 0.07896732 0.03161048 0.01791193 0.01341744]
Residual matrix shape after regular PCA: (1287, 25600)
Sparse PCA: 5 components extracted
Sparsity of components (fraction of zero values): [0.7980078125, 0.8770703125, 0.85453125, 0.8985546875, 0.880625]


In [15]:
from joint_pca import TrajectoryPlotter

# Plot the current handler's components over time and show the figure inline.
plotter = TrajectoryPlotter(pca_handler)
fig = plotter.plot_pcs_over_time()
fig.show()

# Export the same figure as PNG (2x scale), PDF and HTML under a shared prefix;
# `raw_filename` is kept because the per-component export cell reuses it.
raw_filename = plotter._get_filename()
filename = f"{raw_filename}_pcs_over_time"
fig.write_image(f"{filename}.png", scale=2)
fig.write_image(f"{filename}.pdf")
fig.write_html(f"{filename}.html")

In [16]:
# One figure per fitted component, numbered from 1. The upper bound is the total number of
# components requested for `pca_handler` (regular + sparse) instead of a hard-coded 10.
# NOTE(review): with 5 regular + 5 sparse components this still iterates 1..10 as before;
# confirm that plot_top_loaded_features_for_component addresses the sparse components
# as 6..10 rather than expecting a separate index.
for comp in range(1, n_components + n_sparse_components + 1):
    # Component number is zero-padded to two digits so the files sort correctly.
    filename = raw_filename + f"_loaded_features_comp{comp:02d}"
    fig = plotter.plot_top_loaded_features_for_component(component=comp, token_mapping=tokens_dict)
    fig.write_image(filename + ".png", scale=2)
    fig.write_image(filename + ".pdf")
    fig.write_html(filename + ".html")

In [17]:
from joint_pca import TrajectoryPCA

# Configuration for this fit: sparse components only (n_components is 0), with the
# step range given as [2000, 30000] and 'alpha' raised to 15; the recorded output
# reports higher per-component sparsity than the earlier alpha=5 run on the same range.
step_range = [2000, 30000]
n_components = 0
n_sparse_components = 10
scale = False
run_at_init = True

pca_handler = TrajectoryPCA(
    trajectory_data,
    step_range=step_range,
    n_components=n_components,
    n_sparse_components=n_sparse_components,
    scale=scale,
    run_at_init=run_at_init,
    sparse_pca_params={'alpha': 15},
    dataset_name=dataset_name,
    num_contexts=num_contexts,
)



Found 25600 common columns across all 9 model sizes
Concatenated matrix shape: (261, 25600)
Using full matrix for sparse PCA (no regular PCA performed): (261, 25600)
Sparse PCA: 10 components extracted
Sparsity of components (fraction of zero values): [0.9453125, 0.946953125, 0.975859375, 0.9459765625, 0.9470703125, 0.999609375, 0.9644140625, 0.9652734375, 0.9488671875, 0.9616015625]


In [18]:
import pandas as pd

# Hand-picked feature columns whose loadings on the current components are inspected.
columns_of_interest = [
    "context_2_pos_210",
    "context_2_pos_287",
    "context_54_pos_71",
    "context_54_pos_125",
    "context_54_pos_467",
    "context_119_pos_60",
    "context_119_pos_70",
    "context_119_pos_71",
    "context_119_pos_74",
    "context_119_pos_110",
    "context_119_pos_138",
    "context_119_pos_174",
    "context_119_pos_234",
    "context_119_pos_283",
    "context_119_pos_289",
    "context_119_pos_291",
    "context_132_pos_223",
    "context_132_pos_242",
    "context_262_pos_93",
    "context_288_pos_138",
    "context_392_pos_56",
    "context_444_pos_279",
    "context_483_pos_83",
    "context_509_pos_254",
    "context_561_pos_111",
    "context_587_pos_288",
    "context_600_pos_307",
    "context_600_pos_419",
    "context_600_pos_459",
    "context_600_pos_461"
]
loadings_on_columns = pca_handler.get_specific_column_loadings(columns_of_interest)

# Print the full table with every column on the same row: to_string() renders all rows
# and columns without wrapping, which a plain print() of a wide frame does not guarantee.
print(loadings_on_columns.to_string())

# Cosine similarity of the selected columns with each sparse component, numbered from 1.
# The bound follows `n_sparse_components` from the handler configuration instead of a
# hard-coded 11, so it stays correct if the number of sparse components changes.
for index in range(1, n_sparse_components + 1):
    similarity = pca_handler.compute_cosine_with_spc(columns_of_interest, pc_idx=('sparse', index))
    # Reported with five decimals.
    print(f'Similarity with SPC{index}: {similarity:.5f}')

                Feature   SPC1   SPC2    SPC3   SPC4    SPC5   SPC6   SPC7   SPC8    SPC9  SPC10
0     context_2_pos_210 0.0000 0.0000  0.1359 0.0000 -0.0000 0.0000 0.0000 0.0000 -0.0000 0.0000
1     context_2_pos_287 0.0000 0.0000 -0.0000 0.0000 -0.0000 0.0000 0.0000 0.0000 -0.0000 0.0000
2     context_54_pos_71 0.0000 0.0000  0.1078 0.0000 -0.0000 0.0000 0.0000 0.0000 -0.0000 0.0000
3    context_54_pos_125 0.0000 0.0664 -0.0000 0.0000 -0.0053 0.0000 0.0000 0.0000 -0.0000 0.0252
4    context_54_pos_467 0.0000 0.0000  0.0500 0.0000 -0.0000 0.0000 0.0000 0.0000 -0.0000 0.0000
5    context_119_pos_60 0.0000 0.0153  0.0749 0.0000 -0.0000 0.0000 0.0000 0.0291 -0.0000 0.0000
6    context_119_pos_70 0.0000 0.0000 -0.0000 0.0000 -0.0000 0.0000 0.0058 0.0000 -0.0000 0.0428
7    context_119_pos_71 0.0000 0.0000  0.0236 0.0000 -0.0000 0.0000 0.0000 0.0000 -0.0000 0.0000
8    context_119_pos_74 0.0000 0.0000  0.0760 0.0000 -0.0000 0.0000 0.0000 0.0000 -0.0000 0.0000
9   context_119_pos_110 0.0000

In [19]:
from joint_pca import TrajectoryPlotter

plotter = TrajectoryPlotter(pca_handler)

# Components-over-time figure, shown inline before being exported below.
fig = plotter.plot_pcs_over_time()
fig.show()

# Persist the loadings table computed for the selected columns next to the figures.
loadings_on_columns.to_csv(plotter._get_filename() + "_loadings_on_Dan_columns.csv")

# NOTE(review): `_get_filename` is a private method of the plotter; it is used here as the
# common prefix of every exported file.
raw_filename = plotter._get_filename()
filename = raw_filename + "_pcs_over_time"
fig.write_image(filename + ".png", scale=2)
fig.write_image(filename + ".pdf")
fig.write_html(filename + ".html")

# One figure per fitted component, numbered from 1. The upper bound comes from the
# configuration used to build `pca_handler` rather than a hard-coded 10, so it follows
# any change to the number of components requested above.
for comp in range(1, n_components + n_sparse_components + 1):
    # Component number is zero-padded to two digits so the files sort correctly.
    filename = raw_filename + f"_loaded_features_comp{comp:02d}"
    fig = plotter.plot_top_loaded_features_for_component(component=comp, token_mapping=tokens_dict)
    fig.write_image(filename + ".png", scale=2)
    fig.write_image(filename + ".pdf")
    fig.write_html(filename + ".html")
    
