
# Linear Factorisation on Pythia Models
## (as featured in milestones 0 and 1)


In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Local files
import factor
import selection
import vis
import data

In [None]:

# Notebook-wide switches read by the cells below.

# When True, the plotting cells additionally export their figure as a PNG
# under plots/ via fig.write_image.
SAVE = False  # Export figures?

# Factorisation model together with the pre-scaling mode it is run with.
# The scaling cell only handles "standard" and "positive"; any other value
# raises NotImplementedError there. Alternative setups are kept commented out.
model, PRE_SCALING = factor.PCA(), "standard"
#model, PRE_SCALING = factor.NMF(method="mult"), "positive"
#model, PRE_SCALING = factor.NMF(method="opt"), "positive"

# NOTE(review): SCALE_SCORES is not read by any cell visible in this notebook;
# the scores plot always uses the normalised scores. Confirm whether it is
# still needed.
SCALE_SCORES = False  # Rescale to bring out relative magnitudes
# NOTE(review): the loadings cell reassigns SCALE_LOADINGS = True before
# reading it, so the value set here currently has no effect.
SCALE_LOADINGS = False  # Rescale to bring out relative magnitudes


In [None]:
# Load initial pythia data and plot the raw input traces.
model_size = "70m"
# X is the data frame factorised below; stds is passed straight through to
# the trace plot (presumably per-value uncertainties -- confirm in data.py).
X, stds = data.load_pythia_tensor(model_size)
fig = vis.traces(X, ["llc", "loss", "trace"], stds=stds)
# Title spelling corrected ("Psycometrics" -> "Psychometrics").
fig.update_layout(title=f"Psychometrics Input: Pythia {model_size}")
fig.show()


In [None]:
# Apply data scaling: rescale every column of X according to PRE_SCALING
# and wrap the result in a DataFrame that keeps X's labels.
scaled = X.values.copy()  # We're about to modify them inplace
if not np.issubdtype(scaled.dtype, np.floating):
    # In-place true division is rejected for integer arrays, so promote
    # anything that is not already floating point.
    scaled = scaled.astype(float)

if PRE_SCALING == "positive":
    # Min-max scaling: shift each column to start at 0, then divide by its
    # range so that values land in [0, 1].
    scaled -= scaled.min(axis=0)
    divisor = scaled.max(axis=0)
elif PRE_SCALING == "standard":
    # Standardisation: zero mean and unit (population, ddof=0) std per column.
    scaled -= scaled.mean(axis=0)
    divisor = scaled.std(axis=0)
else:
    raise NotImplementedError(f"Unsupported PRE_SCALING mode: {PRE_SCALING!r}")

# A constant column has zero range / zero std. Dividing by 1 instead leaves it
# at 0 rather than filling it with NaN/inf that would poison the factorisation.
scaled /= np.where(divisor == 0, 1.0, divisor)

scaled = pd.DataFrame(
    scaled,
    columns=X.columns,
    index=X.index,
)

In [None]:
# Holdout model selection: cross-validate the chosen model on the scaled data
# and plot the resulting errors.
cv_settings = dict(max_factors=6, n_folds=20, repeats=1)
errs = selection.cross_validate(scaled, model, **cv_settings)
fig = vis.crossval(*errs, method_name=model.name)

if SAVE:
    holdout_path = f"plots/{model.name}-holdout.png"
    fig.write_image(holdout_path, scale=2)
fig.show()

In [None]:
# Fit the selected model with a fixed number of components and collect the
# factors used by the plotting cells below.
# TODO: get offset working
n_components = 5
model.max_iters = 500  # presumably the iteration cap for iterative solvers -- confirm in factor.py
steps = X.index.values  # x-axis values for the scores plot
result = model.fit(scaled, n_components)
print("plotting", flush=True)
component_names = [f"PC{i+1}" for i in range(n_components)]
W = model.U  # Scores
H = model.V  # Loadings
# Normalise each component (column of W) by its largest *absolute* score, the
# same convention the loadings heatmap uses for H. A signed max would not
# bound components whose largest excursion is negative (e.g. PCA scores).
# For non-negative scores (NMF) this is identical to dividing by the max.
W_scaled = W / np.abs(W).max(axis=0)[None, :]  # Relative



# Compare the model's reconstruction with the scaled input it was fitted on,
# plotted in a two-column trace figure.
tasks = ["llc", "loss", "trace"]
frame_labels = dict(columns=X.columns, index=X.index)

# Both frames share X's row/column labels so the traces line up.
recon = pd.DataFrame(model.R, **frame_labels)
target = pd.DataFrame(scaled, **frame_labels)

# The first call creates the two-column figure from the reconstruction; the
# second call adds the input traces to column 2 of that same figure.
fig = vis.traces(recon, tasks, cols=2)
vis.traces(target, tasks, col=2, fig=fig)

figure_title = (
    f"{model.name} ({n_components} components): "
    "reconstruction (left), input (right)"
)
fig.update_layout(title=figure_title, width=1200, height=600)
fig.show()

In [None]:
# Line plot of each component's normalised scores against `steps`, drawn on a
# log-scaled x-axis.
fig = go.Figure()

for idx, label in enumerate(component_names):
    score_line = go.Scatter(
        x=steps,
        y=W_scaled[:, idx],
        name=label,
        mode="lines",
    )
    fig.add_trace(score_line)

layout_options = {
    "width": 800,
    "height": 600,
    "xaxis": {"type": "log"},
    "xaxis_title": "Steps",
    "yaxis_title": "Score",
    "title": "Normalised Model Scores",
}
fig.update_layout(**layout_options)

if SAVE:
    scores_path = f"plots/{model.name}-scores.png"
    fig.write_image(scores_path, scale=2)
fig.show()

In [None]:
# Heatmap of the task loadings (rows: columns of X, columns: components).
# NOTE: this deliberately-or-not overrides the SCALE_LOADINGS value set in the
# configuration cell, so the normalised branch below is always taken.
SCALE_LOADINGS = True

config = dict(
    width=6,
    height=12,
    reversescale=True,
)

if SCALE_LOADINGS:
    # Divide each component (row of H) by its largest absolute loading so that
    # every component spans at most [-1, 1] and components are comparable.
    H_scaled = H / np.max(np.abs(H), axis=1)[:, None]
    loadings, heatmap_title = H_scaled, "Normalised Task Loadings"
else:
    loadings, heatmap_title = H, "Task Loadings"

# Build the frame once for whichever variant was selected.
df = pd.DataFrame(
    data=loadings.T,
    index=X.columns,
    columns=component_names,
)
fig = vis.heatmap(df, **config, title=heatmap_title)

if SAVE:
    # Name the export after the active model and the plotted quantity, matching
    # the other cells' "plots/{model.name}-<what>.png" pattern (previously this
    # was hard-coded to "plots/NMF_scores.png" regardless of model).
    fig.write_image(f"plots/{model.name}-loadings.png", scale=2)
fig.show()