# Tensor Rank Decomposition

This needs its own visualisation and notebook because it's fundamentally different from a 2D factorisation.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.preprocessing import StandardScaler
import factor
import selection
import vis
%matplotlib inline

In [None]:
# When True, the plotting cells below also write their figures to disk under
# plots/ (via fig.write_image) in addition to displaying them.
SAVE = False
# NOTE(review): not referenced by any cell visible in this notebook -
# presumably meant to toggle non-negativity constraints; confirm or remove.
NonNeg = False

In [None]:
# Load the evaluation results table.
df = pd.read_csv('data/p70m.csv')

# The training step is the first run of digits that follows an underscore in
# the model name; index the rows by it and discard the raw name column.
df['step'] = df['models'].str.extract(r'_(\d+)', expand=False).astype(int)
df = df.set_index('step').drop(columns='models')
df.columns.name = "Tasks"

# Split the columns on whether their name contains "std": those that do are
# kept separately (presumably standard deviations), the rest form the data X.
Scols = [col for col in df.columns if "std" in col]
Xcols = [col for col in df.columns if "std" not in col]
stds = df[Scols]
X = df[Xcols]

In [None]:
# Scale on columns (todo: think about how this applies to TENSORS)
# NOTE sensitivity to scale
# Column-wise scaling is currently disabled. The two commented-out lines
# below would shift each column by its minimum and then divide by the column
# maximum of the *unshifted* X.
# scaled = X - X.min(axis=0)
# scaled = scaled / X.max(axis=0)
# `scaled` is simply another name for X here (no copy is made).
scaled = X

In [None]:
# Convert the 2D matrix into a 3-way tensor: steps x tasks x measures.

# Step 1: column names have the form "<measure>/<task>"; collect the
# distinct values of each part.
tasks = X.columns.str.split('/').str[1].unique()
measures = X.columns.str.split('/').str[0].unique()

# Step 2: allocate the 3D array
steps = X.index.values
T = np.zeros((len(steps), len(tasks), len(measures)))

# NOTE: T is still all zeros at this point, so this per-measure re-weighting
# would have no effect unless it were moved below the fill loop.
#T *= np.array([1, 1, .25])[None, None, :]

# Fill the array one (task, measure) column at a time
# (alternatively the columns could be cast to a pd.MultiIndex).
for task_idx, task in enumerate(tasks):
    for measure_idx, measure in enumerate(measures):
        T[:, task_idx, measure_idx] = scaled[f'{measure}/{task}'].values

# Make the *measures* equally important in terms of variance: divide each
# measure's (steps x tasks) slice by its standard deviation pooled over all
# steps and tasks. Each slice is a view, so the in-place division updates T.
for measure_slice in T.transpose(2, 0, 1):
    measure_slice /= np.std(measure_slice.ravel())


In [None]:
# Plot the raw input tensor, passing the step / task / measure values
# alongside it.
fig = vis.tensor_traces(T, (steps, tasks, measures))
fig.update_layout(title="Input tensor")
# Saving is currently commented out for this figure:
# if SAVE:
#     fig.write_image("plots/trd-input.pdf", scale=2)
fig.show()

# Tensor rank decomposition

Note: this is typically done with alternating least squares (ALS) - fix all modes but one and solve for the remaining one
(it's computationally simpler, but can get stuck in local minima)

Here I'm trying the "all at once" optimization with no constraints


In [None]:
# Cross-validate the decomposition for up to 5 factors.
# positive=[] presumably means no mode is constrained to be non-negative -
# confirm against factor.TRD.
trd = factor.TRD(positive=[])
errs = selection.cross_validate(
    T,
    trd,
    max_factors=5,
    n_folds=5,
    repeats=1,
)  # original note: "10,2" (presumably an alternative n_folds, repeats setting)

# Show the cross-validation errors on a log-scaled y axis.
fig = vis.crossval(*errs, "Tensor Rank Decomposition (logscale)")
fig.update_yaxes(type="log", exponentformat="power")
if SAVE:
    fig.write_image("plots/trd-cv.pdf")
fig.show()

In [None]:
# Fit a decomposition with a fixed number of components and plot the result
# of the fit (used as the reconstruction) in the same way as the input tensor.
dims = 4  # doesn't pick up "dm_mathematics" until component 4....
trd = factor.TRD(dims=dims, positive=[])
M = trd.fit(T)

fig = vis.tensor_traces(M, (steps, tasks, measures))
fig.update_layout(title=f"TRD reconstruction ({dims} components)")
if SAVE:
    fig.write_image("plots/trd-reconstruct.pdf", scale=2)
fig.show()


In [None]:
# Flip the sign of one component in both the model scores (factors[0]) and
# the task loadings (factors[1]). Presumably cosmetic: in a
# sum-of-outer-products model the two sign changes cancel, so the
# reconstruction is unchanged - confirm against factor.TRD.
flip = 1
for mode in (0, 1):
    trd.factors[mode][flip] *= -1


In [None]:
import plotly.graph_objects as go

# Names for the fitted components, plus labels/values for the three modes.
components = [f"Z{i}" for i in range(dims)]
labels = "Model scores", "Task loadings", "Metric loadings"
values = [steps, tasks, measures]

# One line per component: its model scores along the step axis, each
# rescaled by its largest absolute value so every trace lies within [-1, 1].
fig = go.Figure()
for name, scores in zip(components, trd.factors[0]):
    normalised = scores / np.abs(scores).max()
    fig.add_trace(go.Scatter(x=steps, y=normalised, mode='lines', name=name))

# Steps are shown on a log-scaled x axis.
fig.update_layout(
    title="Model Score (normalised)",
    xaxis_title="Step",
    xaxis_type="log",
    yaxis_title="Model Score",
    legend_title="Components",
    width=800,
    height=600,
)

if SAVE:
    fig.write_image("plots/trd-scores.pdf", scale=2)
fig.show()

In [None]:
# Task loadings: one row per task, one column per component.
df = pd.DataFrame(
    data=trd.factors[1].T,
    index=tasks,
    columns=components,
)
# Scale each component by its largest *absolute* loading so every value lies
# in [-1, 1], matching the zmin/zmax of the heatmap below. Dividing by the
# signed maximum would overshoot that range whenever a component's most
# extreme loading is negative, and would invert the signs of a component
# whose loadings are all negative.
dfs = df / df.abs().max(axis=0)
fig = vis.heatmap(
    dfs,
    zmin=-1,
    zmax=1,
    width=5,
    height=9,
    reversescale=True,
    title="Task loadings (normalised)",
)
if SAVE:
    fig.write_image("plots/trd-tasks.png", scale=2)
fig.show()


In [None]:
# Measure loadings, arranged with one row per component and one column per
# measure.
df = pd.DataFrame(
    data=trd.factors[2].T,
    index=measures,
    columns=components,
).T
df.columns.name = "Measure"

# Divide every component (row) by its largest absolute loading.
# (Normalising per measure instead would be: df / df.abs().max())
df_s = df.div(df.abs().max(axis=1), axis=0)

fig = vis.heatmap(
    df_s,
    width=4,
    height=5,
    zmin=-1,
    zmax=1,
    reversescale=True,
    title="Measure loadings (Normalised)",
)

if SAVE:
    fig.write_image("plots/trd-measures.png", scale=2)
fig.show()
