In [None]:
#!pip install ipympl
%load_ext autoreload
%autoreload 2
import fitting as fit
import data_utils as dat
import vis
import plotly.graph_objects as go
import numpy as np
from plotly.subplots import make_subplots
import pandas as pd

In [None]:
# Model selection and data loading.
# `msize` is the part of the model code before the first "-".
mcode = "410m-dense"
msize = mcode.partition("-")[0]
tasks = [
    "github",
    "stackexchange",
    "arxiv",
    "pile-cc",
]  # and full?
df_llc, df_loss = dat.load_dfs(mcode, data_path="data")
colors = vis.assign_cols(df_llc.columns)

# Step window handed to dat.trim_trace / dat.split by the cells below.
step_start, step_end = 2_000, 80_000  # are we cropping too early?
step_cutoff = 20_000
scale = 1e3  # rescale steps for the time-fits only

# Figure geometry - tweak so stuff fits...
fwidth, fheight = 1300, 450

# When True, the plotting cells also write their figure under plots/.
WRITE = False


# Parametric loss over time
Perhaps we should be using RMSE for everything?

In [None]:
# Candidate functions, each given as
# (model class, extra keyword arguments for fit.min_fit, display name).
functions = [
    (fit.OffsetPowerLaw2, {}, "Powerlaw"),
    (fit.OffsetExponential, {"par0": [1., 1., -.1]}, "Exponential"),
]
fnames = [label for _, _, label in functions]

# Grid layout: one row per candidate; left panel = fit on the whole trace,
# right panel = fit on the training part, shown against the held-out part.
titles = [f"{label} - {panel}" for label in fnames for panel in ("Fit", "Holdout")]

fig = make_subplots(
    rows=len(functions),
    cols=2,
    subplot_titles=titles,
    horizontal_spacing=0.1,
    vertical_spacing=0.1,
)

# Results keyed as [candidate name][task]; later cells read these.
loss_fits = {label: {} for label in fnames}  # fitted on the whole trace
loss_vals = {label: {} for label in fnames}  # fitted on the training part only

# Options shared by every scatter of raw data points in this figure.
marker_style = dict(showlegend=False, size=5, xscale=scale)
loss_axis_title = r"$\text{Loss }L$"

for task in tasks:
    llc, loss, steps = dat.trim_trace(df_llc, df_loss, task, step_start, step_end)
    trace = dat.Trace(steps / scale, loss, steps)  # loss vs (rescaled) step
    train, test = dat.split(trace, step_cutoff)
    color = colors[task]
    color2 = vis.add_color(color)  # for heldout data

    for f_ind, (function, args, fname) in enumerate(functions):
        # Fit on the whole trace, then on the training part only
        result = fit.min_fit(trace.x, trace.y, function, **args)
        loss_fits[fname][task] = result
        v_result = fit.min_fit(train.x, train.y, function, **args)
        loss_vals[fname][task] = v_result

        # RMSE of the full fit on all points, and of the training fit on the holdout
        y_fit = result.f(trace.x)
        y_val = v_result.f(test.x)
        RMSE_fit = fit.rmse(trace.y, y_fit)
        RMSE_val = fit.rmse(test.y, y_val)
        description = f"{task}<br>RMSE={RMSE_fit:.4f} (fit),<br>RMSE={RMSE_val:.4f} (holdout)"

        # Left column: all data points with the full fit (this trace carries the legend entry)
        subplot = {"row": f_ind + 1, "col": 1}
        vis.plot_data(fig, trace.x, trace.y, color=color, subplot=subplot, **marker_style)
        vis.plot_result(
            fig, trace.x, result,
            name=description, xscale=scale, color=color, subplot=subplot,
            showlegend=True, legendgroup=fname,
        )
        fig.update_xaxes(title_text="Step", type="log", **subplot)
        fig.update_yaxes(title_text=loss_axis_title, **subplot)

        # Right column: training points, held-out points, and the training-only fit
        subplot = {"row": f_ind + 1, "col": 2}
        vis.plot_data(fig, train.x, train.y, color=color, subplot=subplot, **marker_style)
        vis.plot_data(fig, test.x, test.y, color=color2, subplot=subplot, **marker_style)
        vis.plot_result(
            fig, trace.x, v_result,
            name=task, xscale=scale, color=color, showlegend=False, subplot=subplot,
        )
        fig.update_xaxes(title_text="Step", type="log", **subplot)
        fig.update_yaxes(title_text=loss_axis_title, **subplot)


fig.update_layout(
    title="",
    width=fwidth,
    height=fheight * len(functions),
    showlegend=True,
    legend_tracegroupgap=250,  # annoying - have to eyeball this
)

if WRITE:
    import time

    fname = f"plots/parametric_loss_{msize}.pdf"
    # Yes, the image is written twice on purpose: repeating the export avoids
    # the "loading mathjax" box in the bottom left of the PDF.
    fig.write_image(fname)
    print("Waiting")
    time.sleep(1)
    fig.write_image(fname)
    print(f"Done. See {fname}")

fig.show()  # as soon as you include latex you lose the ability to see the labels in html

# Let's just outright reject exponential - it's not a candidate
if "Exponential" in loss_fits:
    loss_fits.pop("Exponential")
    loss_vals.pop("Exponential")

# Clear winner: the power law --> delete exponential

# Step 2: LLC vs time

In [None]:
# Basically a cut'n'paste job of the loss-vs-step cell, with the LLC as the
# fitted quantity instead of the loss.

# Candidate functions, each given as
# (model class, extra keyword arguments for fit.min_fit, display name).
functions = [
    (fit.OffsetPowerLaw, dict(par0=[10., -10., .1]), "Powerlaw"),
    (fit.OffsetLogarithm, {}, "Logarithm"),
]

# Make a grid layout: one row per candidate, "Fit" on the left and "Holdout" on the right
titles = []
for f in functions:
    for s in ["Fit", "Holdout"]:
        titles.append(f"{f[2]} - {s}")

fig = make_subplots(
    rows=len(functions), cols=2,
    subplot_titles=titles,
    horizontal_spacing=0.1,
    vertical_spacing=0.1,
)

# NOTE(review): `report` is never read or appended to in the visible cells;
# kept in case something further down relies on it.
report = []
fnames = [f[2] for f in functions]
# Results keyed as [candidate name][task]; the LLC-vs-loss cell reads both.
llc_fits = {f: {} for f in fnames}  # fitted on the whole trace
llc_vals = {f: {} for f in fnames}  # fitted on the training part only

llc_axis_title = r"$\text{Estimated and transformed LLC }\,\frac{1}{100}\hat{\lambda}$"

for task in tasks:
    # FIX: unpack into `steps`. The original bound `step` here but then used
    # `steps`, which only existed as a leftover from the last iteration of the
    # previous cell's loop -- i.e. every task was plotted against stale steps
    # (and the cell raised NameError when run on its own).
    llc, loss, steps = dat.trim_trace(df_llc, df_loss, task, step_start, step_end)
    trace = dat.Trace(steps/scale, llc, steps)  # llc vs step
    train, test = dat.split(trace, step_cutoff)
    color = colors[task]
    color2 = vis.add_color(color)  # for heldout data

    for f_ind, (function, args, fname) in enumerate(functions):
        # Fit on the whole trace and on the training part; both are saved for later cells
        llc_fits[fname][task] = result = fit.min_fit(trace.x, trace.y, function, **args)
        llc_vals[fname][task] = v_result = fit.min_fit(train.x, train.y, function, **args)

        # Evaluate the metrics: full fit on all points, training fit on the holdout
        y_fit = result.f(trace.x)
        RMSE_fit = fit.rmse(trace.y, y_fit)
        y_val = v_result.f(test.x)
        RMSE_val = fit.rmse(test.y, y_val)

        print(function.name, task, result.params_dict)

        description = f"{task}<br>RMSE={RMSE_fit:.4f} (fit),<br>RMSE={RMSE_val:.4f} (holdout)"

        # Left column: plot the fit (this trace carries the legend entry)
        subplot = dict(row=f_ind+1, col=1)
        vis.plot_data(fig, trace.x, trace.y, color=color, showlegend=False, size=5, xscale=scale, subplot=subplot)
        vis.plot_result(fig, trace.x, result, name=description, xscale=scale, color=color, subplot=subplot,
                        showlegend=True, legendgroup=fname)
        fig.update_xaxes(title_text="Step", type="log", **subplot)
        fig.update_yaxes(title_text=llc_axis_title, **subplot)

        # Right column: plot the validation (training points, held-out points, training-only fit)
        subplot = dict(row=f_ind+1, col=2)
        vis.plot_data(fig, train.x, train.y, color=color, showlegend=False, size=5, xscale=scale, subplot=subplot)
        vis.plot_data(fig, test.x, test.y, color=color2, showlegend=False, size=5, xscale=scale, subplot=subplot)
        vis.plot_result(fig, trace.x, v_result, name=task, xscale=scale, color=color, showlegend=False, subplot=subplot)
        fig.update_xaxes(title_text="Step", type="log", **subplot)
        fig.update_yaxes(title_text=llc_axis_title, **subplot)



fig.update_layout(
    title="",
    width=fwidth,
    height=fheight* len(functions),
    showlegend=True,
    legend_tracegroupgap=250,  # annoying - have to eyeball this
)

if WRITE:
    fname = f"plots/parametric_llc_{msize}.pdf"
    fig.write_image(fname)
    print(f"Done. See {fname}")


fig.show()


# Step 3: LLC vs loss

In [None]:
# Direct-fit candidates (all currently disabled):
functions = [
    #(fit.DoubleOffsetPowerLaw, {}, "Powerlaw (4 param)"),
    #(fit.OffsetPowerLaw, {}, "Direct Powerlaw (3 par)"),
    #(fit.OffsetExponential, {}, "Exponential (3 param)"),
]

# Composite candidates: one entry per (LLC-fit name, loss-fit name) pair.
# The "construct" marker tells the plotting cell to combine the two saved
# time-fits instead of fitting loss against LLC from scratch.
functions += [
    ("construct", (a, b), f"{a}-{b}")
    for a in llc_fits
    for b in loss_fits
]
functions

In [None]:


def _compose(llc_res, loss_res):
    """Chain an LLC-vs-step fit with a loss-vs-step fit into loss as a function of LLC.

    The returned callable maps an LLC value ``x`` to a latent step by inverting
    the LLC fit (``llc_res.model.inverse(x, llc_res.params)``) and then
    evaluates the loss fit at that step (``loss_res.f``).

    Both fit results are bound as arguments of this factory, so the returned
    callable keeps referring to exactly these two results. Functions defined
    directly inside a loop body look their free variables up at call time and
    would silently pick up values from a later iteration if called afterwards.
    """
    def composed(x):
        # Compute the latent step
        vstep = llc_res.model.inverse(x, llc_res.params)
        return loss_res.f(vstep)

    return composed


# Make a grid layout: one row per candidate, "Fit" on the left and "Holdout" on the right
titles = []
for f in functions:
    for s in ["Fit", "Holdout"]:
        titles.append(f"{f[2]} - {s}")

fig = make_subplots(
    rows=len(functions), cols=2,
    subplot_titles=titles,
    horizontal_spacing=0.1,
    vertical_spacing=0.1,
)

llc_axis_title = r"$\text{Estimated and transformed LLC }\,\frac{1}{100}\hat{\lambda}$"

for task in tasks:
    # The trimmed trace is used directly here; the axis labels below put the
    # LLC on x and the loss on y.
    trace = dat.trim_trace(df_llc, df_loss, task, step_start, step_end)
    train, test = dat.split(trace, step_cutoff)
    color = colors[task]
    color2 = vis.add_color(color)  # for heldout data

    for f_ind, (function, args, fname) in enumerate(functions):

        if function == "construct":
            # Rather than fitting loss vs llc
            # We can mash the results together and make a "result" for it.
            # args = (name of the LLC time-fit, name of the loss time-fit)

            # Fits on the whole trace
            llc_result = llc_fits[args[0]][task]
            loss_result = loss_fits[args[1]][task]
            mash = _compose(llc_result, loss_result)

            # Fits on the training part only
            llc_vresult = llc_vals[args[0]][task]
            loss_vresult = loss_vals[args[1]][task]
            vmash = _compose(llc_vresult, loss_vresult)

            # Wrap the composed callables so they can be used like any other fit result
            result = fit.FitResult(mash, None, {}, None, None, None)
            v_result = fit.FitResult(vmash, None, {}, None, None, None)

        else:
            # Fit the trace from scratch
            result = fit.min_fit(trace.x, trace.y, function, **args)
            v_result = fit.min_fit(train.x, train.y, function, **args)

        # Evaluate the metrics: full fit on all points, training fit on the holdout
        y_fit = result.f(trace.x)
        RMSE_fit = fit.rmse(trace.y, y_fit)
        y_val = v_result.f(test.x)
        RMSE_val = fit.rmse(test.y, y_val)
        description = f"{task}<br>RMSE={RMSE_fit:.5f} (fit),<br>RMSE={RMSE_val:.5f} (holdout)"

        # Left column: plot the fit (this trace carries the legend entry)
        subplot = dict(row=f_ind+1, col=1)
        vis.plot_data(fig, trace.x, trace.y, color=color, showlegend=False, size=5, subplot=subplot)
        vis.plot_result(fig, trace.x, result, name=description, color=color, subplot=subplot,
                        showlegend=True, legendgroup=fname)
        fig.update_yaxes(title_text=r"$\text{Loss }L$", **subplot)
        fig.update_xaxes(title_text=llc_axis_title, **subplot)

        # Right column: plot the validation (training points, held-out points, training-only fit)
        subplot = dict(row=f_ind+1, col=2)
        vis.plot_data(fig, train.x, train.y, color=color, showlegend=False, size=5, subplot=subplot)
        vis.plot_data(fig, test.x, test.y, color=color2, showlegend=False, size=5, subplot=subplot)
        vis.plot_result(fig, trace.x, v_result, name=task, color=color, showlegend=False, subplot=subplot)
        fig.update_yaxes(title_text=r"$\text{Loss }L$", **subplot)
        fig.update_xaxes(title_text=llc_axis_title, **subplot)


fig.update_layout(
    title="",
    width=fwidth,
    height=fheight* len(functions),
    showlegend=True,
    legend_tracegroupgap=260,  # annoying - have to eyeball this
)

if WRITE:
    fname = f"plots/parametric_trajectory_{msize}.pdf"
    fig.write_image(fname)
    print(f"Done. See {fname}")

fig.show()

In [None]:
# Configure experiment: fit one candidate on the training part of a single
# task and report its RMSE on the held-out part.
mcode = "410m-dense"
step_start, step_end = 2000, 80000  # As long as it is consistent
step_cutoff = 20000  # TODO: investigate if this point is in the train or the test set
function = fit.OffsetPowerLaw
fit_method = fit.min_fit  # odr_fit is also an option

# Load the data and split the "github" trace at the cutoff
df_llc, df_loss = dat.load_dfs(mcode, data_path="data")
trace = dat.trim_trace(df_llc, df_loss, "github", step_start, step_end)
train, test = dat.split(trace, step_cutoff)

# Using the oracle parameters as an initialisation is not *technically*
# cheating, but it does seem to make a difference:
#oracle = fit_method(trace.x, trace.y, function)
#result = fit_method(train.x, train.y, function, par0=oracle.params)

result = fit_method(train.x, train.y, function)

# Holdout error of the training-only fit
y_pred = result.f(test.x)
rmse = fit.rmse(test.y, y_pred)
rmse
