In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np
import fitting as fit
import data_utils as dat
import vis
import plotly.graph_objects as go

# Train/Test Splits

Problem: how do we extract the "interesting interval" across different models and tasks?

In [None]:
# Available sizes: ['14m', '31m', '70m', '160m', '410m', '1b']
msizes = ['160m', '410m']
tasks = ["arxiv", "wikipedia_en"]

# Candidate functional forms, keyed by the label used for their trace.
# Currently disabled candidates:
#   "Power Law (4P)": cf.DoubleShiftedPowerLaw
#   "ExpExp": cf.DoubleExponential
functions = dict([
    ("Power Law", fit.ShiftedPowerLaw),
    ("Exponential", fit.ShiftedExponential),
])

# TODO: we need this for every model (and maybe for every model/task combination)
step_start, step_end = 512, 80000  # are we cropping too early?
step_cutoff = 10000  # handed to dat.split to separate train from test

for msize in msizes:
    # One load per model size; both frames are reused for every task below.
    df_llc, df_loss = dat.load_dfs(msize, data_path="data")

    for task in tasks:
        trace = dat.trim_trace(df_llc, df_loss, task, step_start, step_end)
        train, test = dat.split(trace, step_cutoff)

        fig = go.Figure()
        vis.plot_split(fig, train, test)

        # Fit every candidate on the training part only, then draw it over trace.x.
        for model_name, model in functions.items():
            result = fit.min_fit(train.x, train.y, model)
            vis.plot_result(fig, trace.x, result, name=model_name)

        layout = {
            "title": f'Predictions after step {step_cutoff} - {task} @ {msize}',
            "xaxis_title": 'LLC / 100',
            "yaxis_title": 'Loss',
            "width": 900,
            "height": 600,
        }
        fig.update_layout(**layout)
        fig.show()

# The plots in shifted space don't look so good

While plotting log(y - y*) makes the fit look amazing, it's using the function's own fitted parameters to distort the space, so of course it looks amazing.
However, it can make the unseen data look worse.

In [None]:
# Plots in shifted space only make sense for the power law
msizes = ['160m', '410m']
tasks = ["arxiv", "wikipedia_en"]
# The fit below has always used ShiftedPowerLaw2; `function` used to name
# fit.ShiftedPowerLaw but was never read, so the cell claimed one model and
# fitted another. It is now the single source of truth for the fitted form.
function = fit.ShiftedPowerLaw2

# TODO: we need this for every model (and maybe for every model/task combination)
step_start = 512  # are we cropping too early?
step_end = 80000
step_cutoff = 10000  # handed to dat.split to separate train from test

for msize in msizes:
    df_llc, df_loss = dat.load_dfs(msize, data_path="data")

    for task in tasks:
        trace = dat.trim_trace(df_llc, df_loss, task, step_start, step_end)
        train, test = dat.split(trace, step_cutoff)
        result = fit.min_fit(train.x, train.y, function)

        fig = go.Figure()
        shift = result.params_dict  # contains y*
        vis.plot_split(fig, train, test, shift=shift)
        vis.plot_result(fig, trace.x, result, name="Power law fit", shift=shift)

        # Only title and size go through update_layout. Passing xaxis_title /
        # yaxis_title here would overwrite the shifted-space axis titles set below.
        fig.update_layout(
            title=f'Predictions after step {step_cutoff} - {task} @ {msize}',
            width=900, height=600,
        )

        # As we're using shifted powerlaw, we should go back to logspace
        # NOTE(review): the rest of this notebook puts LLC on x and loss on y, so the
        # shifted titles follow that orientation - confirm vis.plot_split/plot_result
        # do not swap the axes when `shift` is given.
        fig.update_xaxes(title_text="LLC - LLC*", type="log")
        fig.update_yaxes(title_text="L - L*", type="log")
        fig.show()

# Do fit methods make a difference?

### ANSWER: a bit but not so much as to change which functional form is preferred.


In [None]:
msize = "410m"
df_llc, df_loss = dat.load_dfs(msize, data_path="data")
tasks = ["arxiv", "wikipedia_en", "full"]

# Functional forms to compare, keyed by the label used in the trace name.
functions = {
    "Power Law": fit.ShiftedPowerLaw2,
    "Exponential": fit.ShiftedExponential,
}
# Fitting routines to compare; each is called as fn(x, y, model, rel_noise=...).
fit_methods = {
    "minimize": fit.min_fit,
    "ODR": fit.odr_fit,
}
# Trace-name suffix -> value forwarded as the `rel_noise` keyword of the fit routine.
hk_noise = {
    "rel_noise": True,
    "": False,
}

# Trimming / split settings. This cell used to rely on whatever values an earlier
# cell had left in the kernel; they are declared here (same values as the other
# experiments) so the cell does not depend on hidden state.
# TODO: we need this for every model (and maybe for every model/task combination)
step_start = 512
step_end = 80000
step_cutoff = 10000  # handed to dat.split to separate train from test

for task in tasks:
    trace = dat.trim_trace(df_llc, df_loss, task, step_start, step_end)
    train, test = dat.split(trace, step_cutoff)

    fig = go.Figure()
    vis.plot_split(fig, train, test)

    # One trace per (functional form, fit method, noise setting) combination,
    # always fitted on the training part only.
    for model_name, model in functions.items():
        for fit_name, fit_fn in fit_methods.items():
            for rel_name, rel_noise in hk_noise.items():
                result = fit_fn(train.x, train.y, model, rel_noise=rel_noise)
                vis.plot_result(
                    fig, trace.x, result,
                    name=f"{model_name} {fit_name} {rel_name}"
                )

    fig.update_layout(
        title=f'Predictions after step {step_cutoff} - {task} @ {msize}',
        xaxis_title='LLC / 100',
        yaxis_title='Loss',
        width=900, height=600,
    )
    fig.show()

# Can we use pcov to estimate uncertainty?
Yes, but it's an approximation?

In [None]:
# Available sizes: ['14m', '31m', '70m', '160m', '410m', '1b']
msizes = ['160m', '410m']
tasks = ["arxiv", "wikipedia_en"]

# Candidate functional forms, keyed by the label used for their trace.
# Currently disabled candidates:
#   "Power Law (4P)": cf.DoubleShiftedPowerLaw
#   "ExpExp": cf.DoubleExponential
functions = dict([
    ("Power Law", fit.ShiftedPowerLaw),
    ("Exponential", fit.ShiftedExponential),
])

# TODO: we need this for every model (and maybe for every model/task combination)
step_start, step_end = 512, 80000  # are we cropping too early?
step_cutoff = 10000  # handed to dat.split to separate train from test

# One colour per candidate, looked up by label when its samples are drawn.
colors = vis.assign_cols(functions)

for msize in msizes:
    # One load per model size; both frames are reused for every task below.
    df_llc, df_loss = dat.load_dfs(msize, data_path="data")

    for task in tasks:
        trace = dat.trim_trace(df_llc, df_loss, task, step_start, step_end)
        train, test = dat.split(trace, step_cutoff)

        fig = go.Figure()
        vis.plot_split(fig, train, test)

        # Fit every candidate on the training part, then score it on the held-out part.
        for model_name, model in functions.items():
            result = fit.min_fit(train.x, train.y, model)

            # NOTE(review): 30 is presumably the number of draws, and the sampling
            # is not seeded here - confirm against fit's result.sample.
            _, y_test_mu, y_test_std = result.sample(test.x, 30)
            llh = fit.normal_log_likelihood(test.y, y_test_mu, y_test_std)

            label = f"{model_name}: LLH~={llh:.1f}"
            vis.sample_result(fig, trace.x, result, colors[model_name], name=label)

        layout = {
            "title": f'Predictions after step {step_cutoff} - {task} @ {msize}',
            "xaxis_title": 'LLC / 100',
            "yaxis_title": 'Loss',
            "width": 900,
            "height": 600,
        }
        fig.update_layout(**layout)
        fig.show()