In [None]:
%load_ext autoreload
%autoreload 2
import powerlaw  as pl  # local
import pandas as pd
import numpy as np
import plotly.graph_objects as go

## Load data

In [None]:
# Model size whose traces are loaded; available options: "70m", "160m", "410m", "1b".
msize = "410m"

# Step window that is later handed to pl.trim_trace when trimming each trace.
start_step = 256
end_step = 80000

# The trace files are kept in the repository, under the "data" directory.
df_llc, df_loss = pl.load_traces(msize, data_path="data")

# Example 1: Compare optimisers on the same model

In [None]:
# Fit a single scaling model to every task with each optimiser and tabulate
# the fitted parameters, goodness-of-fit metrics and (optionally) covariance
# diagnostics -- one row per (task, method) pair.
model = pl.ShiftedPowerLaw

# Column name -> callable(y_true, y_pred); each result becomes a report column.
metrics = {
    "R2_log": pl.logspace_r2
}

# Row label -> fitting routine, invoked below as fit(x, y, model).
fit_methods = {
    "curve_fit": pl.curve_fit,
    "LBFGS": pl.min_fit,
}

# When True, the output of pl.pcov_diagnostics is merged into each row.
compute_diagnostics = True

# Accumulate plain dicts and build the DataFrame once at the end.
rows = []

# NOTE(review): iterating df_llc is assumed to yield the task names -- confirm.
for task in df_llc:
    # The trimmed trace depends only on the task, so compute it once and
    # reuse it for every fit method instead of re-trimming per method.
    x, y, s = pl.trim_trace(df_llc, df_loss, task, start_step, end_step)

    for fit_name, fit in fit_methods.items():
        result = fit(x, y, model)

        # Evaluate the fit on the same points it was trained on.
        y_pred = result.f(x)

        row = {
            "Dataset": task,
            "Method": fit_name,
        }
        row.update(result.params_dict)

        measures = {k: v(y, y_pred) for k, v in metrics.items()}
        row.update(measures)

        if compute_diagnostics:
            row.update(pl.pcov_diagnostics(result, model))

        rows.append(row)

# A list of dicts already receives a default 0..n-1 index, so none is passed.
report = pd.DataFrame(rows)
report


# Example 2: Compare extrapolations with different scaling models

In [None]:
# Configuration for Example 2: the scaling models, tasks and fit settings
# that holdout_prediction iterates over.

# Step window used when trimming each trace.
step_start, step_end = 256, 80000

# Tasks for which a hold-out figure is produced.
tasks = ["arxiv", "github", "cc", "stackexchange", "wikipedia_en"]

# Values passed as `hs=` to the fit method, one fit per entry.
# Use [False, True] to also fit with relative error.
heteroskedastic = [False]

# Legend label -> model class handed to the fit method.
candidate_models = {
    "Power Law": pl.ShiftedPowerLaw,
    "Power Law (4P)": pl.DoubleShiftedPowerLaw,
    "Exponential": pl.ShiftedExponential,
    "Polynomial": pl.PolynomialModel,
}

# Legend label -> fitting routine.
fit_methods = {
    "fit": pl.min_fit,
    # "ODR": pl.odr_fit,   # disabled: can make the polynomial fit go crazy
}

def holdout_prediction(task, n_hold_out=7):
    """Fit each candidate model on the head of a trace and plot its extrapolation.

    The trace for ``task`` is trimmed with ``pl.trim_trace`` between the
    notebook-level ``step_start`` and ``step_end``. Its last ``n_hold_out``
    points are set aside as test data; every combination of ``fit_methods``,
    ``candidate_models`` and ``heteroskedastic`` is fitted on the remaining
    points and drawn across the full x-range, with the mean squared error on
    the held-out points shown in the legend.

    Reads the notebook globals ``df_llc``, ``df_loss``, ``step_start``,
    ``step_end``, ``fit_methods``, ``candidate_models``, ``heteroskedastic``
    and ``msize``.

    Args:
        task: Trace identifier forwarded to ``pl.trim_trace``.
        n_hold_out: Number of trailing points excluded from fitting.

    Returns:
        None. The figure is displayed via ``fig.show()``.

    Raises:
        ValueError: If ``n_hold_out`` is negative or leaves no training points.
    """
    x, y, s = pl.trim_trace(df_llc, df_loss, task, step_start, step_end)

    # Without this guard an oversized n_hold_out makes `cut` negative and the
    # slices below silently hold out a different number of points.
    n_points = len(x)
    if not 0 <= n_hold_out < n_points:
        raise ValueError(
            f"n_hold_out must be in [0, {n_points}) for task {task!r}, got {n_hold_out}"
        )

    # possibly what matters more is the extrapolation ratio?
    cut = n_points - n_hold_out
    x_train, y_train, s_train = x[:cut], y[:cut], s[:cut]
    x_test, y_test, s_test = x[cut:], y[cut:], s[cut:]

    fig = go.Figure()

    # Plot the raw data, training and held-out points in different colours.
    for fold, xs, ys, ss, col in (
        ("Training Data", x_train, y_train, s_train, 'gray'),
        ("Test Data", x_test, y_test, s_test, 'black')):

        fig.add_trace(go.Scatter(
            x=xs, y=ys, customdata=ss,
            mode='markers',
            showlegend=True,
            name=fold,
            marker=dict(color=col, size=8),
            hovertemplate=(
                f"{fold}<br>Task: {task}<br>" +
                "llc: %{x:.2f}<br>loss: %{y:.2f}<br>Step: %{customdata:.0f}<br><extra></extra>"
            )
        ))

    # The plotting grid spans training and held-out x-values and is the same
    # for every fit, so build it once.
    x_p = np.linspace(x[0], x[-1], 200)

    for fit_name, fit_method in fit_methods.items():
        for model_name, model in candidate_models.items():
            for hs in heteroskedastic:
                result = fit_method(x_train, y_train, model, hs=hs)

                # Extrapolation error on the held-out tail only.
                y_pred = result.f(x_test)
                mse = np.mean((y_test - y_pred)**2)

                # Plot fit
                y_p = result.f(x_p)
                hs_name = "HS" if hs else ""
                # Skip empty parts so a missing "HS" tag leaves no double space.
                label = " ".join(
                    part for part in (model_name, fit_name, hs_name) if part
                )

                fig.add_trace(go.Scatter(
                    x=x_p, y=y_p,
                    mode='lines',
                    name=f"{label} (MSE: {mse:.6f})",
                    showlegend=True,
                    hovertemplate=(
                        pl.dict2txt(result.params_dict)
                    )
                ))

    # Update layout
    fig.update_layout(
        title=f'Predictions with {n_hold_out} Points Held Out - {task} @ {msize}',
        xaxis_title='LLC / 100',
        yaxis_title='Loss',
        width=900,
        height=600,
        showlegend=True,
    )
    fig.show()

# Render one hold-out extrapolation figure per configured task, using the
# default number of held-out points.
for task in tasks:
    holdout_prediction(task)