# Power law coefficients
## Searching for an interpretation

In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np
import fitting as fit
import data_utils as dat
import pandas as pd
import vis
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import subprocess

## Coefficient Fits

In [None]:
# Fit an offset power law to every task of one model, then scatter the fitted
# exponent (r) against the fitted offset (y*, plotted as L*).

# be conservative initially
_start = 2000  # we believe this should be in-common


# Is there any pattern over model size?
# Candidate (model code, start step, end step) settings tried so far:
# m_codes = [
    # ('14m', _start, 20000),
    # ('31m',_start, 20000),
    # ('70m', _start, 20000),
    # ('160m', _start, 30000),
    # ('410m-dense', _start, 80000),
#     ('1b', _start, 30000),
# ]

# m_code, start_step, end_step = ('1b', _start, 30000)
m_code, start_step, end_step = ('160m', _start, 30000)  # I believe 160m has a full task list

# The model-size label is the part of the code before any "-" suffix.
msize = m_code.split("-")[0]

# Load the data once; an earlier extra load of "160m" was overwritten before
# it was ever read, so it has been dropped.
df_llc, df_loss = dat.load_dfs(m_code, data_path="data")

tasks = df_llc.columns
colors = vis.assign_cols(tasks)

# Fitted coefficients keyed by task; reused by later cells.
rs = {}
Ls = {}

# Per-task plot series (lists so that several model sizes could be appended).
task_xs = {t: [] for t in tasks}
task_ys = {t: [] for t in tasks}
task_siz = {t: [] for t in tasks}

# Not every model has a full task list
for task in tasks:
    trace = dat.trim_trace(df_llc, df_loss, task, start_step, end_step)
    result = fit.min_fit(trace.x, trace.y, fit.OffsetPowerLaw)
    pars = result.params_dict
    rs[task] = float(pars["r"])
    Ls[task] = float(pars["y*"])

    task_xs[task].append(rs[task])
    task_ys[task].append(Ls[task])
    task_siz[task].append(msize.upper())


fig = go.Figure()

for task in tasks:

    fig.add_trace(go.Scatter(
        x=task_xs[task],
        y=task_ys[task],
        customdata=task_siz[task],  # model size, shown on hover
        marker=dict(
            color=colors[task],
            size=6,
        ),
        mode='markers+lines+text',
        name=task,
        text=task, #task_siz[task],
        textfont=dict(size=8),
        textposition='top right',
        hovertemplate="Model: %{customdata}<br><extra></extra>",
    ))
fig.update_xaxes(title_text="r")
fig.update_yaxes(title_text="L*")
fig.update_layout(title="", width=800, height=600)

fig.show()


In [None]:
# Siphon off the graph results

# One row per task (index named "dataset"): fitted exponent in "R", fitted
# offset in "L".
coeffs = pd.DataFrame({'R': rs, 'L': Ls}).rename_axis("dataset")
# "full" is not one of the per-dataset rows compared below.  Not every model
# has it, so a missing label is ignored instead of raising KeyError.
coeffs = coeffs.drop(index=["full"], errors="ignore")
coeffs

## Try to link coefficients to the dataset
Lossless compression is a good proxy for "memorisation", yes? Look to zip, or Facebook's dictionary-training zstd.
But would it make more sense to look at the single-file compression ratio, or at the mutual information with the Pile?

Usage:

### Solid compression
`cat arxiv/* | zstd -13 -c > archive.solid.zst`

### Separate file compression
`zstd -13 -c arxiv/* > archive-pieces.zst`

### Extract a (text) dictionary
`zstd --train -o arxiv.dict arxiv/`

### Compress with the dictionary
`zstd -13 -D arxiv.dict -c arxiv/* > archive-dicted.zst`

In [None]:
import os
# The later cells build relative paths such as `{task}/*`, which resolve
# against this directory.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel tries to enter samples/samples.
os.chdir('samples') 

In [None]:
# Parameters
import subprocess
import pandas as pd
import os
from pathlib import Path

# Train mushes a bunch of samples into a single file that should fit in context
ref = "full"  # "train"  # Which dataset to refer to
#cmd = f"cat {ref}/* > {ref}.dict"  # for full memory!
max_dict = "1M"  # Will comfortably fit inside the context window
split = 5000  # it truncates large individual samples with a warning - but we can split them up
cmd = f"zstd --train --maxdict={max_dict} -B{split} -o {ref}.dict {ref}/*"
print(cmd)
# check=True: a failed training run must stop the notebook here, otherwise
# later cells would compress against a missing or stale dictionary file.
subprocess.run(cmd, shell=True, check=True);

In [None]:
# Compress each task against the training dictionary
# - one sample at a time so it doesn't peek at context from other samples

# Per-task weights; the keys double as the sample directory names used by the
# compression cells.  Only the relative magnitudes are used later (they are
# normalised by their maximum).
# NOTE(review): presumably each component's size in the Pile — units not
# stated here, TODO confirm.
pile_composition = {
    "arxiv": 120.71,
    "pile-cc": 243.87,
    "dm_mathematics": 16.63,
    "enron_emails": 1.89,
    "freelaw": 82.39,
    "github": 102.18,
    "hackernews": 8.38,
    "nih_exporter": 4.07,
    "pubmed_abstracts": 41.37,
    "pubmed_central": 193.86,
    "stackexchange": 69.14,
    "uspto_backgrounds": 49.19,
    "wikipedia_en": 20.54,
    # Components with no sample directory available:
    # "(missing)books3": 162.61,
    # "(missing)openwebtext2": 134.80,
    # "(missing)gutenberg": 29.20,
    # "(missing)opensubtitles": 20.91,
    # "(missing)ubuntu_irc": 11.84,
    # "(missing)bookcorpus2": 10.15,
    # "(missing)europarl": 9.85,
    # "(missing)youtube_subtitles": 8.02,
    # "(missing)philpapers": 5.11
}


# Placeholder loop: it only prints the task names; the compression itself is
# done in the next cell.
for task in pile_composition:  # don't do full
    print(f"Compressing task: {task}")
    # Do it with and without a dictionary....
    

In [None]:
results = []
level = "13"  # zstd compression level

for task in pile_composition:
    # Three encodings of the same samples, each written to its own archive:
    #   - per-sample frames primed with the reference dictionary,
    #   - per-sample frames with no dictionary (the baseline),
    #   - one "solid" stream over the concatenated samples.
    commands = {
        "with dictionary":
            f"zstd -{level} -D {ref}.dict -c {task}/* > {task}-{ref}.zst",
        "without dictionary":
            f"zstd -{level} -c {task}/* > {task}-indiv.zst",
        "with solid (inter-sample) patterns":
            f"cat {task}/* | zstd -{level} -c > {task}-solid.zst",
    }
    for label, cmd in commands.items():
        print(f"Compress {task} {label}:")
        print(cmd)
        # check=True replaces the previous `assert not ...returncode`, which
        # is silently skipped under `python -O`.  For the piped command the
        # shell reports only the exit status of the final `zstd`.
        subprocess.run(cmd, shell=True, check=True)

In [None]:
# Analyse the file sizes - we're looking for something (anything!!!)
# that correlates with the power law coefficients
import glob

MB = 1000000
# Train mushes a bunch of samples into a single file that should fit in context
ref = "full"  # Which dataset to refer to
results = []

for task in pile_composition:

    # Uncompressed size (~25mb): the summed size of exactly the files matched
    # by the `{task}/*` shell pattern that the compression commands used.
    # Computed in Python rather than with `du -sb`, which is GNU-specific and
    # also counts the directory entry itself.
    sample_files = [p for p in glob.glob(f"{task}/*") if os.path.isfile(p)]
    raw = sum(os.path.getsize(p) for p in sample_files)
    if raw == 0:
        raise FileNotFoundError(f"No sample data found under {task!r}")

    # Each statistic is archive size / raw size, so smaller = more compressible.
    results.append({
        'task': task,
        'baseline': os.path.getsize(f"{task}-indiv.zst") / raw,
        'solid': os.path.getsize(f"{task}-solid.zst") / raw,
        'dictionary': os.path.getsize(f"{task}-{ref}.zst") / raw,
    })

# Create dataframe indexed by task
df = pd.DataFrame(results)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
df.set_index("task", inplace=True)
df

In [None]:
# Relative weight of each task: its pile_composition value divided by the
# largest value, indexed by task name.
tasks = list(pile_composition)
reps = np.array(list(pile_composition.values()))
reps_s = pd.Series(reps / reps.max(), index=tasks)
reps_s

In [None]:
# Derived Components
# All inputs are compressed-size / raw-size ratios, so smaller = better compression.
#df["mutual"] = df["indiv"] - df["dict"]  # How much information you gain by compressing with the "training" dictionary
df["ineffectiveness"] = df["dictionary"] / df["baseline"]  # dictionary-primed size relative to the per-sample baseline (below 1 = the dictionary helps)
df["effectiveness"] = df["baseline"] / df["dictionary"]  # reciprocal of the above
df["irredundancy"] = df["solid"] / df["baseline"]  # solid-stream size relative to the per-sample baseline (below 1 = samples share patterns)
df["redundancy"] = df["baseline"] / df["solid"]  # reciprocal of the above
df["dict advantage"] = df["baseline"] - df["dictionary"]  # fraction of the raw size saved by adding the dictionary
df["solid advantage"] = df["baseline"] - df["solid"]  # fraction of the raw size saved by solid compression
# # And consider ones normalised by size in the dataset?
# df["unique_scaled"] = df["mutual"] * reps_s
# df["indiv_scaled"] = df["indiv"] * reps_s
# df["ideal_scaled"] = df["ideal"] * reps_s

# Any patterns?

In [None]:
def correlate(df_a, df_b, method='pearson'):
    """Cross-correlate every column of ``df_a`` with every column of ``df_b``.

    Args:
        df_a: DataFrame whose columns become the rows of the result.
        df_b: DataFrame whose columns become the columns of the result.
        method: Correlation method passed through to ``Series.corr``.

    Returns:
        A float DataFrame indexed by ``df_a.columns`` with columns
        ``df_b.columns``.  Any pairing that involves a non-numeric column is
        left as NaN.
    """
    is_numeric = pd.api.types.is_numeric_dtype

    # Start from an all-NaN table so skipped (non-numeric) pairings need no
    # explicit assignment.
    table = pd.DataFrame(
        np.nan, index=df_a.columns, columns=df_b.columns, dtype=float
    )
    numeric_b = [name for name in df_b.columns if is_numeric(df_b[name])]

    for row_name in df_a.columns:
        series_a = df_a[row_name]
        if not is_numeric(series_a):
            continue
        for col_name in numeric_b:
            table.loc[row_name, col_name] = series_a.corr(
                df_b[col_name], method=method
            )

    return table


In [None]:
# Correlate each fitted coefficient (rows) with each compression statistic
# (columns), print the table as markdown for pasting, and display it.
# NOTE(review): DataFrame.to_markdown relies on the optional `tabulate`
# package being installed.
corrs = correlate(coeffs, df)
print(corrs.to_markdown(
    floatfmt=".2f",  # Format floats to 2 decimal places
))
corrs

In [None]:
import plotly.express as px
import vis
# Create the scatter plot
colors = vis.assign_cols(tasks)

# Pair each task's compression statistic with its own fitted exponent by
# joining on the task-name index.  The two tables are built from different
# task lists, and arrays passed separately to px.scatter are matched purely
# by position, which would mislabel points if the orders differ.
plot_df = (
    df[["ineffectiveness"]]
    .join(coeffs[["R"]], how="inner")
    .rename_axis("task")
    .reset_index()
)

fig = px.scatter(
    plot_df,
    x="ineffectiveness",
    y="R",
    #text="task",  # Using the common index for annotations
    color="task",
    color_discrete_map=colors,
    labels={
        'R': 'R Coefficient',
        'ineffectiveness': 'Dictionary (in)effectiveness',
    }
)

# Adjust text position and layout
fig.update_traces(
    textposition='top center',  # Position text above points
    mode='markers+text'  # Show both markers and text (no text unless `text=` is enabled above)
)
fig.update_layout(width=800, height=600, title="Exponent vs dictionary effectiveness")
# Display the plot
fig.show()


In [None]:
colors = vis.assign_cols(tasks)

# Pair each task's compression statistic with its own fitted offset by
# joining on the task-name index.  The two tables are built from different
# task lists, and arrays passed separately to px.scatter are matched purely
# by position, which would mislabel points if the orders differ.
plot_df = (
    df[["ineffectiveness"]]
    .join(coeffs[["L"]], how="inner")
    .rename_axis("task")
    .reset_index()
)

fig = px.scatter(
    plot_df,
    x="ineffectiveness",
    y="L",
    #text="task",  # Using the common index for annotations
    color="task",
    color_discrete_map=colors,
    labels={
        'L': 'L Coefficient',
        'ineffectiveness': 'Dictionary (In)effectiveness',
    }
)

# Adjust text position and layout
fig.update_traces(
    textposition='top center',  # Position text above points
    mode='markers+text'  # Show both markers and text (no text unless `text=` is enabled above)
)
fig.update_layout(width=800, height=600, title="L* vs dictionary effectiveness")
# Display the plot
fig.show()

In [None]:
# How strongly the fitted coefficients (columns of `coeffs`) correlate with each other across tasks.
coeffs.corr()


In [None]:
# Tasks that have compression statistics but no fitted coefficients.
# NOTE(review): `compression` is not defined in any visible cell — presumably
# an earlier name for the compression table `df`; confirm before running.
set(compression.index) - set(coeffs.index)

In [None]:
import subprocess
import pandas as pd
import os
import numpy as np

# Dictionary sizes to test (in KB): 10 log-spaced values from 64 to 8192.
sizes = np.exp(np.linspace(np.log(64), np.log(8192), 10)).round().astype(int)
results = []
train_name = "train"

for size in sizes:
    print(f"Processing {size}KB dictionary...")
    size_bytes = size * 1024
    dict_name = f"train_{size}k.zdict"
    archive_name = f"train_{size}k.zst"

    try:
        # check=True on both commands: stderr is discarded, so without it a
        # failed run would go unnoticed until the size lookups below.

        # Train dictionary
        subprocess.run(f"zstd --train --maxdict={size_bytes} -o {dict_name} {train_name}/*",
                       shell=True,
                       stderr=subprocess.DEVNULL,
                       check=True)

        # Compress files
        subprocess.run(f"zstd -D {dict_name} -c {train_name}/* > {archive_name}",
                       shell=True,
                       stderr=subprocess.DEVNULL,
                       check=True)

        # Get sizes; the total charges the dictionary itself as part of the
        # cost of the compressed representation.
        dict_size = os.path.getsize(dict_name)
        compressed_size = os.path.getsize(archive_name)
        total_size = dict_size + compressed_size

        results.append({
            'dict_size_kb': size,
            'dict_size_bytes': dict_size,
            'compressed_size_bytes': compressed_size,
            'total_size_bytes': total_size
        })
    finally:
        # Cleanup, also on failure (the shell redirect creates the archive
        # file even when zstd itself fails).
        for leftover in (dict_name, archive_name):
            if os.path.exists(leftover):
                os.remove(leftover)

# Create dataframe and display results
df = pd.DataFrame(results)
df

In [None]:
import subprocess
from pathlib import Path
import shutil

MB = 1e6


def get_zip_size(name, base=None, compression_level=9):
    """Return the size in bytes of an xz archive of ``<name>.dat``.

    The archive is written next to the data in ``data_dir`` and reused on
    later calls if it already exists.

    Args:
        name: Stem of the data file ``<data_dir>/<name>.dat``.
        base: Optional stem of a second data file.  When given, the raw
            ``<base>.dat`` is streamed *before* ``<name>.dat`` into a single
            xz stream, stored as ``<name>+<base>.xz``, so the compressor can
            reuse patterns from the base while encoding ``name``.
        compression_level: Unused; kept so existing calls keep working.  The
            xz preset is fixed at ``-6 -e``.

    Returns:
        Size of the archive file in bytes.

    Raises:
        FileNotFoundError: If ``base`` is given but ``<base>.dat`` is missing.
        subprocess.CalledProcessError: If the compression pipeline fails.

    Note:
        Earlier versions fed the already-compressed ``<base>.xz`` into the
        stream.  Compressed bytes share no patterns with raw text, so the
        combined archive gained nothing from the base.  Delete any cached
        ``*+<base>.xz`` files produced that way before relying on the result.
    """
    import shlex

    in_file = os.path.join(data_dir, name + ".dat")
    inputs = [in_file]
    archive_stem = name

    if base:
        base_file = os.path.join(data_dir, base + ".dat")
        if not os.path.exists(base_file):
            raise FileNotFoundError(base_file)
        inputs.insert(0, base_file)  # base first, so it is context for `name`
        archive_stem = f"{name}+{base}"

    zip_file = os.path.join(data_dir, archive_stem + ".xz")

    if not os.path.exists(zip_file):
        print("Compressing " + archive_stem)
        # "Solid" compression: one compressed stream over the concatenated
        # inputs, rather than an archive of separately compressed files.
        sources = " ".join(shlex.quote(path) for path in inputs)
        cmd = f"cat {sources} | xz -6 -e > {shlex.quote(zip_file)}"
        try:
            subprocess.run(cmd, shell=True, check=True)
        except subprocess.CalledProcessError:
            # The redirect creates the file even on failure; remove it so a
            # truncated archive is never mistaken for a cached result.
            if os.path.exists(zip_file):
                os.remove(zip_file)
            raise

    return os.path.getsize(zip_file)
    

def analyze_compression(task, holdout):
    """Summarise how well one task file compresses, alone and after a holdout.

    Args:
        task: Stem of the task data file ``<data_dir>/<task>.dat``.
        holdout: Stem of the holdout data file used as shared context.

    Returns:
        dict with the raw, compressed and "mutual" compressed sizes (each
        divided by ``MB``) and the two corresponding ratios to the raw size.
        The mutual size is the combined task+holdout archive size minus the
        holdout-only archive size.
    """
    # Raw size straight from the filesystem.
    raw_bytes = os.path.getsize(os.path.join(data_dir, task + ".dat"))

    # Archive sizes: task alone, holdout alone, then both together.
    alone_bytes = get_zip_size(task)
    holdout_bytes = get_zip_size(holdout)
    combined_bytes = get_zip_size(task, base=holdout)

    # Extra bytes the task costs once the holdout has been paid for.
    extra_bytes = combined_bytes - holdout_bytes

    summary = {}
    summary['raw_size'] = raw_bytes / MB
    summary['compressed_size'] = alone_bytes / MB
    summary['mutual_compressed_size'] = extra_bytes / MB
    summary['compression_ratio'] = alone_bytes / raw_bytes
    summary['mutual_compression_ratio'] = extra_bytes / raw_bytes
    return summary

# improvement:
# shutil.copy(holdout_zip, mutual_zip)
# subprocess.run(['zip', '-9', mutual_zip, task_file], 

results = {}
for task in tasks:
    # there is no "Full" task - skip it by name rather than assuming it is
    # the last entry, which would drop a real task if the order differs.
    if str(task).lower() == "full":
        continue
    out = results[task] = analyze_compression(task, "holdout")
    print(out)

# Create DataFrame (one row per task)
df = pd.DataFrame.from_dict(results, orient='index')


# Round ratios
df['compression_ratio'] = df['compression_ratio'].round(3)
df['mutual_compression_ratio'] = df['mutual_compression_ratio'].round(3)
df

In [None]:

# Display the task list currently in use.
tasks

In [None]:
from tqdm.notebook import tqdm
import os
import time
# Scratch check that the progress bar renders: 20 iterations that only sleep.
for i in tqdm(range(20), total=20):
    time.sleep(.1)
    

In [None]:
# NOTE(review): `batch` is not assigned in any visible cell (it only appears
# in the commented-out sampling loop at the end) — scratch cell, confirm
# before running.
len(batch['text'])


In [None]:
# Always fails. NOTE(review): presumably a deliberate stop so that running
# the notebook top to bottom halts before the cells below — confirm.
assert(False)

In [None]:
def get_size(fname):
    """Sum the column-chunk sizes recorded in a Parquet file's metadata.

    Args:
        fname: Path of the Parquet file to inspect.

    Returns:
        Tuple ``(compressed, raw)``: the totals of ``total_compressed_size``
        and ``total_uncompressed_size`` over every column chunk of every row
        group.
    """
    metadata = pq.ParquetFile(fname).metadata

    compressed = 0
    raw = 0
    # Walk every column chunk of every row group.
    for group_idx in range(metadata.num_row_groups):
        row_group = metadata.row_group(group_idx)
        for col_idx in range(row_group.num_columns):
            chunk = row_group.column(col_idx)
            compressed += chunk.total_compressed_size
            raw += chunk.total_uncompressed_size

    return compressed, raw

In [None]:
from huggingface_hub import HfApi, hf_hub_download
import os
from tqdm import tqdm
import pyarrow.parquet as pq

# For each "timaeus/pile-*" dataset that matches a known task: download its
# data files (up to a size budget), then compare the Parquet column sizes as
# stored against a gzip-recompressed copy.

# Create directories if they don't exist
zip_dir = "dataset_zips"
os.makedirs(zip_dir, exist_ok=True)

# Initialize Hugging Face API
api = HfApi()
datasets = api.list_datasets(search="timaeus/pile-")

results = []
MIB = 1024 * 1024


for dataset in datasets:
    dataset_name = dataset.id
    task = dataset_name.split("pile-")[1]
    if task not in tasks:
        # Only get the files for tasks we have coefficients for
        continue

    # Get list of files, ignoring repository metadata
    files = api.list_repo_files(dataset_name, repo_type="dataset")
    files = [
        f for f in files
        if not (f.endswith(('.json', '.md')) or f == '.gitattributes')
    ]
    if not files:
        continue

    infos = api.get_paths_info(repo_id=dataset_name, paths=files, repo_type="dataset")
    total_size = sum(f.size for f in infos)

    # Download files until we reach target size or run out of files.
    # NOTE(review): TARGET_SAMPLE_SIZE is not assigned in any visible cell;
    # it must be defined before this cell runs.
    downloaded_files = []
    current_size = 0
    for file in files:
        local_file = hf_hub_download(
            repo_id=dataset_name,
            filename=file,
            repo_type="dataset"
        )
        file_size = os.path.getsize(local_file)
        if file_size > 1024:  # Skip tiny files
            downloaded_files.append(local_file)
            current_size += file_size
            #print(f"Added {file}: {file_size / (1024*1024):.2f} MB")

        if current_size >= TARGET_SAMPLE_SIZE:
            break

    snappy = 0  # column bytes as stored in the downloaded files
    zippy = 0   # column bytes after gzip recompression
    raw = 0     # uncompressed column bytes

    for f in downloaded_files:
        # print(f"Loading {f}")

        c, r = get_size(f)
        snappy += c
        raw += r

        # Recompressed copy, cached next to the download.
        # NOTE(review): [:-4] removes only "quet" from ".parquet", giving
        # "<stem>.par_zip.parquet".  Left unchanged so that copies cached by
        # earlier runs are still found.
        f2 = f[:-4] + "_zip.parquet"

        if not os.path.exists(f2):
            print("Recompressing")
            table = pq.read_table(f)
            pq.write_table(table, f2, compression="gzip", compression_level=8)  # max is 9
            del table  # release the table before loading the next file

        c2, r2 = get_size(f2)
        zippy += c2
        print(r, r2)

    if raw == 0:
        # Nothing usable was downloaded (every file was tiny or empty);
        # the ratios below would divide by zero.
        print(f"Skipping {dataset_name}: no column data to measure")
        continue

    item = {
        'task': task,
        'uncompressed_size_mb': raw / MIB,
        'compressed_snappy_mb': snappy / MIB,
        'compressed_zip_mb': zippy / MIB,
        'zip_ratio': zippy / raw,
        'snappy_ratio': snappy / raw,
        'files_analyzed': len(downloaded_files),
        'total_files': len(files),
        'snappy_size': total_size,
    }
    results.append(item)

    print(dataset_name)
    print(f"snappy ratio: {snappy / raw:.0%}")
    print(f"zipped ratio: {zippy / raw:.0%}")
    print(f"snappy size: {total_size/1024/1024:.1f}MB")


# Convert results to DataFrame
sizes = pd.DataFrame(results).set_index("task")

In [None]:
# Print the id of every dataset returned by the "timaeus/pile-" search.
datasets = api.list_datasets(search="timaeus/pile-")

# Resets the shared `results` list; nothing is appended to it in this cell.
results = []

for dataset in datasets:
    print(dataset.id)

In [None]:
# perhaps i'm measuring the wrong thing here - its more like if you train on everything, how much to memorise this specific dataset?
# which is like making a zip of half the data, then looking at adding the new dataset to the mix
# mutual information.. pointing the same direction
# like resolution of compressibility

# Join on the task-name index.  The coefficient table built earlier in this
# notebook is `coeffs`, with the exponent in column "R"; the previous
# `coefs` / "r" names are not defined in any visible cell.
result = sizes.join(coeffs)

fig = go.Figure()
# xcol = "compressed_zip_mb"
xcol = "zip_ratio"
ycol = "R"
fig.add_trace(
    go.Scatter(x=result[xcol], y=result[ycol], mode='markers',
               customdata=result.index,  # task name, shown on hover
               hovertemplate='%{customdata}<br>' + xcol + ': %{x}<br>' + ycol + ': %{y}<extra></extra>'
              )
)

fig.update_layout(
    xaxis_title=xcol,
    yaxis_title=ycol,
    #xaxis_type="log",  # log scale for x axis
    #yaxis_type="log"   # log scale for y axis
    width=400,
    height=400,
)
fig.show()

## Appendix: Loss Scaling

A standard "scaling law" for loss that I've seen (e.g. in Hoffmann et al. 2022, "Training compute-optimal large language models", and Choshen et al., "A Hitchhiker's guide to scaling law estimation") is of the form:

#### Across model size
$ L - L_0 = \frac{A}{P^{\alpha}} + \frac{B}{T^{\beta}} $
Note this 5 parameter model ($L_0, A, B, \alpha, \beta$) is wrt size and time - it is a 3 parameter model wrt time.

#### Fixed model size
$ (L - L_0) = c T^{-r}$, and is in terms of step (T) rather than LLC, and it assumes $T_0 = 0$

T = steps gets a bit handwavy if the learning rate changes, but Choshen et al observed there is still some value to it. 

In [None]:

# Figure and axis labels shared by the traces added below.
fig = go.Figure()
xlabel = "step"
ylabel = "loss"

def L(T, params):
    """Evaluate the offset power law ``L0 + c / T**r``, floored at 1e-8.

    Args:
        T: Step value(s) at which to evaluate the curve.
        params: Sequence ``(log L0, log c, r)``.  The offset and scale are
            stored as logarithms (required for convergence of the fit), so
            both are always positive.

    Returns:
        The curve value(s), clipped below at 1e-8.
    """
    log_offset, log_scale, r = params[0], params[1], params[2]
    offset = np.exp(log_offset)
    scale = np.exp(log_scale)
    return np.maximum(1e-8, offset + scale / (T ** r))

# Record the per-task parameters
params_list = []

# For each task: plot the observed loss, fit the power law in log space, and
# overlay the fitted curve.
for task in tasks:
    # Look the colour up by task name, as the coefficient plot does, rather
    # than pairing `tasks` with `colors` by iteration order.
    col = colors[task]

    # plot as we go
    fig.add_trace(
        go.Scatter(
            x=df_loss_trim.index.values,
            y=df_loss_trim[task].values,
            mode='markers', # 'lines+markers',
            name=task,
            line=dict(color=col),
            marker=dict(color=col),
            hovertemplate=(
                f"Task: {task}<br>" +
                xlabel + ": %{x}<br>" +
                ylabel + ": %{y}<br>" +
                "<extra></extra>"
            ),
            # Same rows as x/y (previously the untrimmed df_loss index).
            customdata=df_loss_trim.index,
        )
    )

    # Fit a power law
    x = df_loss_trim.index.values.astype(float)
    y = df_loss_trim[task].values

    def loss(params):
        # Squared error between log-prediction and log-observation.
        est = L(x, params)
        #return np.sum((est - y)**2)
        return np.sum((np.log(est) - np.log(y))**2)

    # heuristic for optimisation that's not crazy
    par0 = [1., 6., 1.]
    res = minimize(loss, par0, method="L-BFGS-B", jac=grad(loss))
    par = res.x

    # The optimiser works on log L0 and log c; store them under their log
    # names and convert back only for display.
    log_L0, log_c, r = par
    params_list.append({
        'logL0': log_L0,
        'logc': log_c,
        'r': r,
    })
    L0 = np.exp(log_L0)
    c = np.exp(log_c)

    # Evaluate the fitted curve on 100 log-spaced steps across the data range.
    xp = np.exp(np.linspace(np.log(x.min()), np.log(x.max()), 100))
    f = L(xp, par)

    fig.add_trace(
        go.Scatter(
            x=xp,
            y=f,
            mode='lines',
            name=task + "power law",
            line=dict(color=col),
            hovertemplate=(
                f"Task: {task}<br>" +
                f"L0: {L0}<br>" +
                f"c: {c}<br>" +
                f"r: {r}<br>" +
                "<extra></extra>"
            ),
            customdata=df_loss_trim.index,
        )
    )


# Update layout
fig.update_layout(
    xaxis_title=xlabel,
    yaxis_title=ylabel,
    xaxis_type="log",
    yaxis_type="log",
    title="Loss scaling",
    showlegend=True,
    width=800,
    height=600
)

fig.show()

results = pd.DataFrame(params_list)
results.index = tasks
results

In [None]:
# # First, collect a bunch of samples from the "full" dataset
# # It might also be interesting to know the dictionary size...
# dataset_name = "timaeus/dsir-pile-1m-2"
# key = "contents"
# dataset = load_dataset(dataset_name, token=token)["train"]  # are they already shuffled?
# count = 0
# target = 1000
# data_dir = "datasets/samples"
# # samples 21 thru 24 are consecutive, telling me that its not shuffled at all....

# os.makedirs(data_dir, exist_ok=True)
# for batch in tqdm(dataset.iter(batch_size=500), total=2000):
#     for text in batch[key]:
#         with open(f"{data_dir}/sample_{count:03d}", "w") as f:
#             f.write(text)
#         count += 1
#         break
