<!-- # Copyright (c) 2024 Graphcore Ltd. All rights reserved. -->

# Collect value statistics for formats

This notebook presents various statistics for a variety of float formats.
Some of these are present on the `FormatInfo` class, and are presented for all formats.
Others are obtained by enumerating all values (only for the ≤16-bit formats).

## Statistics obtained from FormatInfo

 - name: Format
 - B: Bits in the format
 - P: Precision in bits
 - E: Exponent field width in bits
 - T: Trailing significand field width in bits
 - max: Largest finite value
 - min: Most negative finite value (typically `-max`, unless the format is two's complement)
 - smallest: Smallest positive value
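 - smallest_normal: Smallest positive normal value, NaN if all finite values are subnormal
 - num_nans: Number of NaN values in the format
 - infs: Number of infinities (2 if the format has infinities, otherwise 0)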
 

In [1]:
%run utils.py
D = pandas_render  # from utils
import pandas
from functools import partial
from gfloat import *
from gfloat.formats import *

import numpy as np


# Special rendering for float values -
# if they don't round-trip through 5 significant digits (".5g"),
# prepend with "≈", or render as rational
def render_float(approx: bool, v):
    """Render one table cell as a compact string.

    Args:
        approx: Selects the fallback for floats that are not exactly
            reproduced by five significant decimal digits. If true, the
            rounded decimal is returned with a "≈" prefix; if false, the
            value is handed to ``float_pow2str``.
        v: The cell value. Anything that is not a ``float`` is returned
            unchanged via ``str``.

    Returns:
        ``"n/a"`` for NaN, the plain decimal when it round-trips exactly,
        otherwise the fallback selected by ``approx``.
    """
    if not isinstance(v, float):
        return str(v)

    if np.isnan(v):
        return "n/a"

    # No field width: a padded format such as "8.5g" returns short values with
    # leading blanks, which then sit between the "≈" prefix and the digits
    # (e.g. "≈ 0.11719" beside "≈0.0068359").
    s = f"{v:.5g}"
    if float(s) == v:
        return s

    if approx:
        return "≈" + s

    return float_pow2str(v)


def collect_stats(fi: FormatInfo):
    """Build one summary-table row from the attributes of a single format.

    Args:
        fi: The format to describe.

    Returns:
        A dict keyed by column name: ``name``, ``B``, ``P``, ``E``,
        ``smallest``, ``smallest_normal``, ``max``, ``num_nans`` and ``infs``.
        ``smallest_normal`` is NaN when ``fi.is_all_subnormal`` is true, and
        ``infs`` is 2 when ``fi.has_infs`` is true, otherwise 0.
    """
    # Only read fi.smallest_normal for formats that are not all-subnormal.
    if fi.is_all_subnormal:
        smallest_normal = np.nan
    else:
        smallest_normal = fi.smallest_normal

    inf_count = 2 if fi.has_infs else 0

    row = {
        "name": fi.name,
        "B": fi.bits,
        "P": fi.precision,
        "E": fi.expBits,
        "smallest": fi.smallest,
        "smallest_normal": smallest_normal,
        "max": fi.max,
        # Stored as a float, so the table renderer treats it like the other
        # float columns.
        "num_nans": float(fi.num_nans),
        "infs": inf_count,
    }
    return row


# One summary row per format in `all_formats`, gathered into a DataFrame.
stats = [collect_stats(fi) for fi in all_formats]
df = pandas.DataFrame(stats)
# Display via D (pandas_render from utils); approx=True selects the "≈" form
# for floats that render_float cannot show exactly.
D(df, format=partial(render_float, True))

name,B,P,E,smallest,smallest_normal,max,num_nans,infs
ocp_e2m1,4,2,2,0.5,1,6,0,0
ocp_e2m3,6,4,2,0.125,1,7.5,0,0
ocp_e3m2,6,3,3,0.0625,0.25,28,0,0
ocp_e4m3,8,4,4,≈0.0019531,0.015625,448,2,0
ocp_e5m2,8,3,5,≈1.5259e-05,≈6.1035e-05,57344,6,2
p3109_p1,8,1,7,≈2.1684e-19,≈2.1684e-19,≈9.2234e+18,1,2
p3109_p2,8,2,6,≈2.3283e-10,≈4.6566e-10,≈2.1475e+09,1,2
p3109_p3,8,3,5,≈7.6294e-06,≈3.0518e-05,49152,1,2
p3109_p4,8,4,4,≈0.00097656,0.0078125,224,1,2
p3109_p5,8,5,3,0.0078125,0.125,15,1,2



## Statistics computed by exhaustive inspection

 - lt1: Number of values x such that `0 < x < 1`
 - gt1: Number of values x such that `1 < x < Inf`
 - rt16: True if all finite values are exactly representable in IEEE binary16
 - min/maxSubnormal: Smallest/largest positive subnormal value, "n/a" if no values are subnormal
 - min/maxNormal: Smallest/largest positive normal value, "n/a" if no values are normal


In [2]:
def compute_stats(fi: FormatInfo):
    """Compute statistics for a format by decoding every one of its code points.

    All ``2**fi.bits`` codes are decoded with ``decode_float``. The results are
    cross-checked (via ``assert``) against ``fi.max``, ``fi.min``,
    ``fi.smallest_normal``, ``fi.smallest_subnormal`` and ``fi.smallest``.

    Args:
        fi: The format to enumerate.

    Returns:
        A dict keyed by column name: ``name``, ``B``, ``P``, ``E``, ``rt16``,
        ``lt1``, ``gt1``, ``minSubnormal``, ``maxSubnormal``, ``minNormal``
        and ``maxNormal``. The four sub/normal entries are NaN when the format
        has no positive value of that class.

    Raises:
        AssertionError: If an enumerated statistic disagrees with ``fi``, or
            if some finite value does not round-trip through float32.
    """

    def extremes(series):
        # (smallest, largest) entry of the series, looked up by index label
        return series.loc[series.idxmin()], series.loc[series.idxmax()]

    def roundtrips(dtype):
        # Elementwise: value survives a cast to `dtype`, or is not finite
        return (np.float64(dtype(fval)) == np.float64(fval)) | ~np.isfinite(fval)

    # Decode every code point into one table row
    decoded = pandas.DataFrame([decode_float(fi, code) for code in range(2**fi.bits)])
    fval = decoded["fval"]
    is_positive = fval > 0

    # lt1, gt1: counts strictly inside (0, 1) and (1, Inf)
    count_below_one = fval.between(0, 1, inclusive="neither").sum()
    count_above_one = fval.between(1, np.inf, inclusive="neither").sum()

    # Finite extremes must agree with the format's own max/min
    lo_finite, hi_finite = extremes(fval[np.isfinite(fval)])
    assert hi_finite == fi.max
    assert lo_finite == fi.min

    # Positive normal values
    normals = fval[(decoded["fclass"] == FloatClass.NORMAL) & is_positive]
    if normals.any():
        minNormal, maxNormal = extremes(normals)
    else:
        minNormal, maxNormal = np.nan, np.nan
    assert np.isnan(maxNormal) or maxNormal == fi.max
    assert np.isnan(minNormal) or minNormal == fi.smallest_normal

    # Positive subnormal values
    subnormals = fval[(decoded["fclass"] == FloatClass.SUBNORMAL) & is_positive]
    if subnormals.any():
        minSubnormal, maxSubnormal = extremes(subnormals)
    else:
        minSubnormal, maxSubnormal = np.nan, np.nan
    assert np.isnan(minSubnormal) or minSubnormal == fi.smallest_subnormal

    # The overall smallest positive value is whichever class minimum exists
    assert np.nanmin([minSubnormal, minNormal]) == fi.smallest

    # Round-trips through float16/float32; overflow warnings from the narrowing
    # casts are silenced
    with np.errstate(over="ignore"):
        rt16_each = roundtrips(np.float16)
        rt32_each = roundtrips(np.float32)

    rt16 = rt16_each.all()
    rt32 = rt32_each.all()
    assert rt32  # If not, we should include rt32 in the table

    # Assemble the table row
    return {
        "name": fi.name,
        "B": fi.bits,
        "P": fi.precision,
        "E": fi.expBits,
        "rt16": rt16,
        "lt1": count_below_one,
        "gt1": count_above_one,
        "minSubnormal": minSubnormal,
        "maxSubnormal": maxSubnormal,
        "minNormal": minNormal,
        "maxNormal": maxNormal,
    }


# Exhaustive statistics, restricted to formats of at most 16 bits because
# compute_stats decodes all 2**bits code points.
stats = [compute_stats(fi) for fi in all_formats if fi.bits <= 16]
df2 = pandas.DataFrame(stats)
# Display with approx=True: inexact floats are shown in the "≈" form.
D(df2, format=partial(render_float, True))

name,B,P,E,rt16,lt1,gt1,minSubnormal,maxSubnormal,minNormal,maxNormal
ocp_e2m1,4,2,2,True,1,5,0.5,0.5,1,6
ocp_e2m3,6,4,2,True,7,23,0.125,0.875,1,7.5
ocp_e3m2,6,3,3,True,11,19,0.0625,0.1875,0.25,28
ocp_e4m3,8,4,4,True,55,70,≈0.0019531,≈0.013672,0.015625,448
ocp_e5m2,8,3,5,True,59,63,≈1.5259e-05,≈4.5776e-05,≈6.1035e-05,57344
p3109_p1,8,1,7,False,62,63,,,≈2.1684e-19,≈9.2234e+18
p3109_p2,8,2,6,False,63,62,≈2.3283e-10,≈2.3283e-10,≈4.6566e-10,≈2.1475e+09
p3109_p3,8,3,5,True,63,62,≈7.6294e-06,≈2.2888e-05,≈3.0518e-05,49152
p3109_p4,8,4,4,True,63,62,≈0.00097656,≈0.0068359,0.0078125,224
p3109_p5,8,5,3,True,63,62,0.0078125,≈ 0.11719,0.125,15


### Emit the same table, but with exact values

In this table, float values are printed as decimals, unless the decimals are not an
exact representation of the value, in which case, they are printed as rationals (between 1 and 2) times 2^E.

In [3]:
# Same table as above, but approx=False: floats that render_float cannot show
# exactly as a decimal are passed to float_pow2str instead of getting a "≈".
D(df2, format=partial(render_float, False))

name,B,P,E,rt16,lt1,gt1,minSubnormal,maxSubnormal,minNormal,maxNormal
ocp_e2m1,4,2,2,True,1,5,0.5,0.5,1,6
ocp_e2m3,6,4,2,True,7,23,0.125,0.875,1,7.5
ocp_e3m2,6,3,3,True,11,19,0.0625,0.1875,0.25,28
ocp_e4m3,8,4,4,True,55,70,2^-9,7/4*2^-7,0.015625,448
ocp_e5m2,8,3,5,True,59,63,2^-16,3/2*2^-15,2^-14,57344
p3109_p1,8,1,7,False,62,63,,,2^-62,2^63
p3109_p2,8,2,6,False,63,62,2^-32,2^-32,2^-31,2^31
p3109_p3,8,3,5,True,63,62,2^-17,3/2*2^-16,2^-15,49152
p3109_p4,8,4,4,True,63,62,2^-10,7/4*2^-8,0.0078125,224
p3109_p5,8,5,3,True,63,62,0.0078125,15/8*2^-4,0.125,15


## Tables in RST/Markdown

These are used to generate the gfloat documentation, but may be of use in other
contexts, so they are left here.

In [4]:
# Print the FormatInfo table (df) as reStructuredText, approximate rendering.
from tabulate import tabulate

# Render every cell to a string first so tabulate prints it verbatim
dfstr = df.map(lambda x: render_float(True, x))
print(
    # Any " nan" text left in the rendered table is shown as " n/a"
    tabulate(dfstr, df.columns, tablefmt="rst", showindex=False).replace(" nan", " n/a")
)

name        B    P    E  smallest     smallest_normal    max           num_nans       infs
ocp_e2m1    4    2    2  0.5          1                  6             0                 0
ocp_e2m3    6    4    2  0.125        1                  7.5           0                 0
ocp_e3m2    6    3    3  0.0625       0.25               28            0                 0
ocp_e4m3    8    4    4  ≈0.0019531   0.015625           448           2                 0
ocp_e5m2    8    3    5  ≈1.5259e-05  ≈6.1035e-05        57344         6                 2
p3109_p1    8    1    7  ≈2.1684e-19  ≈2.1684e-19        ≈9.2234e+18   1                 2
p3109_p2    8    2    6  ≈2.3283e-10  ≈4.6566e-10        ≈2.1475e+09   1                 2
p3109_p3    8    3    5  ≈7.6294e-06  ≈3.0518e-05        49152         1                 2
p3109_p4    8    4    4  ≈0.00097656  0.0078125          224           1                 2
p3109_p5    8    5    3  0.0078125    0.125              15            1                 2

In [5]:
# Print the FormatInfo table (df) as reStructuredText, exact rendering.
from tabulate import tabulate

# approx=False: inexact decimals are rendered by float_pow2str
dfstr = df.map(lambda x: render_float(False, x))
print(tabulate(dfstr, df.columns, tablefmt="rst", showindex=False))

name        B    P    E  smallest     smallest_normal    max                                       num_nans                                  infs
ocp_e2m1    4    2    2  0.5          1                  6                                         0                                            0
ocp_e2m3    6    4    2  0.125        1                  7.5                                       0                                            0
ocp_e3m2    6    3    3  0.0625       0.25               28                                        0                                            0
ocp_e4m3    8    4    4  2^-9         0.015625           448                                       2                                            0
ocp_e5m2    8    3    5  2^-16        2^-14              57344                                     6                                            2
p3109_p1    8    1    7  2^-62        2^-62              2^63                                      1                        

In [6]:
# Print the FormatInfo table (df) as Markdown using pandas' own formatting
# (render_float is not applied here).
print(df.to_markdown())

|    | name     |   B |   P |   E |     smallest |   smallest_normal |              max |       num_nans |   infs |
|---:|:---------|----:|----:|----:|-------------:|------------------:|-----------------:|---------------:|-------:|
|  0 | ocp_e2m1 |   4 |   2 |   2 | 0.5          |      1            |     6            |    0           |      0 |
|  1 | ocp_e2m3 |   6 |   4 |   2 | 0.125        |      1            |     7.5          |    0           |      0 |
|  2 | ocp_e3m2 |   6 |   3 |   3 | 0.0625       |      0.25         |    28            |    0           |      0 |
|  3 | ocp_e4m3 |   8 |   4 |   4 | 0.00195312   |      0.015625     |   448            |    2           |      0 |
|  4 | ocp_e5m2 |   8 |   3 |   5 | 1.52588e-05  |      6.10352e-05  | 57344            |    6           |      2 |
|  5 | p3109_p1 |   8 |   1 |   7 | 2.1684e-19   |      2.1684e-19   |     9.22337e+18  |    1           |      2 |
|  6 | p3109_p2 |   8 |   2 |   6 | 2.32831e-10  |      4.65661e-10  |  