* "Demystifying Floating Point Precision", https://oreil.ly/uCYYl
* Maarten Grootendors, "A Visual Guide to Quantization", https://oreil.ly/bpi3b

In [8]:
import numpy as np
import torch
import pandas as pd

In [14]:
# Define the original numbers
numbers = np.array([2.3888888, 0, 34.444, 12.3486e4, -1223.4566], dtype=np.float32)

numbers

array([ 2.3888888e+00,  0.0000000e+00,  3.4444000e+01,  1.2348600e+05,
       -1.2234565e+03], dtype=float32)

In [28]:
# Convert numbers to different data types
numbers_fp32 = torch.tensor(numbers, dtype=torch.float32)
numbers_fp16 = numbers_fp32.to(torch.float16)
numbers_bf16 = numbers_fp32.to(torch.bfloat16)
numbers_int8 = numbers_fp32.to(torch.int8)


In [35]:
def arithmetic_operations(tensor):
    sum_val = tensor.sum().item()
    product_val = tensor.prod().item()
    mean_val = sum_val / tensor.numel()

    return {
        "sum": sum_val,
        "product": product_val,
        "mean": mean_val
    }

In [38]:
# Compute results for each format
results = {
    "float32": arithmetic_operations(numbers_fp32),
    "float16": arithmetic_operations(numbers_fp16),
    "bfloat16": arithmetic_operations(numbers_bf16),
    "int8": arithmetic_operations(numbers_int8)
}

# Create DataFrame to compare precision loss
df_results = pd.DataFrame(results).T

df_results

Unnamed: 0,sum,product,mean
float32,122299.375,-0.0,24459.875
float16,inf,,inf
bfloat16,122368.0,-0.0,24473.6
int8,187.0,0.0,37.4


In [39]:
# Compute absolute errors compared to float32
errors = df_results - df_results.loc["float32"]

# Display results
print("Quantization Effect on Arithmetic Precision:")
print(df_results)
print("\nPrecision Loss at Each Quantization Level:")
print(errors)


Quantization Effect on Arithmetic Precision:
                 sum  product       mean
float32   122299.375     -0.0  24459.875
float16          inf      NaN        inf
bfloat16  122368.000     -0.0  24473.600
int8         187.000      0.0     37.400

Precision Loss at Each Quantization Level:
                 sum  product       mean
float32        0.000      0.0      0.000
float16          inf      NaN        inf
bfloat16      68.625      0.0     13.725
int8     -122112.375      0.0 -24422.475
