In [None]:
import os
import sys
# Make the directory one level above this notebook importable. The entry is
# only appended when it is not already present on sys.path.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Expose only the CUDA device with index 1 to this process. This is assigned
# here, ahead of the cell that imports torch.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
import os
import sys
# Make the directory two levels above this notebook importable as well. The
# entry is only appended when it is not already present on sys.path.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
import os
import json
from enum import Enum
from libfb.py.pyre import none_throws

import torch
import transformers
import torch.nn as nn
import numpy as np

import callm.core.utils.utils as utils
from callm.metaformers.src.args.trainer import TrainerArgs
from callm.core.data import datautils
from callm.core.model_utils import get_mp_rank_size, get_consolidated_ckpt_path, ElasticQuantBinarizerSigned, get_torch_dtype
from callm.core.models.llama_xl.transformer import (
    Transformer,
    TransformerForCausalLM,
    TransformerForSequenceClassification,
    wrap_model,
    wrap_model_pt,
)
import llama_xl.quantized_transformer as quantized_transformer
from callm.core.utils.process_args import (
    QAT,
)
from fairscale.nn.model_parallel import initialize as fs_init

In [None]:
# Aggregate the per-batch spectrum files of one run configuration into
#   * `frequency`: eigenvalue mass binned on a 50-point `grid`, and
#   * `density`:   the same nodes smoothed with a Gaussian kernel of width `sigma`.
# Later cells read `grid`, `grid_min`, `grid_max`, `frequency`, `dens`,
# `lam_min` and `lam_max` from here.

# --- Run configuration ---
bit = 1
steps = "10k"
alpha = 0.0
num_batches = 10
eval_embedding = False

# All arrays of one configuration live in one directory; the embedding-layer
# runs use the same directory name with an "_embedding_layer" suffix.
spectrum_dir = f"xx/notebooks/hessian_spectrum/{bit}_bits_steps_{steps}_alpha_{alpha}"
if eval_embedding:
    spectrum_dir += "_embedding_layer"

thetas_list = []
weights_list = []
grid_min = +np.inf
grid_max = -np.inf
lam_mins, lam_maxs = [], []
for i in range(num_batches):
    # The stored grid_{i}.npy is not read: `grid` is rebuilt below from the
    # pooled eigenvalue range. `dens` keeps the value of the last batch.
    dens = np.load(f"{spectrum_dir}/density_{i}.npy")
    thetas_all = np.load(f"{spectrum_dir}/thetas_{i}.npy")
    weights_all = np.load(f"{spectrum_dir}/weights_{i}.npy")
    lam_min_max = np.load(f"{spectrum_dir}/min_and_max_{i}.npy")
    thetas_list.append(thetas_all)
    weights_list.append(weights_all)

    # Only nodes whose weight exceeds 1e-8 are used to bound the spectrum.
    supported = thetas_all[weights_all > 1e-8]
    batch_min = supported.min()
    grid_min = min(batch_min, grid_min)
    grid_max = max(supported.max(), grid_max)
    lam_mins.append(batch_min)
    lam_maxs.append(lam_min_max.max())

thetas_all = np.concatenate(thetas_list, axis=0)
# normalize the weights all to be sum to one
# NOTE(review): 20 is presumably the number of probe runs stored per batch - confirm.
weights_all = np.concatenate(weights_list, axis=0)/(20)

# `steps` is normally a string such as "10k" or "0", so its text form is
# compared here; a comparison against the int 0 can never match a string.
if eval_embedding or (bit == 4 and str(steps) == "0"):
    # Only the heavy nodes are averaged over batches in this special case.
    heavy = weights_all > 1e-3
    weights_all[heavy] = weights_all[heavy]/num_batches
else:
    weights_all = weights_all/num_batches
    # Re-derive the range after normalisation, since the 1e-8 cut-off now
    # applies to the rescaled weights.
    grid_min = thetas_all[weights_all>1e-8].min()
    grid_max = thetas_all[weights_all>1e-8].max()

# Construct grid with a small margin
pad = 0 # 0.05 * (grid_max - grid_min + 1e-12)
grid = np.linspace(grid_min - pad, grid_max + pad, 50)
sigma = 1e-3
interval = grid[1] - grid[0]

# Accumulate Gaussian kernels on the grid
density = np.zeros_like(grid, dtype=np.float64)
frequency = np.zeros_like(grid, dtype=np.float64)
gauss_norm = np.sqrt(2 * np.pi) * sigma  # loop-invariant kernel normaliser
for i in range(len(thetas_all)):
    theta = thetas_all[i]
    w = weights_all[i]

    # Gaussian accumulation
    for ti, wi in zip(theta, w):
        density += (
            wi
            * np.exp(-0.5 * ((grid - ti) / sigma) ** 2)
            / gauss_norm
        )

        # Bin j covers [grid[j], grid[j] + interval). The lower edge is
        # inclusive so that a value lying exactly on a grid point (notably
        # grid_min and grid_max themselves) is counted instead of dropped.
        frequency += wi * (np.logical_and(grid>(ti - interval), grid<=(ti))).astype(np.float64)

density /= len(thetas_all)
lam_min = float(np.median(lam_mins))
lam_max = float(np.median(lam_maxs))
print(np.mean(lam_mins), np.std(lam_mins))
print(frequency.max())
print(np.log(frequency.max()))
print(grid_min, grid_max, np.std(lam_mins))
print(frequency.sum())

In [None]:
# Bar chart of the binned eigenvalue mass on a log scale, with the wide x-range
# used for the step-0 spectrum. Reads `frequency`, `grid`, `grid_min`,
# `grid_max`, `bit`, `steps` and `eval_embedding` from the aggregation cell.
import matplotlib.pyplot as plt
plt.style.use('default')

# Enable LaTeX rendering
# plt.rcParams['text.usetex'] = True
plt.rcParams.update({
    # Regular text fonts
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif', 'Liberation Serif'],

    # Math text fonts - this is the key part!
    'mathtext.fontset': 'stix',  # or 'stixsans', 'cm', 'dejavuserif'
    'mathtext.rm': 'serif',      # Roman (normal) math text
    'mathtext.it': 'serif:italic',  # Italic math text
    'mathtext.bf': 'serif:bold',    # Bold math text
})

# Plot density
# --- Plot the estimated spectral density ---
plt.figure(figsize=(9,5.5)) # (6.5,6.5) # (9,5.5)

# Bins whose mass is below min_val/2 (empty bins included) are raised to
# min_val so they still render as a short bar above the baseline.
min_val = 15e-9 if eval_embedding else 3e-9
# log(0) is -inf, which correctly compares as "below the floor"; only the
# divide-by-zero warning is silenced, the selection itself is unchanged.
with np.errstate(divide="ignore", invalid="ignore"):
    below_floor = np.log(frequency) < np.log(min_val/2)
# NOTE: `frequency` is modified in place, so cells run afterwards see the
# floored values.
frequency[below_floor] = min_val
color = {"16": "tomato", "4": "tomato", "3": "tomato", "2": "royalblue", "1": "forestgreen"}

# Bar heights are log-mass measured from the log(min_val/2) baseline.
heights = np.log(frequency)
min_heights = np.log(min_val/2)
heights = heights - min_heights
width = (grid[1] - grid[0])*0.8
# Each bar is centred half a bin to the right of its grid point.
plt.bar(grid+(grid[1] - grid[0])*0.5, heights, width=width, color=color[str(bit)])

plt.axvline(grid_min, linestyle="--", label="Estimated"+r"$~\lambda_\min$", color='black', lw=4)
plt.axvline(grid_max+width*1.5, linestyle="dotted", label="Estimated"+r"$~\lambda_\max$", color='black', lw=4)
# Tick positions are log values shifted by the same baseline as the bars.
if eval_embedding:
    plt.yticks(np.concatenate([np.arange(-18.6, -1.0, 5), np.array([1.5])]) - min_heights, [r"$10^{-8}$", r"$10^{-6}$", r"$10^{-4}$", r"$10^{-2}$", r"$1$"], fontsize=32)
    plt.ylim(-20 - min_heights, 14 - min_heights)
else:
    plt.yticks(np.concatenate([np.arange(-20.3, -1.0, 5), np.array([2.3])]) - min_heights, [r"$10^{-9}$", r"$10^{-7}$", r"$10^{-5}$", r"$10^{-3}$", r"$1$"], fontsize=32)
    plt.ylim(-21.5 - min_heights, 17 - min_heights)
plt.xticks(np.arange(-1000, 1001, 500), fontsize=32) # np.arange(-6, 6.1, 2), np.arange(-0, 201, 50), np.arange(-400, 601, 200), np.arange(-1000, 1001, 1000)
# plt.xlim(-7.5, 7.5)
plt.xlabel("Eigenvalues", fontsize=32)
plt.ylabel("Probability mass", fontsize=32)
# Derive the step shown in the title from `steps` so that the title always
# agrees with the file name used below: "0" -> "0", "10k" -> "10K".
steps_text = str(steps)
step_label = steps_text[:-1] + "K" if steps_text.lower().endswith("k") else steps_text
plt.title(f"Step {step_label} of {bit}-bit QAT", fontsize=32)
if steps == "0":
    plt.legend(fontsize=26, ncol=1, loc="upper center")
# Customized grid
plt.grid(True,
         alpha=0.7,          # Transparency
         color='gray')       # Color
plt.tight_layout()
plt.savefig(f"xx/notebooks/figures/hessian_spectrum_{bit}_bit_steps_{steps}" + ("_embedding_layer" if eval_embedding else "") + ".pdf")
plt.show()

In [None]:
# Bar chart of the binned eigenvalue mass on a log scale, with the narrow
# x-range (-9, 9). Reads `frequency`, `grid`, `grid_min`, `grid_max`, `bit`,
# `steps` and `eval_embedding` from the aggregation cell.
# NOTE: this cell writes to the same PDF path as the other bar-chart cell.
import matplotlib.pyplot as plt
plt.style.use('default')

# Enable LaTeX rendering
# plt.rcParams['text.usetex'] = True
plt.rcParams.update({
    # Regular text fonts
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif', 'Liberation Serif'],

    # Math text fonts - this is the key part!
    'mathtext.fontset': 'stix',  # or 'stixsans', 'cm', 'dejavuserif'
    'mathtext.rm': 'serif',      # Roman (normal) math text
    'mathtext.it': 'serif:italic',  # Italic math text
    'mathtext.bf': 'serif:bold',    # Bold math text
})

# Plot density
# --- Plot the estimated spectral density ---
plt.figure(figsize=(9,4.5))

# Bins with log-mass below -20 (empty bins included) are set to 1e-9.
# log(0) is -inf, which correctly compares as "below the floor"; only the
# divide-by-zero warning is silenced, the selection itself is unchanged.
with np.errstate(divide="ignore", invalid="ignore"):
    below_floor = np.log(frequency) < -20
# NOTE: `frequency` is modified in place, so cells run afterwards see the
# floored values.
frequency[below_floor] = 1e-9
# Same bit-width -> colour map as the other bar-chart cell, including "16".
color = {"16": "tomato", "4": "tomato", "3": "tomato", "2": "royalblue", "1": "forestgreen"}

# Bar heights are log-mass measured from the log(1.5e-9) baseline.
heights = np.log(frequency)
min_heights = np.log(1.5e-9)
heights = heights - min_heights
width = (grid[1] - grid[0])*0.8
# Each bar is centred half a bin to the right of its grid point.
plt.bar(grid+(grid[1] - grid[0])*0.5, heights, width=width, color=color[str(bit)])

plt.axvline(grid_min, linestyle="--", label="Estimated"+r"$~\lambda_\min$", color='black', lw=4)
plt.axvline(grid_max+width*1.5, linestyle="-.", label="Estimated"+r"$~\lambda_\max$", color='black', lw=4)
# Tick positions are log values shifted by the same baseline as the bars.
plt.yticks(np.concatenate([np.arange(-20.3, -1.0, 5), np.array([2.3])]) - min_heights, [r"$10^{-9}$", r"$10^{-7}$", r"$10^{-5}$", r"$10^{-3}$", r"$1$"], fontsize=32)
plt.ylim(-21.5 - min_heights, 5 - min_heights)
plt.xticks(np.arange(-8, 8.1, 2), fontsize=32) # np.arange(-8, 8.1, 2),
plt.xlim(-9, 9) # plt.xlim(-13, 15) # plt.xlim(-9, 9)
plt.xlabel("Eigenvalues", fontsize=32)
plt.ylabel("Probability", fontsize=32)
# Only strip a trailing "k" when there is one: "10k" -> "10K", while "0"
# stays "0" instead of turning into an empty step count.
steps_text = str(steps)
step_label = steps_text[:-1] + "K" if steps_text.lower().endswith("k") else steps_text
plt.title(f"Step {step_label} of {bit}-bit QAT", fontsize=32)
# Customized grid
plt.grid(True,
         alpha=0.7,          # Transparency
         color='gray')       # Color
plt.tight_layout()
plt.savefig(f"xx/notebooks/figures/hessian_spectrum_{bit}_bit_steps_{steps}" + ("_embedding_layer" if eval_embedding else "") + ".pdf")
plt.show()

In [None]:
# Scratch check: display exp() of the evenly spaced values -20, -15.5, ..., -2
# (step 4.5), i.e. convert natural-log positions back to linear values.
# NOTE(review): presumably used to pick y-axis tick labels for the log-scaled
# bar charts - confirm, or delete this cell if it is no longer needed.
np.exp(np.arange(-20, 2.0, 4.5))
# np.arange(-20, 2.0, 4.5)

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of the log of the stored density `dens` over `grid`, with the
# median extreme eigenvalues (`lam_min`, `lam_max`) marked as dashed lines.
# NOTE(review): after the aggregation cell, `grid` is the rebuilt 50-point
# linspace while `dens` is the array loaded for the last batch - confirm that
# the two have the same length and refer to the same x-positions.
plt.figure(figsize=(7,4))
log_dens = np.log(dens)
plt.scatter(grid, log_dens, label="SLQ density estimate")
for ritz_value, ritz_label in (
    (lam_min, "~ λ_min (Ritz median)"),
    (lam_max, "~ λ_max (Ritz median)"),
):
    plt.axvline(ritz_value, linestyle="--", label=ritz_label)
# plt.xlim(lam_min, lam_max)
plt.ylim(-200, 1)

# Axis text and legend
plt.xlabel("Eigenvalue λ")
plt.ylabel("Estimated density ρ(λ)")
plt.title("Hessian spectrum (SLQ approximation)")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# For each batch, measure how much normalised weight sits on eigenvalues with
# |theta| < 1.5e-3, then report the mean, standard deviation and maximum of
# that mass over all batches.
# NOTE: this cell rebinds `grid`, `dens`, `thetas_all`, `weights_all`,
# `frequency`, `lam_mins` and `lam_maxs`, which the plotting cells also use.
bit = 1
steps = "80k"
alpha = 0.0
num_batches = 10

run_dir = f"xx/notebooks/hessian_spectrum/{bit}_bits_steps_{steps}_alpha_{alpha}"


def _load_batch_array(stem, batch):
    """Load `<run_dir>/<stem>_<batch>.npy`."""
    return np.load(f"{run_dir}/{stem}_{batch}.npy")


zero_eigenvalue_frequency = []
lam_mins, lam_maxs = [], []
for i in range(num_batches):
    grid, dens, thetas_all, weights_all, lam_min_max = (
        _load_batch_array(stem, i)
        for stem in ("grid", "density", "thetas", "weights", "min_and_max")
    )

    lam_mins.append(lam_min_max.min())
    lam_maxs.append(lam_min_max.max())

    # normalize the weights all to be sum to one
    weights_all = weights_all / 20

    # Mass of the nodes that lie within 1.5e-3 of zero.
    mask = np.abs(thetas_all) < 1.5e-3
    frequency = weights_all[mask].sum()
    print(frequency.sum())
    zero_eigenvalue_frequency.append(frequency)

zero_mass_mean = np.mean(zero_eigenvalue_frequency)
zero_mass_std = np.std(zero_eigenvalue_frequency)
print("Mean: {:.4f}\tStd: {:.4f}".format(zero_mass_mean, zero_mass_std))

print(np.max(zero_eigenvalue_frequency))

In [None]:
# Line plot of the zero-eigenvalue mass against alpha, with a shaded band of
# one standard deviation around the mean.
import matplotlib.pyplot as plt
plt.style.use('default')

# Enable LaTeX rendering
# plt.rcParams['text.usetex'] = True
font_settings = {
    # Regular text fonts
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif', 'Liberation Serif'],

    # Math text fonts - this is the key part!
    'mathtext.fontset': 'stix',  # or 'stixsans', 'cm', 'dejavuserif'
    'mathtext.rm': 'serif',      # Roman (normal) math text
    'mathtext.it': 'serif:italic',  # Italic math text
    'mathtext.bf': 'serif:bold',    # Bold math text
}
plt.rcParams.update(font_settings)

# Hard-coded means and standard deviations, one entry per alpha value.
# NOTE: `alpha` is rebound from a scalar to a list here.
alpha = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
zero_frequency = np.array([0.6484,0.5815,0.5691,0.6087,0.6262,0.6366])
zero_frequency_std = np.array([0.0119,0.0222,0.0200,0.0199,0.0141,0.0146])

# --- Plot mean with a +/- one standard deviation band ---
plt.figure(figsize=(8,5.5))
color = {"4": "tomato", "3": "tomato", "2": "royalblue", "1": "forestgreen"}
band_lower = zero_frequency - zero_frequency_std
band_upper = zero_frequency + zero_frequency_std
plt.scatter(alpha, zero_frequency, color="royalblue", s=200)
plt.plot(alpha, zero_frequency, color="royalblue", lw=4)
plt.fill_between(alpha, band_lower, band_upper, alpha=0.3, color="royalblue")

# Axes: only every second y tick carries a label.
y_tick_positions = np.arange(0.5, 0.71, 0.05)
y_tick_labels = ["0.5", "", "0.6", "", "0.7"]
plt.yticks(y_tick_positions, y_tick_labels, fontsize=32)
plt.xticks(fontsize=32) # np.arange(-6, 6.1, 2),
plt.ylim(0.48, 0.72)
plt.xlabel(r"$\alpha$", fontsize=32)
plt.title("Probability of zero eigenvalue", fontsize=32)
# plt.legend(fontsize=26, ncol=1)

# Customized grid: semi-transparent gray lines
plt.grid(True, alpha=0.7, color='gray')
plt.tight_layout()
plt.savefig("xx/notebooks/figures/hessian_spectrum_zero_eigenvalues.pdf")
plt.show()