In [None]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast
import torch
import pandas as pd
from plotly import express as px
from plotly import graph_objects as go
# Local
import sys
sys.path.append("../") # This is a terrible practice and do not use it in real code
from unsupamr.constants import DEFAULT_SEQ_MODEL
from unsupamr.utils import VocabExt


In [None]:
# Load the tokenizer and the pretrained T5 checkpoint named by
# DEFAULT_SEQ_MODEL. The two loads do not depend on each other.
tokenizer = T5TokenizerFast.from_pretrained(DEFAULT_SEQ_MODEL)
pretrained_mod = T5ForConditionalGeneration.from_pretrained(DEFAULT_SEQ_MODEL)

# VocabExt is constructed from the (model, tokenizer) pair; the analysis below
# reads its `new_vocab_size` attribute.
vocab_ext = VocabExt(pretrained_mod, tokenizer)

In [None]:
# Collect per-row L2 norms of the LM head weights for two groups:
#   "Original" - rows of the pretrained `lm_head` weight matrix
#   "Added"    - rows past the original vocabulary size in a freshly
#                initialised head with `vocab_ext.new_vocab_size` outputs
# The result is a long-format DataFrame with columns "L2 Norm" / "Vocab Type".

# Fixed seed so the randomly initialised "Added" rows (and therefore the
# histogram built from them) are the same on every Restart & Run All.
NORM_SEED = 0

df_data = {
    "Norm": [],
    "Vocab Type": [],
}
with torch.no_grad():
    old_lm_head = pretrained_mod.lm_head
    # Weight is indexed as (output rows, input features) below.
    old_vocab_size, hidden_dim = old_lm_head.weight.shape

    # One norm per row of the pretrained head. Inside no_grad the result does
    # not track gradients, so no explicit detach is needed before tolist().
    norm_vals = torch.norm(old_lm_head.weight, p=2, dim=-1)
    df_data["Norm"].extend(norm_vals.cpu().tolist())
    df_data["Vocab Type"].extend(["Original"] * norm_vals.shape[0])

    # nn.Linear draws its initial weights from the global RNG; seed right
    # before construction so the draw is reproducible.
    torch.manual_seed(NORM_SEED)
    expanded_head = torch.nn.Linear(
        hidden_dim,
        vocab_ext.new_vocab_size,
        bias=False,
        device=old_lm_head.weight.device,
    )
    # Only the rows beyond the original vocabulary are treated as "Added".
    new_norm_vals = torch.norm(expanded_head.weight[old_vocab_size:], p=2, dim=-1)
    df_data["Norm"].extend(new_norm_vals.cpu().tolist())
    df_data["Vocab Type"].extend(["Added"] * new_norm_vals.shape[0])

# Build the frame once and give the norm column its display name.
df = pd.DataFrame(data=df_data).rename(columns={"Norm": "L2 Norm"})

In [None]:
# Histogram of per-row L2 norms: one trace per vocabulary group, added in the
# order Original -> Added so the legend order is stable.
fig = go.Figure()
for vocab_type in ("Original", "Added"):
    in_group = df["Vocab Type"] == vocab_type
    fig.add_trace(go.Histogram(x=df.loc[in_group, "L2 Norm"], name=vocab_type))

# Legend settings kept separate from the rest of the layout for readability;
# the background colour has zero alpha (fully transparent).
legend_opts = dict(title="Token Type", bgcolor="rgba(0,0,0,0)", x=0.70, y=1.0)
fig.update_layout(
    template="simple_white",
    xaxis_title="L2 Norm",
    yaxis_title="Count",
    width=750,
    height=500,
    font=dict(size=24),
    legend=legend_opts,
)
fig