In [1]:
#!pip install ngrok -q
#!pip install dash -q
#!pip install "dash[diskcache]" -q

In [2]:
from dataclasses import dataclass
from transformers.utils import ModelOutput
from typing import List, Optional, Tuple, Union
import torch
@dataclass
class BaseModelOutputWithPastAndResiduals(ModelOutput):
    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    residuals: Optional[Tuple[torch.FloatTensor, ...]] = None

@dataclass
class CausalLMOutputWithPastAndResiduals(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    residuals: Optional[Tuple[torch.FloatTensor, ...]] = None

In [3]:
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch LLaMA model."""
import math
import warnings
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_causal_attention_mask,
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.utils.doc import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from transformers.utils.import_utils import is_flash_attn_2_available
from transformers.utils import logging
from transformers.utils.import_utils import is_torch_fx_available
from transformers.models.llama.configuration_llama import LlamaConfig


#if is_flash_attn_2_available():
#    from flash_attn import flash_attn_func, flash_attn_varlen_func
#    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa


# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
# It means that the function will not be traced through and simply appear as a node in the graph.
if is_torch_fx_available():
    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "LlamaConfig"


def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(
        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
    )
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    warnings.warn(
        "Calling `transformers.models.llama.modeling_llama._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils.AttentionMaskConverter._prepare_4d_attention_mask"
    )
    return AttentionMaskConverter._prepare_4d_attention_mask(
        mask=mask, dtype=dtype, tgt_len=tgt_len
    )


def _make_causal_mask(
    input_ids_shape: torch.Size,
    dtype: torch.dtype,
    device: torch.device,
    past_key_values_length: int = 0,
):
    warnings.warn(
        "Calling `transformers.models.llama.modeling_llama._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.llama.modeling_llama.AttentionMaskConverter._make_causal_mask"
    )
    return AttentionMaskConverter._make_causal_mask(
        input_ids_shape=input_ids_shape,
        dtype=dtype,
        device=device,
        past_key_values_length=past_key_values_length,
    )


class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)


In [4]:
class LlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.config = config
        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(
            self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            self.hidden_size,
            self.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.v_proj = nn.Linear(
            self.hidden_size,
            self.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.o_proj = nn.Linear(
            self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias
        )
        self._init_rope()

    def _init_rope(self):
        if self.config.rope_scaling is None:
            self.rotary_emb = LlamaRotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]], Optional[Tuple[torch.Tensor]]]:
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

        bsz, q_len, _ = hidden_states.size()

        if self.config.pretraining_tp > 1:
            key_value_slicing = (
                self.num_key_value_heads * self.head_dim
            ) // self.config.pretraining_tp
            query_slices = self.q_proj.weight.split(
                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
            )
            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

            query_states = [
                F.linear(hidden_states, query_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            query_states = torch.cat(query_states, dim=-1)

            key_states = [
                F.linear(hidden_states, key_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            key_states = torch.cat(key_states, dim=-1)

            value_states = [
                F.linear(hidden_states, value_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            value_states = torch.cat(value_states, dim=-1)

        else:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)
            

        query_states = query_states.view(
            bsz, q_len, self.num_heads, self.head_dim
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value[0].shape[-2]
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin, position_ids
        )
        
        if past_key_value is not None:
            # reuse k, v, self_attention
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)

        past_key_value = (key_states, value_states) if use_cache else None

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
        ) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.attention_dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )
        
        attn_output = attn_output.transpose(1, 2).contiguous()
        
        single_head_outputs = [attn_output[:,:,i,:].reshape(bsz, q_len, self.head_dim) for i in range(0, self.num_heads)]

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        if self.config.pretraining_tp > 1:
            attn_output = attn_output.split(
                self.hidden_size // self.config.pretraining_tp, dim=2
            )
            o_proj_slices = self.o_proj.weight.split(
                self.hidden_size // self.config.pretraining_tp, dim=1
            )
            attn_output = sum(
                [
                    F.linear(attn_output[i], o_proj_slices[i])
                    for i in range(self.config.pretraining_tp)
                ]
            )
        else:
            attn_output = self.o_proj(attn_output)
        
        # Attention heads aggregator
        single_head_outputs = [
            torch.matmul(
                attn, 
                self.o_proj.weight[:, i*self.head_dim : (i+1)*self.head_dim].transpose(0, 1)
            ) for i, attn in enumerate(single_head_outputs)
        ]
        # Sanity Check
        # assert torch.equal(attn_output.round(decimals=2), sum(single_head_outputs).round(decimals=2))
        
        cos_sim = torch.nn.CosineSimilarity(dim=-1)
        
        single_head_outputs = (
            torch.stack([cos_sim(sh_output, attn_output) for sh_output in single_head_outputs], dim=0).transpose(0, 2),      # Cosine
            torch.stack([torch.linalg.norm(sh_output, dim=-1) for sh_output in single_head_outputs], dim=0).transpose(0, 2), # L2
        )
        
        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value, single_head_outputs
    
    
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

In [5]:
class LlamaRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (
            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )


class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        t = t / self.scaling_factor

        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
    """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings)
                - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (
                base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


class LlamaMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        if self.config.pretraining_tp > 1:
            slice = self.intermediate_size // self.config.pretraining_tp
            gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
            up_proj_slices = self.up_proj.weight.split(slice, dim=0)
            down_proj_slices = self.down_proj.weight.split(slice, dim=1)

            gate_proj = torch.cat(
                [
                    F.linear(x, gate_proj_slices[i])
                    for i in range(self.config.pretraining_tp)
                ],
                dim=-1,
            )
            up_proj = torch.cat(
                [
                    F.linear(x, up_proj_slices[i])
                    for i in range(self.config.pretraining_tp)
                ],
                dim=-1,
            )

            intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
            down_proj = [
                F.linear(intermediate_states[i], down_proj_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            down_proj = sum(down_proj)
        else:
            down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

        return down_proj

class LlamaFlashAttention2(LlamaAttention):
    """
    Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # LlamaFlashAttention2 attention does not support output_attentions
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

            # overwrite attention_mask with padding_mask
            attention_mask = kwargs.pop("padding_mask")

        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # Flash attention requires the input to have the shape
        # batch_size x seq_length x head_dim x hidden_dim
        # therefore we just need to keep the original shape
        query_states = query_states.view(
            bsz, q_len, self.num_heads, self.head_dim
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, q_len, self.num_key_value_heads, self.head_dim
        ).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value[0].shape[-2]

        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin, position_ids
        )

        if past_key_value is not None:
            # reuse k, v, self_attention
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)

        past_key_value = (key_states, value_states) if use_cache else None

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (LlamaRMSNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            # Handle the case where the model is quantized
            if hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = self._flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        dropout=0.0,
        softmax_scale=None,
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`int`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            (
                query_states,
                key_states,
                value_states,
                indices_q,
                cu_seq_lens,
                max_seq_lens,
            ) = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=self.is_causal,
            )

            attn_output = pad_input(
                attn_output_unpad, indices_q, batch_size, query_length
            )
        else:
            attn_output = flash_attn_func(
                query_states,
                key_states,
                value_states,
                dropout,
                softmax_scale=softmax_scale,
                causal=self.is_causal,
            )

        return attn_output

    def _upad_input(
        self, query_layer, key_layer, value_layer, attention_mask, query_length
    ):
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
            indices_k,
        )
        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
                indices_k,
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
                query_layer, attention_mask
            )

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )

residual_contributions = []
input_contributions  = []

post_att_residual_contributions = []
post_att_input_contributions  = []


In [6]:
class LlamaDecoderLayer(nn.Module):
    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = (
            LlamaAttention(config=config)
            if not getattr(config, "_flash_attn_2_enabled", False)
            else LlamaFlashAttention2(config=config)
        )
        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        **kwargs,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )
        
        layer_id = kwargs.get('layer_id')
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value, single_att_heads_contrib = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            **kwargs,
        )

        attn_states = hidden_states
        residuals = residual
        
        hidden_states = residual + hidden_states
        
        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)

        hidden_states = self.mlp(hidden_states)

        mlp_states = hidden_states

        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        outputs += (residuals,)
        outputs += (attn_states,)
        outputs += (mlp_states,)

        return outputs


LLAMA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`LlamaConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaPreTrainedModel(PreTrainedModel):
    config_class = LlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()




WW_debug_list = []

LLAMA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaModel(LlamaPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]

    Args:
        config: LlamaConfig
    """

    def __init__(self, config: LlamaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]
        )
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

        # Add debugger object to model
        self.hidden_states = []

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndResiduals]:
        #global attention_heads_simlist
        global residuals
        global attn_states
        global mlp_states
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=device,
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if getattr(self.config, "_flash_attn_2_enabled", False):
            # 2d mask is passed through the layers
            attention_mask = (
                attention_mask
                if (attention_mask is not None and 0 in attention_mask)
                else None
            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )

        # embed positions
        hidden_states = inputs_embeds

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None
        layers_residuals = ()
        layers_attn_states = ()
        layers_mlp_states = ()
        #single_att_heads_contrib = [(), ()]

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = (
                past_key_values[idx] if past_key_values is not None else None
            )

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_value,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    layer_id=idx
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)
                
            layers_residuals += (layer_outputs[-3],)
            layers_attn_states += (layer_outputs[-2],)
            layers_mlp_states += (layer_outputs[-1],)

        residuals += (layers_residuals,)
        attn_states += (layers_attn_states,)
        mlp_states += (layers_mlp_states,)
        
        # Attention heads aggregator
        # Concat results for all heads
        #for i in range(len(single_att_heads_contrib)):
        #    single_att_heads_contrib[i] = torch.cat(single_att_heads_contrib[i], dim=1)
        # Add results to all tokens view
        #if not attention_heads_simlist:
        #    attention_heads_simlist = single_att_heads_contrib
        #else:
        #    for i in range(len(attention_heads_simlist)):
        #        attention_heads_simlist[i] = torch.cat(
        #            (attention_heads_simlist[i], single_att_heads_contrib[i]), dim=0
        #        )

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        
        # Add hidden states for current generation
        self.hidden_states.append(all_hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class LlamaForCausalLM(LlamaPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPastAndResiduals]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LlamaForCausalLM

        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            lm_head_slices = self.lm_head.weight.split(
                self.vocab_size // self.config.pretraining_tp, dim=0
            )
            logits = [
                F.linear(hidden_states, lm_head_slices[i])
                for i in range(self.config.pretraining_tp)
            ]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(
                    past_state.index_select(0, beam_idx.to(past_state.device))
                    for past_state in layer_past
                ),
            )
        return reordered_past

    def _apply_lm_head(self, hidden_states):
        """
        Function which takes as input the hidden states of the model and returns the prediction of the next token.
        Uses the language modeling head of output
        """
        pred_ids = []
        per_token_logits = []
        for i in range(len(hidden_states)):        
            logits = self.lm_head(hidden_states[i])
            logits = logits.float()
            pred_id = torch.argmax(logits)
            pred_ids.append(pred_id)
            probs = torch.nn.functional.softmax(logits.squeeze())
            prob =round(float(probs[pred_id]) * 100, 2)
            per_token_logits.append(prob)
        return pred_ids, per_token_logits
    
    def _apply_input_lm_head(self, hidden_states):
        """
        Function which takes as input the hidden states of the model and returns the prediction of the next token.
        Uses the language modeling head of input
        """
        pred_ids = []
        per_token_logits = []
        for layer in hidden_states:
            output = torch.matmul(layer.to(self.model.embed_tokens.weight.device), self.model.embed_tokens.weight.T)
            token_id = output.argmax(dim=2)
            probs = torch.nn.functional.softmax(output.squeeze())
            print(probs)
            prob = round(float(probs[token_id]) * 100, 2)
            pred_ids.append(token_id)
            per_token_logits.append(prob)
        return pred_ids, per_token_logits
    
    def get_hidden_states_predictions(self, embedding="output"):
        """
        Returns list of lists visualizing the predictions for each layer for each step
        """
        # Check embedding parameter value
        if embedding not in ['input', 'output']:
            raise ValueError("embdding not valid")
        
        predictions = []
        logits = []
        for n_token in range(1, len(self.model.hidden_states)):
            if embedding == 'output':
                pred_ids, per_token_logits = self._apply_lm_head(self.model.hidden_states[n_token])
            else:
                pred_ids, per_token_logits = self._apply_input_lm_head(self.model.hidden_states[n_token])
            predictions.append([int(id) for id in pred_ids])
            logits.append(per_token_logits)
        return predictions, logits

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, AutoConfig, StoppingCriteriaList, StoppingCriteria
from collections import defaultdict
from itertools import cycle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

import itertools
import torch

In [8]:
# https://github.com/oobabooga/text-generation-webui/blob/2cf711f35ec8453d8af818be631cb60447e759e2/modules/callbacks.py#L12
class _SentinelTokenStoppingCriteria(StoppingCriteria):
    def __init__(self, sentinel_token_ids: list, starting_idx: int):
        StoppingCriteria.__init__(self)
        self.sentinel_token_ids = sentinel_token_ids
        self.starting_idx = starting_idx
        self.shortest = min([x.shape[-1] for x in sentinel_token_ids])

    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
        for sample in input_ids:
            trimmed_sample = sample[self.starting_idx:]
            trimmed_len = trimmed_sample.shape[-1]
            if trimmed_len < self.shortest:
                continue

            for sentinel in self.sentinel_token_ids:
                sentinel_len = sentinel.shape[-1]
                if trimmed_len < sentinel_len:
                    continue

                window = trimmed_sample[-sentinel_len:]
                if torch.all(torch.eq(sentinel, window)):
                    return True

        return False
####

def generate_stopping_criteria(stopgen_tokens, input_len=0):
    return StoppingCriteriaList([
        _SentinelTokenStoppingCriteria(
            sentinel_token_ids = stopgen_tokens,
            starting_idx=input_len
        )
    ])


# CODE

In [9]:
#model_id = "microsoft/phi-1_5"
model_id = "meta-llama/Llama-2-7b-hf"
#model_id = "mistralai/Mistral-7B-v0.1"
customlama = True
torch.set_default_device("cpu")

In [10]:
hf_key = ""
if model_id in ["meta-llama/Llama-2-7b-hf"]:
    hf_key = input("Hugging Face Key: ")
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=hf_key)
if customlama:
    model = LlamaForCausalLM.from_pretrained(model_id, token=hf_key)
else:
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, token=hf_key)
model_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True, token=hf_key)
del hf_key

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
if model_id in ["microsoft/phi-1_5"]:
    stopgen_tokens = [
        torch.tensor([198, 198]),  # \n\n
        torch.tensor([628])        # \n\n
    ]
    prompt_structure = "Question: {prompt}\n\nAnswer:"
    exclude_token_offset = 3
    fix_characters = [("Ġ", "␣"), ("Ċ", "\n")]
elif model_id in ["meta-llama/Llama-2-7b-hf", "mistralai/Mistral-7B-v0.1"]:
    stopgen_tokens = [
        torch.tensor([1]),  # <s>
        torch.tensor([2])   # </s>
    ]
    prompt_structure = "{prompt}"
    exclude_token_offset = None
    fix_characters = [("<0x0A>", "\n")]

fix_characters += [("\n", "\\n")]

In [12]:
if tokenizer.eos_token_id and not tokenizer.pad_token_id:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [13]:
### ALTERATION ### Divided computation for attentions
### ALTERATION ###  Added function to compute attentions also for prompt

def fix_generated_order(generated_tokens):
    n_input_tokens = generated_tokens[0][0].size()[1]
    new_tokens = [()] * n_input_tokens
    for layer_tokens in generated_tokens[0]:
        for i, layer_token in enumerate(torch.split(layer_tokens, split_size_or_sections=1, dim=1)):
            new_tokens[i] += (layer_token, )
    for layers_token in attn_states[1:]:
        new_tokens.append(layers_token)
    return new_tokens

def pad_masked_attentions(attentions, max_len):
    """
    Attention in generative models are masked, we want to plot a heatmap so we must pad all attentions to the same size with 0.0 values
    """
    array_attentions = [np.array(att) for att in attentions]
    new_attentions = [np.concatenate([att, np.zeros([max_len - len(att)])]) for att in array_attentions]
    return np.array(new_attentions)

def compute_complete_padded_attentions(generated_output, layer, head):
    single_layer_attentions = []
    # Prompt tokens
    for single_layer_single_head in torch.squeeze(torch.select(generated_output.attentions[0][layer], 1, head)):
        single_layer_attentions.append(single_layer_single_head)
    # Response tokens
    for attentions_per_token in generated_output.attentions[1:]:
        # Take single layer
        single_layer = attentions_per_token[layer]
        # Take only one head
        single_layer_single_head = torch.select(single_layer, 1, head)
        single_layer_attentions.append(single_layer_single_head)
    # Squeeze dimensions to one a one-dimensional tensor
    pure_attentions = [s.squeeze() for s in single_layer_attentions]
    max_seq_len  = len(pure_attentions[-1])
    # Print last attention heatmap
    padded_attentions = pad_masked_attentions(pure_attentions, max_seq_len)
    return padded_attentions

def compute_batch_complete_padded_attentions(generated_output, heads):
    multi_layer_head_attentions = []
    for head in heads:
        multi_layer_attentions = []
        for layer in range(0, len(generated_output.attentions[0])):
            # Prompt tokens
            prompt_att = [
                torch.squeeze(single_head)
                for single_head in torch.squeeze(torch.select(generated_output.attentions[0][layer], 1, head))
            ]
            # Response tokens
            response_att = [
                torch.squeeze(torch.select(single_layer[layer], 1, head))
                for single_layer in generated_output.attentions[1:]
            ]
            # Pad and merge attentions
            multi_layer_attentions.append(pad_masked_attentions( 
                [att_token for att_token in prompt_att + response_att],
                len(response_att[-1])
            ))
        multi_layer_head_attentions.append(multi_layer_attentions)
    return multi_layer_head_attentions

def plot_attentions(generated_output, layer, head, generated_tokens, past_tokens):
    # Plot 
    data = compute_padded_attentions(generated_output, layer, head)
    fig, ax = plt.subplots(figsize = (12,5))
    im = ax.imshow(data)
    # Show all ticks and label them with the respective list entries
    ax.set_yticks(np.arange(len(generated_tokens)), labels=generated_tokens)
    ax.set_xticks(np.arange(len(past_tokens)), labels=past_tokens, fontsize=8)
    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    # Create colorbar
    cbar = ax.figure.colorbar(im, ax=ax)

    ax.set_title(f"Heatmap of attention layers: layer {layer} head {head}")
    fig.tight_layout()
    plt.show()

In [14]:
def _apply_lm_head(hidden_states, weights, bias, tot_layers=-1):
    """
    Function which takes as input the hidden states of the model and returns the prediction of the next token.
    Uses the language modeling head of input
    """
    pred_ids = []
    for n, token_layer in enumerate(hidden_states):
        token_id = compute_ids_from_embedding(token_layer, weights, bias, tot_layers=tot_layers, layer_n=n)
        pred_ids.append(token_id)
    return pred_ids
    
def embed_hidden_states(model, hidden_states, embedding="output", include_prompt=False, include_end=True, multirep=True, max_rep=10):
    end_idx = len(hidden_states) if include_end else len(hidden_states) - 1
    tot_layers = model.config.num_hidden_layers

    if embedding == 'output':
        weights = model.lm_head.weight.T
        reverse_weights = model.lm_head.weight
    elif embedding == 'input':
        weights = model.model.embed_tokens.weight.T
        reverse_weights = model.model.embed_tokens.weight
    elif embedding == 'interpolate':
        weights = {"input": model.model.embed_tokens.weight.T, "output": model.lm_head.weight.T}
        reverse_weights = {"input": model.model.embed_tokens.weight, "output": model.lm_head.weight}
    else:
        raise ValueError("Embedding not valid")

    bias = 0
    if model.lm_head.bias:
        raise ValueError("Bias not supported") 

    predictions = []
    # Prompt tokens
    if include_prompt:
        for token_states in torch.stack(hidden_states[0]).swapaxes(0, 2):
            if multirep:
                pred_ids = compute_multirep(model, token_states.swapaxes(0, 1), weights, bias, reverse_weights, max_rep=max_rep, tot_layers=tot_layers)
            else:
                pred_ids = [_apply_lm_head(token_states.swapaxes(0, 1), weights, bias, tot_layers=tot_layers)]
            predictions.append([[int(id) for id in idd] for idd in pred_ids])
    # Response tokens
    for token_states in hidden_states[1:end_idx]:
        if multirep:
            pred_ids = compute_multirep(model, token_states, weights, bias, reverse_weights, max_rep=max_rep, tot_layers=tot_layers)
        else:
            pred_ids = [_apply_lm_head(token_states, weights, bias, tot_layers=tot_layers)]
        predictions.append([[int(id) for id in idd] for idd in pred_ids])
    return predictions

def compute_ids_from_embedding(token_emb, weights, bias, tot_layers=-1, layer_n=-1):
    # Interpolated embeddings
    if type(weights) == dict:
        logits = {k:torch.matmul(token_emb, weight) + bias for k, weight in weights.items()}
        logits = ((tot_layers - layer_n) * (logits["input"]) + layer_n * (logits["output"])) / tot_layers
    # Single embeddings
    else:
        logits = torch.matmul(token_emb, weights) + bias
    return torch.argmax(logits)

def compute_multirep(model, hidden_states, weights, bias, reverse_weights, max_rep=5, tot_layers=-1, layer_n=-1):
    pred_ids = []
    #pred_norms = []
    for n, hs in enumerate(hidden_states):
        tokens = []
        norms = []
        token_emb = hs.squeeze()
        for i in range(0, max_rep):
            # Compute token and embedding norm
            token_id = compute_ids_from_embedding(token_emb, weights, bias, tot_layers=tot_layers, layer_n=n)
            norm = torch.norm(token_emb) 
            # Stop prematurely if norm is too small or if norm is bigger than previous one
            if norm <= 0.01 or (len(norms) > 0 and norm >= norms[-1]):
                break
            # Do not add repreated tokens
            if token_id not in tokens:
                tokens.append(token_id)
            norms.append(norm)
            # Compute next embedding by subtracting the closest embedding to the current embedding
            closest_emb = reverse_weights[token_id]
            token_emb = token_emb - closest_emb
        pred_ids.append(tokens)
        #pred_norms.append(norms)
    return pred_ids#, pred_norms

def test_multirep(model, input, embedding, token=1):
    if embedding == 'output':
        weights = model.lm_head.weight.T
    elif embedding == 'input':
        weights = model.model.embed_tokens.weight.T

    bias = model.lm_head.bias
    if bias:
        reverse_weights = torch.add(weights.T, bias.unsqueeze(dim=1))
    else:
        bias = 0
        reverse_weights = weights.T 
    inputs = tokenizer("Hi, how are you", return_tensors="pt")
    gen_config = GenerationConfig(
        pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else None,
        output_attentions=True, output_hidden_states=True, return_dict_in_generate=True
    )
    gen_output = model.generate(inputs.input_ids, generation_config=gen_config, max_new_tokens=5)
    print(tokenizer.decode(gen_output.sequences.squeeze()))
    a,aa = compute_multirep(model, gen_output.hidden_states[1], weights, bias, reverse_weights)
    return [[(tokenizer.decode(c), cc.detach().numpy().tolist()) for c,cc in zip(b,bb)] for b,bb in zip(a,aa)]


In [15]:
def fix_dataframe_characters(df, replacements, multirep=False, columns=False):
    for old, new in replacements:
        df = df.map(lambda x: [i.replace(old, new) for i in x] if multirep else x.replace(old, new))
    if columns:
        for old, new in replacements:
            df.columns = df.columns.str.replace(old, new)
    return df

In [16]:
def compute_edge_precache(nodes, edges, pedges, heads):
    return {
        (x,y):  { 
            head: {
                "edges": traces,
                "color_nodes": {
                    emb: compute_color_nodes(emb_nodes, traces + pedges[head], x, y) 
                    for emb, emb_nodes in nodes.items()
                }
            } for head in heads if (traces := compute_add_traces(edges[head], x, y)) or True
        }
        for x,y in [(x,y) for node_coords in [zip(node_batch.x, node_batch.y) for node_batch in list(nodes.values())[0]] for x,y in node_coords]
    }

def access_edge_cache(cache, x, y, attention_head, embedding, color_scale):
    add_traces = cache[(x,y)][attention_head]["edges"]
    add_color_nodes = cache[(x,y)][attention_head]["color_nodes"][embedding]
    add_color_nodes = [apply_color(node, trace, color_scale) for node, trace in add_color_nodes]
    return add_traces, add_color_nodes

def compute_add_traces(edges, x, y):
    return [
        el for el in edges if (
            el.customdata[0]["P1"]["x"] == x and el.customdata[0]["P1"]["y"] == y
        ) or (
            el.customdata[0]["P2"]["x"] == x and el.customdata[0]["P2"]["y"] == y
        )
    ]

def compute_color_nodes(nodes, add_traces, x, y):
    return [
        (node, trace)
        for trace in add_traces for node in filter(lambda n: n.y[0] == y - 1 and n.x[0] <= x, nodes) 
        if trace.customdata[0]["P1"]["x"] in node.x and trace.customdata[0]["P1"]["y"] in node.y and
            trace.customdata[0]["P2"]["x"] == x and trace.customdata[0]["P2"]["y"] == y
    ]

def apply_color(node, trace, color_scale):
    node["marker"]["color"] = pc.sample_colorscale(color_scale, float(trace.text))
    return node

In [17]:
def model_generate(model, tokenizer, prompt, max_extra_length, config, min_stop_length, stopping_tokens):
    inputs = tokenizer(prompt, return_tensors="pt")
    input_len = len(inputs.input_ids.squeeze().tolist())
    max_len = input_len + max_extra_length
    
    gen_config = config
    stopping_criteria = generate_stopping_criteria(stopping_tokens, input_len + min_stop_length)
    
    generated_output = model.generate(inputs.input_ids, generation_config=gen_config, max_length=max_len, stopping_criteria=stopping_criteria)
    outputs = generated_output.sequences.squeeze()
    text_output = tokenizer.decode(generated_output.sequences.squeeze()[input_len:])
    
    all_tokens = tokenizer.convert_ids_to_tokens(generated_output.sequences[0])
    input_tokens = all_tokens[0:input_len]
    generated_tokens = all_tokens[input_len:]
    
    return text_output, generated_output, {"in": input_tokens, "gen": generated_tokens}

def create_hidden_states_df(model, tokenizer, generated_output, gen_tokens, embedding, include_prompt, fix_characters, multirep=False):
    predictions = embed_hidden_states(model, generated_output.hidden_states, embedding, include_prompt=include_prompt, multirep=multirep, max_rep=5)
    rows = [[tokenizer.convert_ids_to_tokens(pred) for pred in pred_list] for pred_list in predictions]
    rows = rows if multirep else np.squeeze(rows)
    if embedding == "input":
        cols = gen_tokens["in"] + gen_tokens["gen"][:-1]
    else:
        cols = gen_tokens["in"][1:] + gen_tokens["gen"]
    df = pd.DataFrame(rows).T.sort_index(ascending=False).rename(columns={n: col for n, col in enumerate(cols)})
    df = fix_dataframe_characters(df, fix_characters, multirep=multirep, columns=True)
    return df

def create_attention_visualization(dfs, generated_output, exclude, heads, max_heads, compute_precache=True, multirep=False):
    figure, nodes, edges = create_transformer_plot(dfs, generated_output, exclude, heads=heads, max_heads=max_heads, multirep=multirep)
    edges, permanent_edges = edges
    permanent_traces = {head: {"edges": permanent_edges[head]} for head in heads}
    edge_precache = compute_edge_precache(nodes, edges, permanent_edges, heads) if compute_precache else None
    return figure, {"nodes": nodes, "edges": edges, "perm": permanent_traces}, edge_precache

In [123]:
def cumulative_attention_traces_separate(
    df, attentions,         # Dataframe and attentions to access labels and attention weights
    row, index, el_index,   # Dataframe is indexed by index and row, while el_index references the index for sankey visualization
    base,                   # Base attention value of parent
    labels,                 # Current set of labels for sankey visualization
    n_el,                   # Current number of nodes inside cumulative sankey labels list
    rowlimit,               # Depth limit
):
    new_labels = []
    label_indexes = []
    val = []
    # Iterate over all elements of the next row
    for i, label in enumerate(df.iloc[row+1].tolist()):
        # Calculate current value of node by weighting its attention value for the parent's weight
        v = base * attentions[row][index][i]
        if v > 0:
            new_labels.append(label)
            label_indexes.append(i)
            val.append(v)
    # Generate lists for sankey traces, composed by source/target nodes and respective values
    under = [i for i in range(n_el, n_el + len(new_labels))]
    over = [el_index] * len(new_labels)
    # TODO: copy necessary?     
    new_under = under.copy()
    new_val = val.copy()
    # If depth limit is reached, stop recurring
    if row < rowlimit:
        # Call itself on all the new nodes (NOTE: zip only iterates over the smallest list size)
        for u,v,idx in zip(under, val, label_indexes):
            nex_under, nex_over, nex_val, nex_labels = cumulative_attention_traces_separate(df, attentions, row+1, idx, u, v, new_labels, n_el + len(new_labels), rowlimit)
            # Update sankey trace lists and sankey labels list with children's results
            new_under += nex_under
            over += nex_over
            new_val += nex_val
            new_labels += nex_labels
    # Only executed at topmost level
    if el_index == 0:
        # Complete sankey labels list with starting label
        new_labels = labels + new_labels
        
    return new_under, over, new_val, new_labels

def propagate_carry(elmap, carry, anchors):
    for k,v in elmap.items():
        if v["anchor"][0] in anchors:
            elmap[k] |= {"carry": v["carry"] + carry}
            # If the current element has children, propagates carry to them
            if len(v["anchor"]) > 1:
                # Recur while removing joint set of anchors to avoid duplicating carries
                propagate_carry(elmap, carry, [an for an in v["anchor"] if an not in anchors])
    return elmap

def cumulative_attention_traces_join(
    df, attentions,         # Dataframe and attentions to access labels and attention weights
    row, index, el_index,   # Dataframe is indexed by index and row, while el_index references the index for sankey visualization
    base,                   # Base attention value of parent
    labels,                 # Current set of labels for sankey visualization
    elmap,                  # Reference for duplicate nodes as a dictionary indexed with (row, index) and containing a dictionary composed of
                            #  a list of anchors (sankey indexes for the node itself and its immediate children) and a float representing its carry
    rowlimit,               # Depth limit
):
    new_labels = []
    new_label_indexes = []
    reused_labels = []
    reused_values = []
    val = []
    new_elmap = elmap.copy() # TODO: copy necessary?
    # Iterate over all elements of the next row
    for i, label in enumerate(df.iloc[row+1].tolist()):
        # Calculate current value of node by weighting its attention value for the parent's weight
        v = base * attentions[row][index][i]
        if v > 0:
            # If node is already present store its information and propagate the current value to the node and its children
            if (row+1, i) in elmap:
                anchors = elmap[(row+1, i)]["anchor"]
                new_elmap = propagate_carry(new_elmap, v, anchors)
                reused_labels.append(anchors[0])
                reused_values.append(v)
            # If the node is new create a new entry in the element map with a new sankey index and 0 carry
            else:
                new_elmap[(row+1, i)] = {"anchor": [len(new_elmap.keys())], "carry": 0}
                new_labels.append(label)
                new_label_indexes.append(i)
                val.append(v)
    # Generate lists for sankey traces, composed by source/target nodes and respective values
    under = [i for i in range(len(elmap.keys()), len(new_elmap.keys()))] + reused_labels
    over = [el_index] * (len(new_labels) + len(reused_labels))
    val = val + reused_values
    # Update element map by adding children relations to the parent node
    for k,v in new_elmap.items():
        if v["anchor"][0] == el_index:
            v["anchor"].extend(under)
    # TODO: copy necessary?       
    new_under = under.copy()
    new_val = val.copy()
    # If depth limit is reached, stop recurring
    if row < rowlimit:
        # Call itself on all the new nodes (NOTE: zip only iterates over the smallest list size)
        for u,v,idx in zip(under, val, new_label_indexes):
            nex_under, nex_over, nex_val, nex_labels, new_elmap = cumulative_attention_traces_join(df, attentions, row+1, idx, u, v, new_labels, new_elmap, rowlimit)
            # Update elements map, sankey trace lists and sankey labels list with children's results
            new_elmap = new_elmap
            new_under += nex_under
            over += nex_over
            new_val += nex_val
            new_labels += nex_labels
    # Only executed at topmost level
    if el_index == 0:
        # Complete sankey labels list with starting label
        new_labels = labels + new_labels
        # Compute carries from reused labels
        np_over = np.array(over)
        np_under = np.array(new_under)
        for el in dict(sorted(new_elmap.items(), key=lambda d: d[0])).values():
            el_idx = el["anchor"][0]
            carry = el["carry"]
            if carry > 0:
                carry_idxs = np.where(np_over == el_idx)[0]
                agg_idxs = np.where(np_under == el_idx)[0]
                if carry_idxs.any():
                    n = sum([new_val[agg_idx] for agg_idx in agg_idxs])
                    m = sum([new_val[carry_idx] for carry_idx in carry_idxs])
                    for carry_idx in carry_idxs:
                        new_val[carry_idx] = (new_val[carry_idx] * n) / m
                    
    return new_under, over, new_val, new_labels, new_elmap

def cumulative_attention_traces_join_bfs(
    df, linkinfo,             # Dataframe and link info to access labels and node hidden information
    row, indexes, el_indexes, # Dataframe is indexed by index and row, while el_index references the index for sankey visualization
    bases,                    # Base attention value of parents
    labels,                   # Current set of labels for sankey visualization
    elmap,                    # Reference for duplicate nodes as a dictionary indexed with (row, index) and containing a dictionary composed of
                              #  an id and a base
    rowlimit,                 # Depth limit
):
    new_labels = []
    new_indexes = []
    new_elmap = elmap.copy() # TODO: copy necessary?

    under = []
    over = []
    val = []
    types = []
    # Calculate current value of node by weighting its attention value for the parent's weight
    for index, el_index, base in zip(indexes, el_indexes, bases):
        res = linkinfo["attn_residuals"][index][-(row + 1)]
        # Iterate over all elements of the next row
        for i, label in enumerate(df.iloc[row+1].tolist()):
            v = (base * (1 - res)) * linkinfo["attentions"][row][index][i]
            if v > 0 or index == i:
                over.append(el_index)
                # If node is already present store its information
                if (row+1, i) in new_elmap:
                    under.append(new_elmap[(row+1, i)]["id"])
                    new_elmap[(row+1, i)]["base"] += v
                # If the node is new create a new entry in the element map with a new sankey index 
                else:
                    new_index = len(new_elmap.keys())
                    new_labels.append(label)
                    new_indexes.append(i)
                    under.append(new_index)
                    new_elmap[(row+1, i)] = {"id": new_index, "base": v.item()}
                val.append(v)
                types.append("attention")
            # Residuals
            if index == i:
                res_v = base * res
                over.append(el_index)
                under.append(new_elmap[(row+1, i)]["id"])
                val.append(res_v)
                types.append("attention_residual")
                new_elmap[(row+1, i)]["base"] += res_v
    # If depth limit is reached, stop recurring
    if row < rowlimit:
        # Call itself on all the new nodes
        nex_under, nex_over, nex_val, nex_types, nex_labels, new_elmap = cumulative_attention_traces_join_bfs(
            df, linkinfo,
            row+1, new_indexes, [new_elmap[(row+1, i)]["id"] for i in new_indexes],
            [new_elmap[(row+1, i)]["base"] for i in new_indexes],
            new_labels,
            new_elmap,
            rowlimit
        )
        # Update elements map, sankey trace lists and sankey labels list with children's results
        new_labels += nex_labels
        under += nex_under
        over += nex_over
        val += nex_val
        types += nex_types
    # Only executed at topmost level
    if len(el_indexes) == 1 and el_indexes[0] == 0:
        # Complete sankey labels list with starting label
        new_labels = labels + new_labels
    return under, over, val, types, new_labels, new_elmap


def cumulative_attention_traces_join_mlp_bfs(
    df, linkinfo,             # Dataframe and link info to access labels and node hidden information
    row, indexes, el_indexes, # Dataframe is indexed by index and row, while el_index references the index for sankey visualization
    bases,                    # Base attention value of parents
    labels,                   # Current set of labels for sankey visualization
    elmap,                    # Reference for duplicate nodes as a dictionary indexed with (row, index) and containing a dictionary composed of
                              #  an id and a base
    rowlimit,                 # Depth limit
):
    new_labels = []
    new_indexes = []
    new_elmap = elmap.copy() # TODO: copy necessary?

    under = []
    over = []
    val = []
    types = []
    # Calculate current value of node by weighting its attention value for the parent's weight
    for index, el_index, base in zip(indexes, el_indexes, bases):
        attn_res = linkinfo["attn_residuals"][index][-(row + 1)]
        # Create MLP node
        mlp_index = len(new_elmap.keys())
        new_labels.append([""])
        new_elmap[(row + 1 - 0.75, index)] = {"id": mlp_index, "base": base}
        # Iterate over all elements of the next row
        for i, label in enumerate(df.iloc[row+1].tolist()):
            if i != 0:
                v = (base * (1 - attn_res)) * linkinfo["attentions"][row][index][i]
                if v > 0 or index == i:
                    over.append(mlp_index)
                    # If node is already present store its information
                    if (row+1, i) in new_elmap:
                        under.append(new_elmap[(row+1, i)]["id"])
                        new_elmap[(row+1, i)]["base"] += v
                    # If the node is new create a new entry in the element map with a new sankey index 
                    else:
                        new_index = len(new_elmap.keys())
                        new_labels.append(label)
                        new_indexes.append(i)
                        under.append(new_index)
                        new_elmap[(row+1, i)] = {"id": new_index, "base": v.item()}
                    val.append(v)
                    types.append("attention")
                # Residuals
                if index == i:
                    mlp_res = linkinfo["mlp_residuals"][index][-(row + 1)]
                    # MLP State
                    mlp_state_v = base * ( 1 - mlp_res )
                    over.append(el_index)
                    under.append(mlp_index)
                    val.append(mlp_state_v)
                    types.append("mlp")
                    # MLP Residual
                    mlp_res_v = base * mlp_res
                    over.append(el_index)
                    under.append(mlp_index)
                    val.append(mlp_res_v)
                    types.append("mlp_residual")
                    # Attention Residual
                    attn_res_v = base * attn_res
                    over.append(mlp_index)
                    under.append(new_elmap[(row+1, i)]["id"])
                    val.append(attn_res_v)
                    types.append("attention_residual")
                    new_elmap[(row+1, i)]["base"] += attn_res_v
    # If depth limit is reached, stop recurring
    if row < rowlimit:
        # Call itself on all the new nodes
        nex_under, nex_over, nex_val, nex_types, nex_labels, new_elmap = cumulative_attention_traces_join_mlp_bfs(
            df, linkinfo,
            row+1, new_indexes, [new_elmap[(row+1, i)]["id"] for i in new_indexes],
            [new_elmap[(row+1, i)]["base"] for i in new_indexes],
            new_labels,
            new_elmap,
            rowlimit
        )
        # Update elements map, sankey trace lists and sankey labels list with children's results
        new_labels += nex_labels
        under += nex_under
        over += nex_over
        val += nex_val
        types += nex_types
    # Only executed at topmost level
    if len(el_indexes) == 1 and el_indexes[0] == 0:
        # Complete sankey labels list with starting label
        new_labels = labels + new_labels
    return under, over, val, types, new_labels, new_elmap

def cumulative_attention_traces_join_nodes(
    df, linkinfo,             # Dataframe and link info to access labels and node hidden information
    row, indexes, el_indexes, # Dataframe is indexed by index and row, while el_index references the index for sankey visualization
    bases,                    # Base attention value of parents
    labels,                   # Current set of labels for sankey visualization
    elmap,                    # Reference for duplicate nodes as a dictionary indexed with (row, index) and containing a dictionary composed of
                              #  an id and a base
    rowlimit,                 # Depth limit
):
    new_labels = []
    new_indexes = []
    new_elmap = elmap.copy() # TODO: copy necessary?

    under = []
    over = []
    val = []
    types = []
    # Calculate current value of node by weighting its attention value for the parent's weight
    for index, el_index, base in zip(indexes, el_indexes, bases):
        res_w = linkinfo["residuals"][index][-(row + 1)].item()
        attn_w = linkinfo["attn_states"][index][-(row + 1)].item()
        mlp_w = linkinfo["mlp_states"][index][-(row + 1)].item()
        # Create MLP / Attention / Intermediate nodes
        mlp_index = len(new_elmap.keys())
        new_labels.append(["%##%FFNN"])
        new_elmap[(row + 1 - 0.8, index - 0.3)] = {"id": mlp_index, "base": base * mlp_w, "base_pow": pow(base * mlp_w, 0.5)}
        attn_index = len(new_elmap.keys())
        new_labels.append(["%##%Attention"])
        new_elmap[(row + 1 - 0.45, index - 0.3)] = {"id": attn_index, "base": base * attn_w, "base_pow": pow(base * attn_w, 0.5)}
        hid_index = len(new_elmap.keys())
        new_labels.append(["%##% "])
        new_elmap[(row + 1 - 0.65, index)] = {"id": hid_index, "base": base, "base_pow": pow(base, 0.5) }
        # Iterate over all elements of the next row
        for i, label in enumerate(df.iloc[row+1].tolist()):
            v = base * attn_w * linkinfo["attentions"][row][index][i].item()
            if v > 0:
                over.append(attn_index)
                # If node is already present store its information
                if (row+1, i) in new_elmap:
                    under.append(new_elmap[(row+1, i)]["id"])
                    new_elmap[(row+1, i)]["base"] += v
                    new_elmap[(row+1, i)]["base_pow"] += pow(v, 0.5)
                # If the node is new create a new entry in the element map with a new sankey index 
                else:
                    new_index = len(new_elmap.keys())
                    new_labels.append(label)
                    new_indexes.append(i)
                    under.append(new_index)
                    new_elmap[(row+1, i)] = {"id": new_index, "base": v, "base_pow": pow(v, 0.5)}
                val.append(v)
                types.append("attention")
        # MLP State
        over.append(el_index)
        under.append(mlp_index)
        val.append(base * mlp_w)
        types.append("mlp")
        over.append(mlp_index)
        under.append(hid_index)
        val.append(base * mlp_w)
        types.append("mlp")
        # Attention State
        over.append(hid_index)
        under.append(attn_index)
        val.append(base * attn_w)
        types.append("att")
        # Residuals
        over.append(hid_index)
        under.append(new_elmap[(row+1, index)]["id"])
        val.append(base * (res_w + mlp_w))
        types.append("residual")
        new_elmap[(row+1, index)]["base"] += base * (res_w + mlp_w)
        new_elmap[(row+1, index)]["base_pow"] = pow(base * (res_w + mlp_w), 0.5)
        over.append(el_index)
        under.append(hid_index)
        val.append(base * (attn_w + res_w))
        types.append("residual")
        
    # If depth limit is reached, stop recurring
    if row < rowlimit:
        # Call itself on all the new nodes
        nex_under, nex_over, nex_val, nex_types, nex_labels, new_elmap = cumulative_attention_traces_join_nodes(
            df, linkinfo,
            row+1, new_indexes, [new_elmap[(row+1, i)]["id"] for i in new_indexes],
            [new_elmap[(row+1, i)]["base"] for i in new_indexes],
            new_labels,
            new_elmap,
            rowlimit
        )
        # Update elements map, sankey trace lists and sankey labels list with children's results
        new_labels += nex_labels
        under += nex_under
        over += nex_over
        val += nex_val
        types += nex_types
    # Only executed at topmost level
    if len(el_indexes) == 1 and el_indexes[0] == 0:
        # Complete sankey labels list with starting label
        new_labels = labels + new_labels
    return under, over, val, types, new_labels, new_elmap

In [19]:
residuals = ()
attn_states = ()
mlp_states = ()
prompt = "Beauty is in the eye of the"
gen_config = GenerationConfig(
    pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else None,
    output_attentions=True, output_hidden_states=True, return_dict_in_generate=True
)
text_output, generated_output, gen_tokens = model_generate(
    model, tokenizer, prompt, 
    max_extra_length=10, 
    config=gen_config, 
    min_stop_length=1, stopping_tokens=stopgen_tokens
)   
dfs = {}
for emb in ["input", "output", "interpolate"]:
    dfs[emb] = create_hidden_states_df(
        model, tokenizer, generated_output, gen_tokens, emb, 
        include_prompt=True, fix_characters=fix_characters,
        multirep=False,
    )
attentions = compute_batch_complete_padded_attentions(generated_output, range(0, 32))
#attn_res_contrib = fix_generated_order(attn_residuals)
res_contrib = fix_generated_order(residuals)
attn_contrib = fix_generated_order(attn_states)
mlp_contrib = fix_generated_order(mlp_states)
res_percent = []
attn_percent = []
mlp_percent = []
#mlp_res_percent = []
#for att_res_token, att_token, mlp_token in zip(attn_res_contrib, attn_contrib, mlp_contrib):
#    att_tokenlayer_list = []
#    mlp_tokenlayer_list = []
#    for att_res_tokenlayer, att_tokenlayer, mlp_tokenlayer in zip(att_res_token, att_token, mlp_token):
#        att_tokenlayer_list.append(att_res_tokenlayer.norm() / (att_res_tokenlayer.norm() + att_tokenlayer.norm()))
#         mlp_tokenlayer_list.append( 1 - (mlp_tokenlayer.norm() / ((att_res_tokenlayer + att_tokenlayer).norm() + mlp_tokenlayer.norm())) )
#    attn_res_percent.append(att_tokenlayer_list)
#     mlp_res_percent.append(mlp_tokenlayer_list)
#    attn_contrib = fix_generated_order(attn_states)
for res_token, att_token, mlp_token in zip(res_contrib, attn_contrib, mlp_contrib):
    res_tokenlayer_list = []
    att_tokenlayer_list = []
    mlp_tokenlayer_list = []
    for res_tokenlayer, att_tokenlayer, mlp_tokenlayer in zip(res_token, att_token, mlp_token):
        den = res_tokenlayer.norm() + att_tokenlayer.norm() + mlp_tokenlayer.norm()
        res_tokenlayer_list.append(res_tokenlayer.norm() / den)
        att_tokenlayer_list.append(att_tokenlayer.norm() / den)
        mlp_tokenlayer_list.append(mlp_tokenlayer.norm() / den)
    res_percent.append(res_tokenlayer_list)
    attn_percent.append(att_tokenlayer_list)
    mlp_percent.append(mlp_tokenlayer_list)

In [239]:
### PARAMETERS
# DATA
df = dfs["interpolate"] # Dataframe to take labels from
row_index = 0 # Row of starting token (where 0 corresponds to the top row, and n_layers - 1 corresponds to the bottom row)
token_index = 9 # Position index of starting token (where 0 is first token of the input sequence)
rowlimit = 5 # Limit number of layers to visualize
multirep = False # Accomodate for each token having multiple labels
label_marker = "%##%" # Marker for placeholder labels (should not be contained in actual tokens)
# COLORS
colormap = cycle(["#FF6692"]) # Colors -- colormap = cycle(px.colors.qualitative.Plotly)
#colormap = cycle(px.colors.qualitative.Plotly)
color_change_count_threshold = 4 # Number of virtual rows that should have the same color associated to them # TODO fix second row colors
color_brightness_range = (-0.5, 0.2) # Brightness range for tokens color gradient
node_opacity = 0.7 # Opacity of nodes
link_opacity = 0.4 # Opacity of links
non_residual_link_color = (100, 100, 100) # Default color for non-resiudal links
default_node_color = (220, 220, 220) # Default color for nodes
color_nodes = False # If set to true, color nodes based on the colormap, otherwise all nodes will have their default color
# LAYOUT
sankey_zero = 0.000000000000001 # Correction to avoid feeding nodes with a coordinate value of 0, which causes problems with Plotly Sankey Diagrams
size = 1200 # Size of square canvas

### DERIVED PARAMETERS
att = attentions[-1] # Attentions
linkinfo = {"attentions": att, "residuals": res_percent, "attn_states": attn_percent, "mlp_states": mlp_percent} # Aggregated intermediate weights information
token_label = df.iloc[row_index].iloc[token_index] # Label information of the initial token

# Rescales values of a list inside a given range, if invert is set to True, the range is flipped
def rescale_list(l, range_min=0, range_max=1, old_min=None, old_max=None, invert=False):
    if old_max == None:
        old_max = max(l)
    if old_min == None:
        old_min = min(l)
    old_range = old_max - old_min
    new_range = range_max - range_min

    invert_k = 0
    invert_a = 1
    if invert:
        invert_k = old_max
        invert_a = -1

    return [ range_min + (((invert_k + (invert_a * (el - old_min))) * new_range ) / old_range) for el in l ]

# Given a list and a list of indexes that have been previously sorted, restore the original order of the list
def restore_list_order(l, indexes):
    return [l[indexes.index(i)] for i in range(0, len(indexes))]

# Return a list of RGBA color strings given a list of RGBA colors tuples
def build_rgba_from_tuples(l, opacity=1.0):
    return [f"rgba{el + (opacity,)}" if len(el) == 3 else f"rgba{el}" for el in l]

# Generate diagram data
un, ov, vl, types, lab, elmap = cumulative_attention_traces_join_nodes(
    df, linkinfo, 
    row_index, [token_index], [0], 
    [1.0], 
    [token_label], 
    {(row_index, token_index): {"id": 0, "base": 1.0, "base_pow": 1}},
    rowlimit
)

# Handle multiple labels for tokens with multiple representations
multirefs = []
if multirep:
    multirefs = lab
    lab = [l[0] for l in lab]
else:
    lab = [np.squeeze(l).item() for l in lab]
# Generate numbered labels
lab = [f"{k[1]} {lab[v['id']]}" if not lab[v['id']].startswith(label_marker) else lab[v['id']].split(label_marker)[-1] for k,v in elmap.items()]

#elmap = {k:el | {"base": pow(el["base"], 0.5)} for k,el in elmap.items()}
base_pows = [el["base_pow"] for el in elmap.values()]
print(elmap[(2,0)])
#elmap = {k:el | {"base": rescale_list([el["base_pow"]], range_min=sankey_zero, range_max=1.0,  old_min=min(base_pows), old_max=max(base_pows))[0]} for k,el in elmap.items()}
#elmap = {k:el | {"base": el["base_pow"]} for k,el in elmap.items()}
elmap = {k:el | {"base": el["base"]/3} for k,el in elmap.items()}
print(elmap[(2,0)])
vl = [el/3 for el in vl]

# Create reverse mapping obtaining lists indexed by the node id and containing virtual coordinates and node values
revmap = [next(k for k,v in elmap.items() if v["id"] == i) for i in range(len(elmap.keys()))]
revmap_values = [next(v["base"] for k,v in elmap.items() if v["id"] == i) for i in range(len(elmap.keys()))]
revmap_x = [key[0] for key in revmap]
revmap_y = [key[1] for key in revmap]
# Sort reverse-mapped lists to perform transformations on them with more ease, while keeping an index list to reverse the sorting
revmap_indexes = [i for i in range(0,len(revmap))]
revmap_x_sort, revmap_y_sort, revmap_values_sort, revmap_indexes = zip(*sorted(zip(revmap_x, revmap_y, revmap_values, revmap_indexes), key=lambda x: x[0]))

### Build colors
node_colors = []
link_colors = []
current_color = next(colormap)
old_x = -1
change_count = 0
# Node colors
for x, y in zip(revmap_x_sort, rescale_list(revmap_y_sort, range_min=color_brightness_range[0], range_max=color_brightness_range[1])):
    # Color switching
    if x != old_x:
        if change_count > color_change_count_threshold:
            current_color = next(colormap)
            change_count = 0
        change_count += 1
    # Compute the new color by summing a delta given by the position of the token inside the sequence, appropriately rescaled inside the color_brightness_range
    delta_color = tuple([int((channel) * y) for channel in px.colors.hex_to_rgb(current_color)])
    color = tuple([sum(channel) for channel in zip(px.colors.hex_to_rgb(current_color), delta_color)])
    node_colors.append(color)
    old_x = x
node_colors = restore_list_order(node_colors, revmap_indexes)
# Link colors
link_colors = [node_colors[el] if typ in ["residual"] else non_residual_link_color for typ, el in zip(types, un)]
# Remove colors from nodes if selected
if not color_nodes:
    node_colors = [default_node_color for _ in node_colors]
# Convert colors and add opacities
node_colors = build_rgba_from_tuples(node_colors, node_opacity)
link_colors = build_rgba_from_tuples(link_colors, link_opacity)

# Compute columns width
col_pad = 0#0.001 / 2
columns_width = [max([v if y == y_index else 0 for (y, v) in zip(revmap_y_sort, revmap_values_sort)]) for y_index in range(0, 10)]
s = sum(columns_width) + col_pad * len(columns_width)
columns_width = [w/s + col_pad for w in columns_width]
print(columns_width)
columns_ys = []
tot_w = 0
for w in columns_width:
    columns_ys.append(tot_w)
    tot_w += w
print(columns_ys)

fixed_padding = 0#0.03
fixed_offset = 0.01
align = "left"
### Adjust coordinates 
revmap_x = rescale_list(revmap_x, range_min=sankey_zero, range_max=1, invert=False)
revmap_y = rescale_list(revmap_y, range_min=sankey_zero, range_max=1)
revmap_y_sort_scaled = rescale_list(revmap_y_sort, range_min=sankey_zero, range_max=1)
new_revmap_y = []
old_x = -1
old_y = sankey_zero
layers = cycle(["Node", "FFNN", "Hidden", "Attention"])
offsets = {"Node": 4, "FFNN": 1, "Hidden": 2, "Attention": 3}
cur_layer = None
counter = 0
prev_layer_n = 0
align_flag = True
for i, (x, y, y_scaled, v) in enumerate(zip(revmap_x_sort, revmap_y_sort, revmap_y_sort_scaled, revmap_values_sort)):
    # Switch row
    if x != old_x:
        next_layer = next(layers)
        # Keep track of the size of the previous batch of nodes
        if cur_layer == "Node" and next_layer != "Node":
            prev_layer_n = counter
            # If the previous batch of nodes wasn't the single node representing the starting token and
            #  some kind of alignment is specified, then start aligning token nodes to the previous ones
            if prev_layer_n > 1 and align != "none":
                align_flag = False
        old_y = sankey_zero
        cur_layer = next_layer
        counter = 0

    # If the current node is a token node, and there is no current alignment, give it an incremental y coordinate 
    #  considering its size given by its value
    if cur_layer in ["Node"] and align_flag:
        new_y = old_y + v/2 + fixed_padding
        old_y = old_y + v + fixed_padding
    # If the current node needs to be aligned, align it considering its reference node
    else:
        # Compute the index of the reference node and the new y coordinate considering the sizes of both the reference and the current node
        idx = i - (prev_layer_n * offsets[cur_layer]) 
        new_y = new_revmap_y[idx] - revmap_values_sort[idx]/2 + v/2

        # Apply a fixed offset for FFNN and Attention nodes
        if cur_layer in ["FFNN", "Attention"]:
            new_y -= fixed_offset
        elif cur_layer in ["Node"]:
            if align == "right": 
                new_y = new_revmap_y[idx] + revmap_values_sort[idx]/2 - v/2
            elif align == "left":
                pass
            elif align == "center":
                new_y = new_revmap_y[idx]
    if type(y) is not float:
        new_y = columns_ys[y] + v/2
    #elif y == 9:
    #    print(cur_layer)
    #    new_y = 1.0 + v/2
    #elif y == 0:
    #    new_y = sankey_zero - v/2
    new_revmap_y.append(new_y) 

    old_x = x
    counter += 1

revmap_y = restore_list_order(new_revmap_y, revmap_indexes)
#revmap_y = rescale_list(revmap_y, range_min=sankey_zero, range_max=1)

fig = go.Figure(go.Sankey(
    orientation = "v",
    arrangement="fixed",
    valueformat=".5r",
    node=dict(
        customdata=multirefs,
        hovertemplate='%{customdata} - Value: %{value}<extra></extra>',
        align="left",
        label=lab,
        color=node_colors,
        x=revmap_x,
        y=revmap_y,
        pad=20,
    ),
    link=dict(
        customdata=types,
        hovertemplate='%{customdata} from %{source.label} to %{target.label} <extra>%{value}</extra>',
        source=ov,
        target=un,
        value=vl,
        color=link_colors
    )
))
fig.update_layout(
    font_size=12, font_family="Verdana", font_color="black",
    width=size, height=size,
    #margin={'t':200, 'r': 0}
    #margin_pad = 1000,
    #margin_b = -800,
    #autosize=False,
    #margin_autoexpand=False
)
fig.show()

{'id': 17, 'base': 0.05889354925638619, 'base_pow': 0.5096024020615374}
{'id': 17, 'base': 0.019631183085462064, 'base_pow': 0.5096024020615374}
[0.2752416995181582, 0.013838925615117507, 0.021714781454234482, 0.030921507009138816, 0.041135694431500794, 0.028712847763758296, 0.019638867786255353, 0.04477276092976474, 0.022429790973947244, 0.5015931245181245]
[0, 0.2752416995181582, 0.2890806251332757, 0.3107954065875102, 0.341716913596649, 0.3828526080281498, 0.4115654557919081, 0.43120432357816346, 0.4759770845079282, 0.49840687548187546]


In [None]:
un, ov, vl, lab, elmap = cumulative_attention_traces_join(
    df, attentions[-1], 
    row_index, token_index, 0, 
    1.0, 
    [token_label], 
    {(row_index, token_index): {"anchor":[0], "carry":0}},
    rowlimit
)

if multirep:
    multirefs = lab
    lab = [l[0] for l in lab]
if elmap:
    lab = [f"{k[1]} {lab[v['anchor'][0]]}" for k,v in elmap.items()]
else:
    lab = [f"{i} {l}" for i,l in enumerate(lab)]

fig = go.Figure(go.Sankey(
    orientation = "h",
    arrangement="snap",
    valueformat=".5r",
    node=dict(
        customdata=multirefs,
        hovertemplate='%{customdata} - Value: %{value}<extra></extra>',
        align="left",
        label=lab,
        pad=50
    ),
    link=dict(
       source=un,
       target=ov,
       value=vl
    )
))
fig.update_layout(font_size=10, width=900, height=900)

fig.show()

In [None]:
n_tok = len(dfs["output"].columns)

attentions = compute_batch_complete_padded_attentions(generated_output, range(0, 32))

# for key, df in dfs.items():        
nodexs = []
nodeys = []
labels = []
# Cycle through every layer of the model, gathering all blocks as nodes
for idx, row in dfs["output"].iterrows():
    # Generate coordinates for nodes
    xs = [i for i in range(len(row))]
    ys = [idx] * len(row)

    nodexs.append(xs)
    nodeys.append(ys)

    for x, y in zip(xs, ys):
        labels.append(row.iloc[x])

s = np.repeat([i for i in range(0, len(labels) - n_tok)], n_tok).astype(int).tolist()
t = np.array([[j for j in range(i, i + n_tok)] * n_tok for i in range(n_tok, len(labels), n_tok)]).flatten().astype(int).tolist()
v = np.array([[attentions[-1][i][j][::-1] for j in range(0, n_tok)][::-1] for i in reversed(range(0, 32))]).flatten()[::-1].astype(float).tolist()
s = [el_s for el_s, el_v in zip(s, v) if el_v > 0]
t = [el_t for el_t, el_v in zip(t, v) if el_v > 0]
v = [el_v for el_v in v if el_v > 0]

#nodexs = np.array(nodexs).flatten() 
#nodexs = (nodexs / nodexs.max()) * 0.1 + 0.0000001
#nodeys = np.array(nodeys).flatten()
#nodeys = (nodeys / nodeys.max()) * 3 + 0.0000001

# Vertical
#fig = go.Figure(go.Sankey(
#    orientation = "v",
#    arrangement='fixed',
#    valueformat=".5r",
#    node=dict(
#        align="left",
#        label=[f"{i} {l}" for i,l in enumerate(labels)],
#        y=nodexs,
#        x=nodeys[::-1],
#        pad=50
#    ),
#    link=dict(
#       source=s,
#       target=t,
#       value=v
#    )
#))
#fig.update_layout(font_size=10, width=1000, height=2700)

y_offset = 0

nodexs = np.array(nodexs).flatten() 
nodexs = (nodexs / nodexs.max()) + 0.0000001
nodeys = np.array(nodeys).flatten()
nodeys = (nodeys / nodeys.max() - y_offset) + 0.0000001

if multirep:
    multirefs = labels
    labels = [l[0] for l in labels]
    labels = [f"{i} {l}" for i,l in enumerate(labels)]

fig = go.Figure(go.Sankey(
    orientation = "h",
    arrangement="fixed",
    valueformat=".5r",
    node=dict(
        customdata=multirefs,
        hovertemplate='%{customdata} - Value: %{value}<extra></extra>',
        align="left",
        label=labels,
        y=nodexs,
        x=nodeys,
        pad=80
    ),
    link=dict(
       source=t,
       target=s,
       value=v
    )
))
fig.update_layout(font_size=10, width=4000, height=1000)

fig.show()