In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth for the checkpoint id, so the tokenizer and the
# model cannot accidentally be loaded from two different repositories.
MODEL_ID = "eljanmahammadli/AzLlama-152M-Alpaca"

# Load the tokenizer and the causal-LM model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

LlamaModel(
  (embed_tokens): Embedding(32000, 768)
  (layers): ModuleList(
    (0-11): 12 x LlamaDecoderLayer(
      (self_attn): LlamaSdpaAttention(
        (q_proj): Linear(in_features=768, out_features=768, bias=False)
        (k_proj): Linear(in_features=768, out_features=256, bias=False)
        (v_proj): Linear(in_features=768, out_features=256, bias=False)
        (o_proj): Linear(in_features=768, out_features=768, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=768, out_features=3072, bias=False)
        (up_proj): Linear(in_features=768, out_features=3072, bias=False)
        (down_proj): Linear(in_features=3072, out_features=768, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): LlamaRMSNorm()
      (post_attention_layernorm): LlamaRMSNorm()
    )
  )
  (norm): LlamaRMSNorm()
)

In [12]:
import torch
import torch.nn as nn


# Function to count parameters per layer
def get_parameters_per_layer(model):
    """Return ``(name, element_count)`` pairs for the trainable parameters.

    Iterates ``model.named_parameters()`` in the order it yields and keeps
    only the entries whose ``requires_grad`` attribute is truthy.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.

    Returns:
        A list of ``(name, parameter.numel())`` tuples, one per retained
        parameter.
    """
    return [
        (param_name, tensor.numel())
        for param_name, tensor in model.named_parameters()
        if tensor.requires_grad
    ]

# Collect (name, element count) pairs for the trainable parameters of
# model.model, i.e. the inner module rather than the full `model` object.
# NOTE(review): any parameters held outside model.model (e.g. an output
# head on `model`) are not counted here — confirm that is intended.
parameter_details = get_parameters_per_layer(model.model)

# Print one "<name>: <count> parameters" line per collected entry.
for name, num_params in parameter_details:
    print(f"{name}: {num_params} parameters")


embed_tokens.weight: 24576000 parameters
layers.0.self_attn.q_proj.weight: 589824 parameters
layers.0.self_attn.k_proj.weight: 196608 parameters
layers.0.self_attn.v_proj.weight: 196608 parameters
layers.0.self_attn.o_proj.weight: 589824 parameters
layers.0.mlp.gate_proj.weight: 2359296 parameters
layers.0.mlp.up_proj.weight: 2359296 parameters
layers.0.mlp.down_proj.weight: 2359296 parameters
layers.0.input_layernorm.weight: 768 parameters
layers.0.post_attention_layernorm.weight: 768 parameters
layers.1.self_attn.q_proj.weight: 589824 parameters
layers.1.self_attn.k_proj.weight: 196608 parameters
layers.1.self_attn.v_proj.weight: 196608 parameters
layers.1.self_attn.o_proj.weight: 589824 parameters
layers.1.mlp.gate_proj.weight: 2359296 parameters
layers.1.mlp.up_proj.weight: 2359296 parameters
layers.1.mlp.down_proj.weight: 2359296 parameters
layers.1.input_layernorm.weight: 768 parameters
layers.1.post_attention_layernorm.weight: 768 parameters
layers.2.self_attn.q_proj.weight: 589

In [5]:
# Add up the element counts stored in parameter_details (built in an
# earlier cell); the bare name on the last line displays the result.
total_params = sum(count for _name, count in parameter_details)
total_params

128404224

In [7]:
import torch
import torch.nn as nn



# Function to safely format layer names for LaTeX
def latex_safe_string(input_string):
    """Escape LaTeX special characters so the text is typeset literally.

    Underscores are escaped exactly as before; in addition the other
    characters LaTeX treats specially (backslash, ampersand, percent,
    dollar, hash, braces, tilde and caret) are escaped too, so arbitrary
    names cannot break the generated table.

    The replacement is done in a single pass over the input, so the
    backslashes introduced by one escape are never escaped a second time.

    Args:
        input_string: Text to make safe for use in LaTeX body text.

    Returns:
        A new string with every special character replaced by its escaped
        form. Strings made only of letters, digits, dots and underscores
        produce the same result as the previous underscore-only version.
    """
    escapes = {
        '\\': '\\textbackslash{}',
        '&': '\\&',
        '%': '\\%',
        '$': '\\$',
        '#': '\\#',
        '_': '\\_',
        '{': '\\{',
        '}': '\\}',
        '~': '\\textasciitilde{}',
        '^': '\\textasciicircum{}',
    }
    # str.translate maps each source character independently (one pass).
    return input_string.translate(str.maketrans(escapes))

# Function to generate LaTeX table for model parameters
def generate_latex_table(model):
    """Build a LaTeX ``table`` with one row per trainable parameter.

    Every ``(name, parameter)`` pair yielded by ``model.named_parameters()``
    whose ``requires_grad`` is truthy becomes a row holding the name (passed
    through ``latex_safe_string``) and ``parameter.numel()``.

    Args:
        model: Object exposing ``named_parameters()``.

    Returns:
        The complete LaTeX source as a single string, ending in a newline.
    """
    # Fixed opening: float, centering, two centred columns, header row.
    header = (
        "\\begin{table}[h]\n\\centering\n\\begin{tabular}{|c|c|}\n\\hline\n"
        "\\textbf{Layer Name} & \\textbf{Number of Parameters} \\\\\n\\hline\n"
    )
    # Fixed closing: end of tabular, caption, label, end of float.
    footer = "\\end{tabular}\n\\caption{Number of parameters per layer}\n\\label{tab:parameters}\n\\end{table}\n"

    # One "<name> & <count>" row (followed by a rule) per trainable parameter.
    rows = [
        f"{latex_safe_string(param_name)} & {tensor.numel()} \\\\\n\\hline\n"
        for param_name, tensor in model.named_parameters()
        if tensor.requires_grad
    ]

    return header + "".join(rows) + footer

# Build the per-parameter LaTeX table for model.model (the `model` object
# is created in an earlier cell) and print the source so it can be copied.
# generate_latex_table here is the definition from this cell; the same name
# is redefined with different signatures in later cells.
latex_output = generate_latex_table(model.model)
print(latex_output)


\begin{table}[h]
\centering
\begin{tabular}{|c|c|}
\hline
\textbf{Layer Name} & \textbf{Number of Parameters} \\
\hline
embed\_tokens.weight & 24576000 \\
\hline
layers.0.self\_attn.q\_proj.weight & 589824 \\
\hline
layers.0.self\_attn.k\_proj.weight & 196608 \\
\hline
layers.0.self\_attn.v\_proj.weight & 196608 \\
\hline
layers.0.self\_attn.o\_proj.weight & 589824 \\
\hline
layers.0.mlp.gate\_proj.weight & 2359296 \\
\hline
layers.0.mlp.up\_proj.weight & 2359296 \\
\hline
layers.0.mlp.down\_proj.weight & 2359296 \\
\hline
layers.0.input\_layernorm.weight & 768 \\
\hline
layers.0.post\_attention\_layernorm.weight & 768 \\
\hline
layers.1.self\_attn.q\_proj.weight & 589824 \\
\hline
layers.1.self\_attn.k\_proj.weight & 196608 \\
\hline
layers.1.self\_attn.v\_proj.weight & 196608 \\
\hline
layers.1.self\_attn.o\_proj.weight & 589824 \\
\hline
layers.1.mlp.gate\_proj.weight & 2359296 \\
\hline
layers.1.mlp.up\_proj.weight & 2359296 \\
\hline
layers.1.mlp.down\_proj.weight & 2359296 \\
\hl

In [11]:
import torch
import torch.nn as nn
from collections import OrderedDict
from collections import defaultdict


# Function to generate LaTeX table for model parameters
def generate_latex_table(model, leaf_only=False):
    """Build a LaTeX table of trainable-parameter totals per module type.

    Each module yielded by ``model.named_modules()`` is bucketed by its
    class name; for every bucket the table shows how many modules of that
    type were seen and the sum of their trainable parameter counts.

    By default a module's count includes the parameters of all of its
    sub-modules, so container types (the model itself, module lists,
    composite layers) repeat the parameters of their children and the rows
    overlap rather than add up to the model total. Pass ``leaf_only=True``
    to count only the parameters a module owns directly, which gives
    non-overlapping rows.

    Args:
        model: Object exposing ``named_modules()``, whose modules expose
            ``parameters()``.
        leaf_only: When true, call ``module.parameters(recurse=False)`` so
            each parameter is attributed only to the module that directly
            holds it. Defaults to ``False`` (original behaviour).

    Returns:
        LaTeX source for a ``table`` wrapped in a ``landscape`` environment,
        as a single string ending in a newline. Underscores in type names
        are escaped so they are valid in LaTeX text.
    """
    layer_summary = defaultdict(lambda: {'count': 0, 'params': 0})

    for _name, module in model.named_modules():
        owned = module.parameters(recurse=False) if leaf_only else module.parameters()
        params = sum(p.numel() for p in owned if p.requires_grad)
        if params > 0:
            # The class name is the row label; __name__ avoids parsing the
            # repr of the type object.
            layer_type = type(module).__name__
            layer_summary[layer_type]['count'] += 1
            layer_summary[layer_type]['params'] += params

    # Begin LaTeX table code
    pieces = [
        "\\begin{landscape}\n",
        "\\begin{table}[H]\n\\centering\n\\begin{tabular}{|c|c|}\n\\hline\n",
        "\\textbf{Layer Type} & \\textbf{Number of Parameters} \\\\\n\\hline\n",
    ]

    # Fill in table with layer information
    for layer_type, info in layer_summary.items():
        # A raw underscore outside math mode is a LaTeX error, so escape it.
        safe_type = layer_type.replace('_', '\\_')
        pieces.append(f"{safe_type} (x{info['count']}) & {info['params']} \\\\\n\\hline\n")

    pieces.append("\\end{tabular}\n\\end{table}\n\\end{landscape}\n")

    return "".join(pieces)

# Build the per-module-type LaTeX table for model.model (the `model` object
# is created in an earlier cell) and print the source so it can be copied.
# This rebinds latex_output, replacing any value set by an earlier cell.
latex_output = generate_latex_table(model.model)
print(latex_output)

\begin{landscape}
\begin{table}[H]
\centering
\begin{tabular}{|c|c|}
\hline
\textbf{Layer Type} & \textbf{Number of Parameters} \\
\hline
LlamaModel (x1) & 128404224 \\
\hline
Embedding (x1) & 24576000 \\
\hline
ModuleList (x1) & 103827456 \\
\hline
LlamaDecoderLayer (x12) & 103827456 \\
\hline
LlamaSdpaAttention (x12) & 18874368 \\
\hline
Linear (x84) & 103809024 \\
\hline
LlamaMLP (x12) & 84934656 \\
\hline
LlamaRMSNorm (x25) & 19200 \\
\hline
\end{tabular}
\end{table}
\end{landscape}



In [13]:
from collections import defaultdict

def group_parameters_by_layer_type(parameter_details):
    """Sum parameter counts into coarse categories based on their names.

    A parameter belongs to the first category (in the order listed below)
    that has at least one keyword occurring as a substring of the parameter
    name. Parameters that match no keyword are accumulated under
    ``'other'`` instead of being dropped, so the category totals always add
    up to the total of the input.

    Args:
        parameter_details: Iterable of ``(name, count)`` pairs.

    Returns:
        A ``defaultdict(int)`` mapping category name to the summed count.
        Only categories that received at least one parameter are present,
        in order of first appearance.
    """
    # Define categories for grouping; order matters because the first
    # matching category wins.
    categories = {
        'embedding': ['embed_tokens'],
        'attention': ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'self_attn'],
        'mlp': ['gate_proj', 'up_proj', 'down_proj'],
        'normalization': ['input_layernorm', 'post_attention_layernorm', 'norm'],
        'output': ['lm_head']
    }

    # Initialize the dictionary for grouped data
    grouped_data = defaultdict(int)

    # Group data
    for name, count in parameter_details:
        matched = next(
            (
                category
                for category, keywords in categories.items()
                if any(key in name for key in keywords)
            ),
            'other',  # fallback keeps unmatched parameters in the totals
        )
        grouped_data[matched] += count

    return grouped_data



# Aggregate the per-parameter counts in parameter_details (built in an
# earlier cell) into per-category totals.
grouped_parameters = group_parameters_by_layer_type(parameter_details)

# Generate LaTeX code
def generate_latex_table(grouped_data):
    """Render a mapping of category -> count as a two-column LaTeX table.

    Args:
        grouped_data: Mapping whose ``items()`` yields ``(category, count)``
            pairs; rows are emitted in that iteration order and both values
            are inserted as-is.

    Returns:
        The LaTeX source for the table as one string (no trailing newline).
    """
    opening = (
        "\\begin{table}[H]\n\\centering\n\\begin{tabular}{|c|c|}\n\\hline\n"
        "Layer Type & Number of Parameters \\\\\n\\hline\n"
    )
    closing = "\\hline\n\\end{tabular}\n\\caption{Grouped parameter counts by layer type}\n\\end{table}"

    # One "<category> & <count>" row per entry, with no rule between rows.
    body = "".join(
        f"{label} & {total} \\\\\n" for label, total in grouped_data.items()
    )
    return opening + body + closing

# Render the per-category totals as LaTeX and print the source so it can be
# copied; generate_latex_table is the definition from this cell.
latex_code = generate_latex_table(grouped_parameters)
print(latex_code)


\begin{table}[H]
\centering
\begin{tabular}{|c|c|}
\hline
Layer Type & Number of Parameters \\
\hline
embedding & 24576000 \\
attention & 18874368 \\
mlp & 84934656 \\
normalization & 19200 \\
\hline
\end{tabular}
\caption{Grouped parameter counts by layer type}
\end{table}


In [17]:
def detailed_layer_info(parameter_details):
    """Label every parameter with a readable layer name and a group.

    For each ``(name, count)`` pair the dotted name is inspected:

    * a ``layers`` segment followed by another segment: the following
      segment is taken as the layer number and the rest as the sub-layer
      name, giving the label ``"LlamaDecoderLayer <n> - <sub-layer>"`` and
      the group ``"LlamaDecoderLayer <n>"``;
    * an ``embed_tokens`` segment: label and group ``"Embedding"``;
    * an ``lm_head`` segment: label and group ``"Output Head"``;
    * anything else: the name itself, in group ``"Miscellaneous"``.

    Only a leading ``model.`` prefix is stripped from the name, and the
    layer number is read relative to where ``layers`` actually occurs, so
    names with additional leading segments are labelled correctly.

    Args:
        parameter_details: Iterable of ``(name, count)`` pairs.

    Returns:
        A list of ``(layer_label, count, group)`` tuples in input order.
    """
    # Output rows, one per input parameter, in input order.
    grouped_data = []
    prefix = 'model.'

    # Process each parameter detail entry
    for name, count in parameter_details:
        # Drop the wrapper prefix only when it really is a prefix; a plain
        # replace() would also eat 'model.' in the middle of a name.
        simplified_name = name[len(prefix):] if name.startswith(prefix) else name
        parts = simplified_name.split('.')

        # Position of the 'layers' segment, or -1 when it is absent.
        layers_at = parts.index('layers') if 'layers' in parts else -1

        if 0 <= layers_at < len(parts) - 1:
            # Capture layer number and sub-layer detail relative to 'layers'
            layer_number = parts[layers_at + 1]
            sub_layer_name = '.'.join(parts[layers_at + 2:])
            layer_label = f"LlamaDecoderLayer {layer_number} - {sub_layer_name}"
            group = f"LlamaDecoderLayer {layer_number}"
        elif 'embed_tokens' in parts:
            layer_label = 'Embedding'
            group = 'Embedding'
        elif 'lm_head' in parts:
            layer_label = 'Output Head'
            group = 'Output Head'
        else:
            layer_label = simplified_name
            group = 'Miscellaneous'

        grouped_data.append((layer_label, count, group))

    return grouped_data


# Label every entry of parameter_details (built in an earlier cell) with a
# readable layer name and group. This rebinds grouped_parameters to a list
# of (label, count, group) tuples.
grouped_parameters = detailed_layer_info(parameter_details)

def generate_latex_table(grouped_parameters):
    """Render ``(layer_name, count, group)`` rows as a three-column table.

    Underscores are escaped in both text columns (layer name and group) so
    either may contain them without breaking LaTeX. Counts are formatted
    with thousands separators.

    Args:
        grouped_parameters: Iterable of ``(layer_name, param_count, group)``
            tuples; rows are emitted in iteration order.

    Returns:
        The LaTeX source for the table as one string (no trailing newline).
    """
    def _escape_underscores(text):
        # A raw underscore outside math mode is a LaTeX error.
        return text.replace('_', '\\_')

    pieces = [
        "\\begin{table}[H]\n\\centering\n\\begin{tabular}{|l|r|l|}\n\\hline\n",
        "Layer Name & Number of Parameters & Group \\\\\n\\hline\n",
    ]
    for layer_name, param_count, group in grouped_parameters:
        safe_layer_name = _escape_underscores(layer_name)
        safe_group = _escape_underscores(group)
        pieces.append(f"{safe_layer_name} & {param_count:,} & {safe_group} \\\\\n")
    pieces.append(
        "\\hline\n\\end{tabular}\n\\caption{Detailed parameter counts by layer and group}\n\\end{table}"
    )
    return "".join(pieces)

# Render the labelled rows as LaTeX and print the source so it can be
# copied; generate_latex_table is the definition from this cell.
latex_code = generate_latex_table(grouped_parameters)
print(latex_code)


\begin{table}[H]
\centering
\begin{tabular}{|l|r|l|}
\hline
Layer Name & Number of Parameters & Group \\
\hline
Embedding & 24,576,000 & Embedding \\
LlamaDecoderLayer 0 - self\_attn.q\_proj.weight & 589,824 & LlamaDecoderLayer 0 \\
LlamaDecoderLayer 0 - self\_attn.k\_proj.weight & 196,608 & LlamaDecoderLayer 0 \\
LlamaDecoderLayer 0 - self\_attn.v\_proj.weight & 196,608 & LlamaDecoderLayer 0 \\
LlamaDecoderLayer 0 - self\_attn.o\_proj.weight & 589,824 & LlamaDecoderLayer 0 \\
LlamaDecoderLayer 0 - mlp.gate\_proj.weight & 2,359,296 & LlamaDecoderLayer 0 \\
LlamaDecoderLayer 0 - mlp.up\_proj.weight & 2,359,296 & LlamaDecoderLayer 0 \\
LlamaDecoderLayer 0 - mlp.down\_proj.weight & 2,359,296 & LlamaDecoderLayer 0 \\
LlamaDecoderLayer 0 - input\_layernorm.weight & 768 & LlamaDecoderLayer 0 \\
LlamaDecoderLayer 0 - post\_attention\_layernorm.weight & 768 & LlamaDecoderLayer 0 \\
LlamaDecoderLayer 1 - self\_attn.q\_proj.weight & 589,824 & LlamaDecoderLayer 1 \\
LlamaDecoderLayer 1 - self\_at