# ✎ Load Model

## Overview

This notebook illustrates how to instantiate models in fairseq2, using the LLaMA model as an example.

In [1]:
from fairseq2 import setup_fairseq2

# Always call setup_fairseq2() before using any fairseq2 functionality.
# Run this cell first after every kernel restart: every later cell in this
# notebook uses fairseq2 APIs.
setup_fairseq2()

All models in fairseq2 inherit from PyTorch's `nn.Module` and therefore provide standard PyTorch functionality. Their configuration can be easily customized.

In [2]:
from fairseq2.models.llama import LLaMAConfig, LLaMAFactory
from fairseq2.data import VocabularyInfo

# Width of the model; the FFN inner dimension below is derived from it.
MODEL_DIM = 2048

# Vocabulary layout: 32k entries with dedicated UNK/BOS/EOS indices and no
# padding index.
custom_vocab_info = VocabularyInfo(
    size=32000,
    unk_idx=0,
    bos_idx=1,
    eos_idx=2,
    pad_idx=None,
)

# Hand-written configuration for a 16-layer decoder.
custom_config = LLaMAConfig(
    model_dim=MODEL_DIM,
    max_seq_len=4096,              # longest supported sequence
    vocab_info=custom_vocab_info,
    num_layers=16,                 # transformer decoder layers
    num_attn_heads=32,             # attention (query) heads
    num_key_value_heads=8,         # fewer key/value heads than attention heads
    ffn_inner_dim=4 * MODEL_DIM,   # feed-forward inner width
    dropout_p=0.1,                 # dropout probability
)

# Build the model from the configuration; the bare expression on the last
# line makes the notebook display the module tree.
custom_factory = LLaMAFactory(custom_config)
model = custom_factory.create_model()
model

TransformerDecoderModel(
  model_dim=2048
  (decoder_frontend): TransformerEmbeddingFrontend(
    model_dim=2048
    (embed): StandardEmbedding(num_embeddings=32000, embedding_dim=2048)
    (pos_encoder): None
    (layer_norm): None
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): StandardTransformerDecoder(
    model_dim=2048, self_attn_mask_factory=CausalAttentionMaskFactory(), norm_order=PRE
    (layers): ModuleList(
      (0-15): 16 x StandardTransformerDecoderLayer(
        model_dim=2048, norm_order=PRE
        (self_attn_layer_norm): RMSNorm(normalized_shape=(2048,), eps=1E-05, elementwise_affine=True)
        (self_attn): StandardMultiheadAttention(
          num_heads=32, model_dim=2048, num_key_value_heads=8
          (q_proj): Linear(input_dim=2048, output_dim=2048, bias=False, init_fn=init_qkv_projection)
          (k_proj): Linear(input_dim=2048, output_dim=512, bias=False, init_fn=init_qkv_projection)
          (v_proj): Linear(input_dim=2048, output_dim=512,

You can also fetch configuration presets from the model hub.

In [8]:
from fairseq2.models.llama import get_llama_model_hub, LLaMAFactory

# Obtain the LLaMA model hub, which resolves presets by name.
model_hub = get_llama_model_hub()
# Fetch the configuration of the preset; the result is an `LLaMAConfig`
# (displayed in the next cell) that is passed to `LLaMAFactory` further below.
model_config = model_hub.load_config("llama3_2_1b")  # use llama3.2 1b preset as an example

In [9]:
# Display the preset's settings (model size, vocabulary info, RoPE scaling, dropout, ...).
model_config

LLaMAConfig(model_dim=2048, max_seq_len=131072, vocab_info=VocabularyInfo(size=128256, unk_idx=None, bos_idx=128000, eos_idx=128001, pad_idx=None, boh_idx=None, eoh_idx=None), num_layers=16, num_attn_heads=32, num_key_value_heads=8, ffn_inner_dim=8192, ffn_inner_dim_scale=0.6666666666666666, ffn_inner_dim_multiplier=1.5, ffn_inner_dim_to_multiple=256, rope_theta=500000.0, use_scaled_rope=True, rope_scaling=LLaMARopeScalingConfig(factor=32.0, frequency_factors=(1.0, 4.0), original_context_length=8192), dropout_p=0.1)

In [10]:
# Instantiate the architecture described by the preset configuration and
# display its module tree.
preset_factory = LLaMAFactory(model_config)
llama_model = preset_factory.create_model()
llama_model

TransformerDecoderModel(
  model_dim=2048
  (decoder_frontend): TransformerEmbeddingFrontend(
    model_dim=2048
    (embed): StandardEmbedding(num_embeddings=128256, embedding_dim=2048)
    (pos_encoder): None
    (layer_norm): None
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): StandardTransformerDecoder(
    model_dim=2048, self_attn_mask_factory=CausalAttentionMaskFactory(), norm_order=PRE
    (layers): ModuleList(
      (0-15): 16 x StandardTransformerDecoderLayer(
        model_dim=2048, norm_order=PRE
        (self_attn_layer_norm): RMSNorm(normalized_shape=(2048,), eps=1E-05, elementwise_affine=True)
        (self_attn): StandardMultiheadAttention(
          num_heads=32, model_dim=2048, num_key_value_heads=8
          (q_proj): Linear(input_dim=2048, output_dim=2048, bias=False, init_fn=init_qkv_projection)
          (k_proj): Linear(input_dim=2048, output_dim=512, bias=False, init_fn=init_qkv_projection)
          (v_proj): Linear(input_dim=2048, output_dim=512

To check which models are registered, we can use the `asset_store` in our runtime context.

In [5]:
from fairseq2.context import get_runtime_context
# The runtime context exposes the asset store, which the next cell queries
# for the names of the registered assets.
context = get_runtime_context()
asset_store = context.asset_store

In [6]:
# Collect every registered asset name, then keep only those that mention
# the Llama 3.1 family.
registered_names = asset_store.retrieve_names()
[name for name in registered_names if "llama3_1" in name]

['llama3_1_8b@',
 'llama3_1_8b_instruct@',
 'llama3_1_70b@',
 'llama3_1_70b_instruct@',
 'llama3_1_8b@awscluster',
 'llama3_1_8b@aws-h100-2',
 'llama3_1_8b_instruct@faircluster',
 'llama3_1_8b_instruct@awscluster',
 'llama3_1_8b_instruct@aws-h100-2',
 'llama3_1_70b@awscluster',
 'llama3_1_70b@aws-h100-2',
 'llama3_1_70b_instruct@faircluster',
 'llama3_1_70b_instruct@awscluster',
 'llama3_1_70b_instruct@aws-h100-2']

A pretrained model can also be loaded directly from the hub.

In [7]:
from fairseq2.models.llama import get_llama_model_hub

model_hub = get_llama_model_hub()
# Load a pre-trained model from the hub by its asset name.
# NOTE(review): presumably this fetches the checkpoint weights, so the first
# run may be slow — confirm against the fairseq2 documentation.
# NOTE(review): this rebinds `model`, which an earlier cell used for the
# model built from `custom_config`.
model = model_hub.load("llama3_2_1b")  # here llama3_2_1b needs to be a registered asset card
model

TransformerDecoderModel(
  model_dim=2048
  (decoder_frontend): TransformerEmbeddingFrontend(
    model_dim=2048
    (embed): StandardEmbedding(num_embeddings=128256, embedding_dim=2048)
    (pos_encoder): None
    (layer_norm): None
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): StandardTransformerDecoder(
    model_dim=2048, self_attn_mask_factory=CausalAttentionMaskFactory(), norm_order=PRE
    (layers): ModuleList(
      (0-15): 16 x StandardTransformerDecoderLayer(
        model_dim=2048, norm_order=PRE
        (self_attn_layer_norm): RMSNorm(normalized_shape=(2048,), eps=1E-05, elementwise_affine=True)
        (self_attn): StandardMultiheadAttention(
          num_heads=32, model_dim=2048, num_key_value_heads=8
          (q_proj): Linear(input_dim=2048, output_dim=2048, bias=False, init_fn=init_qkv_projection)
          (k_proj): Linear(input_dim=2048, output_dim=512, bias=False, init_fn=init_qkv_projection)
          (v_proj): Linear(input_dim=2048, output_dim=512