In [1]:
import os 
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
from typing import List

from mistral.model import ModelArgs, Transformer
from main import generate

from prettytable import PrettyTable

In [2]:
def model_size_and_parameters(model, show_table=False):
    """Print the model's total parameter count and return it in millions.

    Args:
        model: Object exposing ``parameters()`` (and ``named_parameters()``
            when ``show_table`` is true) whose items provide ``numel()``
            and, for the table, ``requires_grad``.
        show_table: When true, additionally print one table row per
            trainable parameter followed by the trainable total. Defaults
            to ``False``, which prints only the size line.

    Returns:
        str: The total number of parameters divided by 1e6, formatted with
        one decimal place (e.g. ``"7241.7"``).
    """
    # Total element count over every parameter, trainable or not.
    model_size = sum(t.numel() for t in model.parameters())
    size_in_millions = f"{model_size/1000**2:.1f}"
    print(f"Model size: {size_in_millions}M parameters")

    if show_table:
        # The table is only built on request so the default path does no
        # extra work and does not need PrettyTable at all.
        table = PrettyTable(["Modules", "Parameters"])
        total_params = 0

        for name, parameter in model.named_parameters():
            # Frozen parameters are left out of the trainable breakdown.
            if not parameter.requires_grad:
                continue
            params = parameter.numel()
            table.add_row([name, params])
            total_params += params

        print(table)
        print(f"Total Trainable Params: {total_params} or {total_params/1000**2:.2f}M parameters")

    return size_in_millions

In [3]:
class DebugTokenizer:
    """Tokenizer stand-in whose "text" is a whitespace-separated list of integer ids."""

    @property
    def bos_id(self) -> int:
        """Id of the beginning-of-sequence token."""
        return 0

    @property
    def eos_id(self) -> int:
        """Id of the end-of-sequence token."""
        return 1

    @property
    def pad_id(self) -> int:
        """Id of the padding token."""
        return -1

    def encode(self, s: str, bos: bool = True) -> List[int]:
        """Parse ``s`` into integer ids, prefixing ``bos_id`` when ``bos`` is truthy."""
        assert isinstance(s, str)
        token_ids = list(map(int, s.split()))
        return [self.bos_id] + token_ids if bos else token_ids

    def decode(self, t: List[int]) -> str:
        """Render the ids in ``t`` as a single space-separated string."""
        return " ".join(map(str, t))


In [4]:

def model_stats(string):
    """Build a Transformer from a textual spec and print its size and CUDA memory summary.

    Args:
        string: Eight whitespace-separated integers, in this order:
            dim, n_layers, head_dim, hidden_dim, n_heads, n_kv_heads,
            sliding_window, vocab_size.

    Raises:
        ValueError: If ``string`` does not contain exactly eight fields or
            a field is not an integer literal.
    """
    fields = string.split()
    if len(fields) != 8:
        raise ValueError(
            f"expected 8 whitespace-separated fields, got {len(fields)}: {string!r}"
        )
    demb, n_layer, d_head, d_FF, n_head, kv_heads, Window, vocab = fields

    args = ModelArgs(
        dim=int(demb),
        n_layers=int(n_layer),
        head_dim=int(d_head),
        hidden_dim=int(d_FF),
        n_heads=int(n_head),
        n_kv_heads=int(kv_heads),
        sliding_window=int(Window),
        norm_eps=1e-5,
        vocab_size=int(vocab),
        max_batch_size=1,
    )

    model = Transformer(args).to("cuda", dtype=torch.float32)
    try:
        model_size_and_parameters(model)
        # print the cuda memory usage
        print(torch.cuda.memory_summary(device=None, abbreviated=False))
    finally:
        # Drop the model even when reporting fails, then hand the cached
        # blocks back to the driver so later measurements are not inflated
        # by memory reserved for this model.
        del model
        torch.cuda.empty_cache()



In [5]:
# Model hyper-parameters, whitespace-separated in the order:
# dim, n_layers, head_dim, hidden_dim, n_heads, n_kv_heads, sliding_window, vocab_size
string =  "4096	32 128	14336	32	8 4096	32000"
demb, n_layer, d_head, d_FF, n_head, kv_heads, Window, vocab = string.split()
args = ModelArgs(
    dim=int(demb),
    n_layers=int(n_layer),
    head_dim=int(d_head),
    hidden_dim=int(d_FF),
    n_heads=int(n_head),
    n_kv_heads=int(kv_heads),
    sliding_window=int(Window),
    norm_eps=1e-5,
    vocab_size=int(vocab),
    max_batch_size=1,
)

model = Transformer(args).to("cuda", dtype=torch.float32)
model_size_and_parameters(model)

# Returns the global free and total GPU memory for a given device using cudaMemGetInfo.
# The difference is device-wide usage in bytes, converted to MiB below.
free_bytes, total_bytes = torch.cuda.mem_get_info()
gpu_mem_usage = (total_bytes - free_bytes) / (2**20)
print(f"GPU memory usage: {gpu_mem_usage:.2f} MB")

# release all the gpu memory: drop the only reference to the model, then
# return the allocator's cached blocks to the driver
del model
torch.cuda.empty_cache()




Model size: 7241.7M parameters
GPU memory usage: 28004.25 MB


NameError: name 'model' is not defined

In [1]:
import pandas as pd

# Load the model statistics table from disk and show it as the cell output.
stats_csv_path = "/home/dosisiddhesh/MISTRAL_EXP/Github_repo/mistral-src/model_stats2.csv"
df = pd.read_csv(stats_csv_path)
df

Unnamed: 0,ModelParam_M,D_emb,Vocal,D_Head,d_FF,N_Layer,N_Head,KV_Head,Window,GPU_use_MB
0,1624.5,4096,100000,128,14336,4,16,8,4096,6580.25
1,1825.9,4096,100000,128,14336,5,16,8,4096,7348.25
2,2027.2,4096,100000,128,14336,6,16,8,4096,8116.25
3,2429.9,4096,100000,128,14336,8,16,8,4096,9652.25
4,2832.6,4096,100000,128,14336,10,16,8,4096,11188.25
5,3235.2,4096,100000,128,14336,12,16,8,4096,12724.25
6,1607.8,4096,100000,128,14336,4,16,4,4096,6532.25
7,1804.9,4096,100000,128,14336,5,16,4,4096,7288.25
8,2002.0,4096,100000,128,14336,6,16,4,4096,8044.25
9,2396.3,4096,100000,128,14336,8,16,4,4096,9556.25


In [4]:
# Order the rows by the ModelParam_M column, smallest first; `df` itself is
# left untouched because sort_values returns a new frame.
df2 = df.sort_values(by="ModelParam_M", ascending=True)

In [6]:
# Write the sorted table back without the index column.
# NOTE(review): this is the same path the table was read from, so the
# original file is overwritten in place.
sorted_stats_csv_path = "/home/dosisiddhesh/MISTRAL_EXP/Github_repo/mistral-src/model_stats2.csv"
df2.to_csv(sorted_stats_csv_path, index=False)

In [7]:
# Display the sorted frame as the cell output.
df2

Unnamed: 0,ModelParam_M,D_emb,Vocal,D_Head,d_FF,N_Layer,N_Head,KV_Head,Window,GPU_use_MB
6,1607.8,4096,100000,128,14336,4,16,4,4096,6532.25
18,1607.8,4096,100000,128,14336,4,16,4,8192,6532.25
12,1624.5,4096,100000,128,14336,4,16,8,8192,6580.25
0,1624.5,4096,100000,128,14336,4,16,8,4096,6580.25
30,1674.9,4096,100000,128,14336,4,32,4,4096,6788.25
42,1674.9,4096,100000,128,14336,4,32,4,8192,6788.25
36,1691.7,4096,100000,128,14336,4,32,8,8192,6836.25
24,1691.7,4096,100000,128,14336,4,32,8,4096,6836.25
19,1804.9,4096,100000,128,14336,5,16,4,8192,7288.25
7,1804.9,4096,100000,128,14336,5,16,4,4096,7288.25


In [4]:
import os

# Select GPU index 1 for this process; set before transformers is imported below.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from transformers import MistralConfig

# Configuration of the published checkpoint; it is not used again in this cell.
config = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")

# Hyper-parameters for the custom model. ModelParam_M, D_Head and GPU_use_MB
# are assigned here but are not passed to MistralConfig below.
ModelParam_M = 1607.8
D_emb = 4096
Vocal = 50000
D_Head = 128
d_FF = 14336
N_Layer = 4
N_Head = 16
KV_Head = 4
Window = 4096
GPU_use_MB = 6532.25

mistral_kwargs = dict(
    vocab_size=Vocal,
    hidden_size=D_emb,
    intermediate_size=d_FF,
    num_hidden_layers=N_Layer,
    num_attention_heads=N_Head,
    num_key_value_heads=KV_Head,
    hidden_act="silu",
    max_position_embeddings=4096 * 32,
    initializer_range=0.02,
    rms_norm_eps=1e-6,
    use_cache=True,
    pad_token_id=None,
    bos_token_id=1,
    eos_token_id=2,
    tie_word_embeddings=False,
    rope_theta=10000.0,
    sliding_window=Window,
    attention_dropout=0.0,
)
custom_config = MistralConfig(**mistral_kwargs)

# Instantiate a causal-LM model from the custom configuration and show its
# parameter count as the cell output.
model_custom = AutoModelForCausalLM.from_config(custom_config)
model_custom.num_parameters()


1282052096

In [6]:
import torch

In [7]:
# Move the custom model to the CUDA device with float32 parameters; the value
# returned by .to() is echoed as the cell output.
model_custom.to("cuda", dtype=torch.float32)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(50000, 4096)
    (layers): ModuleList(
      (0-3): 4 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSN