In [1]:
import os

os.environ['VISIBLE_CUDA_DEVICES'] = '1'

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)


In [3]:
# load model mistralai/Mistral-7B-v0.1 and tokenizer
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device='cuda:0')
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRM

In [6]:
from prettytable import PrettyTable

def model_size_and_parameters(model):
    # Create a PrettyTable for displaying module-wise parameter information
    table = PrettyTable(["Modules", "Parameters"])

    # Calculate the total number of parameters in the model
    model_size = sum(t.numel() for t in model.parameters())

    # Print the total size of the model in megabytes
    print(f"bert-base-uncased size: {model_size/1000**2:.1f}M parameters")

    # Initialize a variable to keep track of the total trainable parameters
    total_params = 0

    # Iterate through named parameters of the model
    for name, parameter in model.named_parameters():
        # Check if the parameter requires gradient (i.e., is trainable)
        if not parameter.requires_grad:
            continue

        # Get the number of parameters in the current module
        params = parameter.numel()

        # Add a row to the PrettyTable with module name and number of parameters
        table.add_row([name, params])

        # Increment the total trainable parameters
        total_params += params

    # Print the PrettyTable with module-wise parameter information
    print(table)

    # Print the total number of trainable parameters in the model
    print(f"Total Trainable Params: {total_params}")

    # Return the total number of trainable parameters
    return total_params

model_size_and_parameters(model)


bert-base-uncased size: 7241.7M parameters
+-------------------------------------------------+------------+
|                     Modules                     | Parameters |
+-------------------------------------------------+------------+
|            model.embed_tokens.weight            | 131072000  |
|      model.layers.0.self_attn.q_proj.weight     |  16777216  |
|      model.layers.0.self_attn.k_proj.weight     |  4194304   |
|      model.layers.0.self_attn.v_proj.weight     |  4194304   |
|      model.layers.0.self_attn.o_proj.weight     |  16777216  |
|       model.layers.0.mlp.gate_proj.weight       |  58720256  |
|        model.layers.0.mlp.up_proj.weight        |  58720256  |
|       model.layers.0.mlp.down_proj.weight       |  58720256  |
|      model.layers.0.input_layernorm.weight      |    4096    |
|  model.layers.0.post_attention_layernorm.weight |    4096    |
|      model.layers.1.self_attn.q_proj.weight     |  16777216  |
|      model.layers.1.self_attn.k_proj.weight  

7241732096

In [4]:
model.to('cuda:1')

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRM

# Loading mistral model with custom config from huggingface

In [16]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
config = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
from transformers import MistralConfig
ModelParam_M,D_emb,Vocal,D_Head,d_FF,N_Layer,N_Head,KV_Head,Window,GPU_use_MB = 1607.8,4096,50000,128,14336,4,16,4,4096,6532.25
custom_config = MistralConfig(
        vocab_size=Vocal,
        hidden_size=D_emb,
        intermediate_size=d_FF,
        num_hidden_layers=N_Layer,
        num_attention_heads=N_Head,
        num_key_value_heads=KV_Head,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        sliding_window=Window,
        attention_dropout=0.0,
    )

model_custom = AutoModelForCausalLM.from_config(custom_config)
model_custom.num_parameters()


1282052096

In [1]:
from transformers import MistralForCausalLM, MistralConfig


In [3]:
D_emb,Vocal,d_head,d_FF,N_Layer,N_Head,KV_Head,Window = 4096,50000,128,14336,2,32,8,8192

custom_config = MistralConfig(
        vocab_size=Vocal,
        hidden_size=D_emb,
        intermediate_size=d_FF,
        num_hidden_layers=N_Layer,
        num_attention_heads=N_Head,
        num_key_value_heads=KV_Head,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        sliding_window=Window,
        attention_dropout=0.0,
    )

In [4]:
model = MistralForCausalLM(custom_config)

In [12]:
model.num_parameters()/1000**2

845.828096

In [3]:
import os

os.environ['VISIBLE_CUDA_DEVICES'] = '1'

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

# load model mistralai/Mistral-7B-v0.1 and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-125m")







tokenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

In [7]:
tokenizer.get_vocab

<bound method PreTrainedTokenizerFast.get_vocab of PreTrainedTokenizerFast(name_or_path='facebook/galactica-125m', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[START_REF]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("[END_REF]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	6: AddedToken("[IMAGE]", rstrip=False, lstrip

In [14]:
# get all the special tokens
tokenizer.added_tokens_encoder

{'<s>': 0,
 '<pad>': 1,
 '</s>': 2,
 '<unk>': 3,
 '[START_REF]': 4,
 '[END_REF]': 5,
 '[IMAGE]': 6,
 '<fragments>': 7,
 '</fragments>': 8,
 '<work>': 9,
 '</work>': 10,
 '[START_SUP]': 11,
 '[END_SUP]': 12,
 '[START_SUB]': 13,
 '[END_SUB]': 14,
 '[START_DNA]': 15,
 '[END_DNA]': 16,
 '[START_AMINO]': 17,
 '[END_AMINO]': 18,
 '[START_SMILES]': 19,
 '[END_SMILES]': 20,
 '[START_I_SMILES]': 21,
 '[END_I_SMILES]': 22}

In [15]:
tokenizer.added_tokens_decoder

{0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 4: AddedToken("[START_REF]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 5: AddedToken("[END_REF]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 6: AddedToken("[IMAGE]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 7: AddedToken("<fragments>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 8: AddedToken("</fragments>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 9: AddedToken("<work>", rstrip=False, lst

In [16]:
tokenizer_m = AutoTokenizer.from_pretrained('/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_2.0%_50000_new')
tokenizer_m.get_vocab


<bound method PreTrainedTokenizerFast.get_vocab of PreTrainedTokenizerFast(name_or_path='/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_2.0%_50000_new', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}>

In [17]:
tokenizer_m.added_tokens_encoder

{'<unk>': 0}

In [18]:
tokenizer_m.added_tokens_decoder

{0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True)}

In [24]:
tokenizer_m.batch_decode([[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]])

['<unk>\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\x0e\x0f']

In [25]:
tokenizer_m2 = AutoTokenizer.from_pretrained('/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_4.0%_50000_new')
tokenizer_m2.get_vocab

<bound method PreTrainedTokenizerFast.get_vocab of PreTrainedTokenizerFast(name_or_path='/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_4.0%_50000_new', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[UNK] ", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	6: AddedT

In [26]:
tokenizer_m2.added_tokens_encoder


{'[PAD]': 0,
 '[CLS]': 1,
 '[SEP]': 2,
 '[MASK]': 3,
 '[UNK] ': 4,
 '[BOS]': 5,
 '[EOS]': 6}

In [29]:
tokenizer_m2.added_tokens_decoder

{0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 4: AddedToken("[UNK] ", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 5: AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
 6: AddedToken("[EOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True)}

In [34]:
tokenizer_m2.pad_token = tokenizer_m2.added_tokens_decoder[0]
tokenizer_m2.cls_token = tokenizer_m2.added_tokens_decoder[1]
tokenizer_m2.sep_token = tokenizer_m2.added_tokens_decoder[2]
tokenizer_m2.mask_token = tokenizer_m2.added_tokens_decoder[3]
tokenizer_m2.unk_token = tokenizer_m2.added_tokens_decoder[4]
tokenizer_m2.bos_token = tokenizer_m2.added_tokens_decoder[5]
tokenizer_m2.eos_token = tokenizer_m2.added_tokens_decoder[6]


0

In [36]:
tokenizer_m2.get_added_vocab

<bound method PreTrainedTokenizerFast.get_added_vocab of PreTrainedTokenizerFast(name_or_path='/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_4.0%_50000_new', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK] ', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[UNK] ", rstrip=False, lstrip=False, singl

In [59]:
tokenizer_m3 = AutoTokenizer.from_pretrained('/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_4.0%_50000_new')
tokenizer_m3.get_vocab

<bound method PreTrainedTokenizerFast.get_vocab of PreTrainedTokenizerFast(name_or_path='/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_4.0%_50000_new', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	6: AddedTo

In [60]:
tokenizer_m3.get_added_vocab

<bound method PreTrainedTokenizerFast.get_added_vocab of PreTrainedTokenizerFast(name_or_path='/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_4.0%_50000_new', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("[BOS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	6: A

In [61]:
tokenizer_m3.unk_token

In [62]:
tokenizer_m3.pad_token_id


In [63]:
tokenizer_m3.bos_token_id

In [64]:
tokenizer_m3.add_special_tokens({'pad_token': '[PAD]',
                                                'cls_token': '[CLS]',
                                                'sep_token': '[SEP]',
                                                'mask_token': '[MASK]',
                                                'unk_token': '[UNK]',
                                                'bos_token': '[BOS]',
                                                'eos_token': '[EOS]'})
tokenizer_m3.get_added_vocab

<bound method PreTrainedTokenizerFast.get_added_vocab of PreTrainedTokenizerFast(name_or_path='/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_4.0%_50000_new', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[UNK]", rstrip=False, lstrip=False, single_

In [65]:
tokenizer_m3.unk_token

'[UNK]'

In [70]:
tokenizer_m4 = AutoTokenizer.from_pretrained('/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_4.0%_50000_new')
tokenizer_m4.get_vocab

<bound method PreTrainedTokenizerFast.get_vocab of PreTrainedTokenizerFast(name_or_path='/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_4.0%_50000_new', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<cls>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<sep>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	6: AddedTo

In [71]:
tokenizer_m4.bos_token = "<bos>"

In [72]:
tokenizer_m4.get_added_vocab

<bound method PreTrainedTokenizerFast.get_added_vocab of PreTrainedTokenizerFast(name_or_path='/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_4.0%_50000_new', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<bos>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<cls>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<sep>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, 

In [73]:
tokenizer_m5 = AutoTokenizer.from_pretrained('/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_0.002%_50000_new')
tokenizer_m5.get_vocab

<bound method PreTrainedTokenizerFast.get_vocab of PreTrainedTokenizerFast(name_or_path='/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_0.002%_50000_new', vocab_size=15505, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<cls>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<sep>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	6: Added

In [75]:
tokenizer_m5.unk_token

# loading model on multiple GPU

In [1]:
import os
device = '1'
os.environ["CUDA_VISIBLE_DEVICES"]='0,1,2,3'
os.environ["CUDA_LAUNCH_BLOCKING"]='1'

import sys
import time
import tqdm
import torch
from pathlib import Path
from evaluate import load
from transformers import (
    Trainer, 
    TrainingArguments, 
    DataCollatorForLanguageModeling, 
    EarlyStoppingCallback, 
#    WandbCallback,
)
os.environ['WANDB_DISABLED'] = 'true'
isf16 = False


# metric = load("perplexity")
code_path = "/home/dosisiddhesh/MISTRAL_EXP/mistral-src"
data_path = "/home/dosisiddhesh/MISTRAL_EXP/data/latex.csv"
model_path = Path("/home/dosisiddhesh/MISTRAL_EXP/model/mistral-7B-v0.1")  # model and tokenizer location
# tokenizer_path_sentence_piece_for_mistral_src = '/home/dosisiddhesh/MISTRAL_EXP/model/tokenizer_5.0%_50000_new.model'
# tokenizer_path_hf_debertv2 = "/home/dosisiddhesh/MISTRAL_EXP/model/tokenizer_5.0%_50000_hf.model"
# tokenizer_path_llama = "hf-internal-testing/llama-tokenizer" #llama
tokenizer_path_hf_our = '/home/dosisiddhesh/MISTRAL_EXP/model/hf_tokenizer_4.0%_50000_new'


sys.path.append(code_path)  # append the path where mistral-src was cloned
from mistral.tokenizer import Tokenizer
from mistral.model import Transformer, ModelArgs
from training_utils import Parameter, MyModel, Dataset_Preprocessing, HyperParams
#__________________________________________________________________________________________________
# +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
D_emb = 4096
Vocal = 50000
d_head = 128
d_FF = 7168 #14336
N_Layer = 4
N_Head = 32
KV_Head = 8
Window = 4096 #8192
data_row = 100
value = [D_emb,Vocal,d_head,d_FF,N_Layer,N_Head,KV_Head,Window]
#**************************************************************************************************
param = Parameter("Mistral", value)
hp = HyperParams(
    epoch=1, 
    learning_rate=6e-4, 
    model_id="mistral/dummy",
    weight_decay=0.1,  
    warmup_steps=50,
    lr_scheduler_type="linear", #['linear', 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', 'constant_with_warmup', 'inverse_sqrt', 'reduce_lr_on_plateau']
    BATCH_SIZE=8,
    tokenizer_batch_size=16,
    eval_steps=50, # Adjust as needed1
    logging_steps=50,  # Adjust as needed
    save_steps=200,
    save_total_limit = 1,
    max_seq_length=int(1024*2),
)
#__________________________________________________________________________________________________
# +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+


# import wandb
# wandb.login()
# os.environ["WANDB_PROJECT"]="Misral"
# WANDB_PROJECT="Misral_sci_tex"
# wandb_run_name = "dummy"

#____________________________________________________________________________________________________________________________
# In[]: GPU stats ***********************************************************************************************************
from pynvml import *

# def print_gpu_utilization():
#     nvmlInit()
#     handle = nvmlDeviceGetHandleByIndex(int(device))
#     info = nvmlDeviceGetMemoryInfo(handle)
#     print(f"GPU memory occupied: {info.used//1024**2} MB.")


# ___________________________________________________________________________________________________________________________
# In[]: preparing the dataset ***********************************************************************************************
dataset_obj = Dataset_Preprocessing(data_path, dataset_batch_size=hp.tokenizer_batch_size, max_seq_length=hp.max_seq_length)
print("Loading tokenizer")
# tokenizer = dataset_obj.load_tokenizer(tok_type="mistral_src", tokenizer_path=tokenizer_path_sentence_piece_for_mistral_src)
#-----------------------------------------------------------------------------------------------------------------------------
# if not os.path.exists(tokenizer_path_hf_debertv2):
#     tokenizer_deberta = DebertaV2Tokenizer(
#         vocab_file  = '/home/dosisiddhesh/MISTRAL_EXP/model/tokenizer_5.0%_50000_new.model',
#         # max_len = 512,
#     )
#     tokenizer_deberta.save_pretrained(tokenizer_path)
# tokenizer = dataset_obj.load_tokenizer(tok_type="debertaV2", tokenizer_path=tokenizer_path_hf_debertv2)
#-----------------------------------------------------------------------------------------------------------------------------
tokenizer = dataset_obj.load_tokenizer(tok_type="hf", tokenizer_path=tokenizer_path_hf_our)
# tokenizer.add_special_tokens({'pad_token': '[PAD]',
#                               'unk_token': '[UNK]',
#                               'mask_token': '[MASK]',
#                               'cls_token': '[CLS]',
#                               'sep_token': '[SEP]',
#                               'bos_token': '[BOS]',
#                               'eos_token': '[EOS]',
#                               })

# print("Tokenizer special tokens:", tokenizer.special_tokens_map)
# print("Tokenizer vocab size:", tokenizer.vocab_size)
# print("Tokenizer bos token:", tokenizer.bos_token)
# print("Tokenizer eos token:", tokenizer.eos_token)
# print("Tokenizer pad token:", tokenizer.pad_token)
# print("Tokenizer unk token:", tokenizer.unk_token)
# print("Tokenizer mask token:", tokenizer.mask_token)
# print("Tokenizer cls token:", tokenizer.cls_token)
# print("Tokenizer sep token:", tokenizer.sep_token)
# print("Tokenizer pad token id:", tokenizer.pad_token_id)
# print("Tokenizer unk token id:", tokenizer.unk_token_id)
# print("Tokenizer mask token id:", tokenizer.mask_token_id)
# print("Tokenizer cls token id:", tokenizer.cls_token_id)
# print("Tokenizer sep token id:", tokenizer.sep_token_id)
# print("Tokenizer bos token id:", tokenizer.bos_token_id)
# print("Tokenizer eos token id:", tokenizer.eos_token_id)
# print("Tokenizer vocab size:", tokenizer.vocab_size)
# decode the token -100 
# input("Press Enter to continue...")
#-----------------------------------------------------------------------------------------------------------------------------
print("Loading and preparing dataset...")
dataset_obj.generate_dataset(rows=data_row, eval_frac=hp.eval_frac)


2023-12-30 18:19:22.902259: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-30 18:19:22.942774: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-30 18:19:22.942811: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-30 18:19:22.944233: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-30 18:19:22.951425: I tensorflow/core/platform/cpu_feature_guar

Loading tokenizer
Loading and preparing dataset...
loading sample dataset of size  100
size of dataframe in MB:  6.497421
Train dataset size:  85
Validation dataset size:  10
Train dataset columns:  Index(['text'], dtype='object')
Validation dataset columns:  Index(['text'], dtype='object')


Map (num_proc=8):   0%|          | 0/85 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=8):   0%|          | 0/10 [00:00<?, ? examples/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


In [2]:
print("Loading model...")
model_obj = MyModel(model_id=hp.model_id, hp=hp)
config = model_obj.get_model_config(param)    # huggingface mistral config
model = model_obj.get_model(param).to("cuda:0", dtype= torch.float32)
print("Total Params:",model_obj.model_size_and_parameters())
print("Original Model Size:",model.dtype)


Loading model...
MISTRAL model size: 929.7M parameters
+------------------------------------------------+------------+
|                    Modules                     | Parameters |
+------------------------------------------------+------------+
|           model.embed_tokens.weight            | 204800000  |
|     model.layers.0.self_attn.q_proj.weight     |  16777216  |
|     model.layers.0.self_attn.k_proj.weight     |  4194304   |
|     model.layers.0.self_attn.v_proj.weight     |  4194304   |
|     model.layers.0.self_attn.o_proj.weight     |  16777216  |
|      model.layers.0.mlp.gate_proj.weight       |  29360128  |
|       model.layers.0.mlp.up_proj.weight        |  29360128  |
|      model.layers.0.mlp.down_proj.weight       |  29360128  |
|     model.layers.0.input_layernorm.weight      |    4096    |
| model.layers.0.post_attention_layernorm.weight |    4096    |
|     model.layers.1.self_attn.q_proj.weight     |  16777216  |
|     model.layers.1.self_attn.k_proj.weight     

In [3]:
model_parallel = torch.nn.DataParallel(model, device_ids=[0,1,2,3])


In [4]:
model_parallel.device_ids

[0, 1, 2, 3]

: 