In [1]:
import torch
import numpy as np
import random
from dotenv import load_dotenv
import os
from huggingface_hub import login

load_dotenv()

True

In [2]:
HF_TOKEN = os.getenv("HF_TOKEN")
login(HF_TOKEN)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.mps.manual_seed(seed)
set_seed(42)
print("Seed set to 42")

Seed set to 42


In [4]:
def get_device():
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device('cpu')
    return device
device = get_device()
print("Using device:", device)

Using device: mps


# Loading Models and Tokenizers

## Tokenizers

### 1. Load the tokenizers using the ```from_pretrained()``` method and print the sizes of the vocabulary for each model.

In [5]:
from transformers import AutoTokenizer

In [6]:
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")

In [7]:
print("Phi Tokenizer Vocabulary Size:", len(phi_tokenizer))
print("Gemma Tokenizer Vocabulary Size:", len(gemma_tokenizer))

Phi Tokenizer Vocabulary Size: 32011
Gemma Tokenizer Vocabulary Size: 256000


### 2. Choose three example sentences of your choice. Tokenize each of them using the tokenizers of each model. Print the token IDs, attention masks, and the corresponding string tokens for each sentence.

In [8]:
sentences = [
    'Germany is located in the middle of Europe.',
    'Saarland shares a border with France and is known for its strong Franco-German ties.',
    'CISPA focuses on information security research and is based in Saarbrücken, Saarland.'
]

In [9]:
print("Phi Tokenizer Outputs:")
phi_tokens = []
for sentence in sentences:
    encoded = phi_tokenizer(sentence, return_tensors='pt')
    token_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    tokens = phi_tokenizer.convert_ids_to_tokens(token_ids[0])
    print(f"\nSentence: {sentence}")
    print("Token IDs:", token_ids[0].tolist())
    print("Attention Mask:", attention_mask[0].tolist())
    print("Tokens:", tokens)
    phi_tokens.append(token_ids[0].tolist())

Phi Tokenizer Outputs:

Sentence: Germany is located in the middle of Europe.
Token IDs: [9556, 338, 5982, 297, 278, 7256, 310, 4092, 29889]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1]
Tokens: ['▁Germany', '▁is', '▁located', '▁in', '▁the', '▁middle', '▁of', '▁Europe', '.']

Sentence: Saarland shares a border with France and is known for its strong Franco-German ties.
Token IDs: [317, 4025, 1049, 29358, 263, 5139, 411, 3444, 322, 338, 2998, 363, 967, 4549, 20923, 29899, 29954, 3504, 260, 583, 29889]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Tokens: ['▁S', 'aar', 'land', '▁shares', '▁a', '▁border', '▁with', '▁France', '▁and', '▁is', '▁known', '▁for', '▁its', '▁strong', '▁Franco', '-', 'G', 'erman', '▁t', 'ies', '.']

Sentence: CISPA focuses on information security research and is based in Saarbrücken, Saarland.
Token IDs: [315, 3235, 7228, 8569, 267, 373, 2472, 6993, 5925, 322, 338, 2729, 297, 317, 4025, 1182, 23075, 29892, 317, 4025, 1049, 29889]
At

In [10]:
print("Gemma Tokenizer Outputs:")
gemma_tokens = []
for sentence in sentences:
    encoded = gemma_tokenizer(sentence, return_tensors='pt')
    token_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    tokens = gemma_tokenizer.convert_ids_to_tokens(token_ids[0])
    print(f"\nSentence: {sentence}")
    print("Token IDs:", token_ids[0].tolist())
    print("Attention Mask:", attention_mask[0].tolist())
    print("Tokens:", tokens)
    gemma_tokens.append(token_ids[0].tolist())

Gemma Tokenizer Outputs:

Sentence: Germany is located in the middle of Europe.
Token IDs: [2, 30988, 603, 7023, 575, 573, 7185, 576, 4238, 235265]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Tokens: ['<bos>', 'Germany', '▁is', '▁located', '▁in', '▁the', '▁middle', '▁of', '▁Europe', '.']

Sentence: Saarland shares a border with France and is known for its strong Franco-German ties.
Token IDs: [2, 7595, 486, 1445, 14324, 476, 9994, 675, 6081, 578, 603, 3836, 604, 1277, 3779, 35961, 235290, 36419, 23572, 235265]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Tokens: ['<bos>', 'Sa', 'ar', 'land', '▁shares', '▁a', '▁border', '▁with', '▁France', '▁and', '▁is', '▁known', '▁for', '▁its', '▁strong', '▁Franco', '-', 'German', '▁ties', '.']

Sentence: CISPA focuses on information security research and is based in Saarbrücken, Saarland.
Token IDs: [2, 93336, 4840, 31381, 611, 2113, 6206, 3679, 578, 603, 3482, 575, 96808, 164937, 235269, 96808, 1445, 235265]
Attent

### 3. Now tokenize all three sentences together as a batch. Use padding. Print the token IDs, attention masks, string tokens, and the maximum sequence length after padding.

In [11]:
phi_batch = phi_tokenizer(sentences, padding=True, return_tensors='pt')

print("Phi Tokenizer Batch Outputs:")
print("Token IDs:", phi_batch['input_ids'])
print("Attention Masks:", phi_batch['attention_mask'])
phi_max_length = phi_batch['input_ids'].shape[1]
print("Maximum Sequence Length after Padding:", phi_max_length)

phi_tokens_batch = [phi_tokenizer.convert_ids_to_tokens(ids) for ids in phi_batch['input_ids']]
print("Tokens:")
for tokens in phi_tokens_batch:
    print(tokens)

Phi Tokenizer Batch Outputs:
Token IDs: tensor([[32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000, 32000,
         32000, 32000, 32000,  9556,   338,  5982,   297,   278,  7256,   310,
          4092, 29889],
        [32000,   317,  4025,  1049, 29358,   263,  5139,   411,  3444,   322,
           338,  2998,   363,   967,  4549, 20923, 29899, 29954,  3504,   260,
           583, 29889],
        [  315,  3235,  7228,  8569,   267,   373,  2472,  6993,  5925,   322,
           338,  2729,   297,   317,  4025,  1182, 23075, 29892,   317,  4025,
          1049, 29889]])
Attention Masks: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Maximum Sequence Length after Padding: 22
Tokens:
['<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>

In [12]:
gemma_batch = gemma_tokenizer(sentences, padding=True, return_tensors='pt')

print("\nGemma Tokenizer Batch Outputs:")
print("Token IDs:", gemma_batch['input_ids'])
print("Attention Masks:", gemma_batch['attention_mask'])
gemma_max_length = gemma_batch['input_ids'].shape[1]
print("Maximum Sequence Length after Padding:", gemma_max_length)

gemma_tokens_batch = [gemma_tokenizer.convert_ids_to_tokens(ids) for ids in gemma_batch['input_ids']]
print("Tokens:")
for tokens in gemma_tokens_batch:
    print(tokens)


Gemma Tokenizer Batch Outputs:
Token IDs: tensor([[     0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      2,  30988,    603,   7023,    575,    573,   7185,    576,
           4238, 235265],
        [     2,   7595,    486,   1445,  14324,    476,   9994,    675,   6081,
            578,    603,   3836,    604,   1277,   3779,  35961, 235290,  36419,
          23572, 235265],
        [     0,      0,      2,  93336,   4840,  31381,    611,   2113,   6206,
           3679,    578,    603,   3482,    575,  96808, 164937, 235269,  96808,
           1445, 235265]])
Attention Masks: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Maximum Sequence Length after Padding: 20
Tokens:
['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<bos>', 'Germany', '▁i

### 4. Decode the token IDs back into the sentences. Decode each sentence as well as the padded batch of sentences. Do not decode the padding tokens. Print the decoded sentences.

In [13]:
print("Decoded Sentences (Phi Tokenizer):")
for input_ids in phi_tokens:
    decoded_sentence = phi_tokenizer.decode(input_ids, skip_special_tokens=True)
    print(decoded_sentence)

Decoded Sentences (Phi Tokenizer):
Germany is located in the middle of Europe.
Saarland shares a border with France and is known for its strong Franco-German ties.
CISPA focuses on information security research and is based in Saarbrücken, Saarland.


In [14]:
decoded_batch = phi_tokenizer.batch_decode(phi_batch['input_ids'], skip_special_tokens=True)
print("Decoded Batch (Phi Tokenizer):")
for sentence in decoded_batch:
    print(sentence)

Decoded Batch (Phi Tokenizer):
Germany is located in the middle of Europe.
Saarland shares a border with France and is known for its strong Franco-German ties.
CISPA focuses on information security research and is based in Saarbrücken, Saarland.


In [15]:
print("Decoded Sentences (Gemma Tokenizer):")
for input_ids in gemma_tokens:
    decoded_sentence = gemma_tokenizer.decode(input_ids, skip_special_tokens=True)
    print(decoded_sentence)

Decoded Sentences (Gemma Tokenizer):
Germany is located in the middle of Europe.
Saarland shares a border with France and is known for its strong Franco-German ties.
CISPA focuses on information security research and is based in Saarbrücken, Saarland.


In [16]:
decoded_batch = gemma_tokenizer.batch_decode(gemma_batch['input_ids'], skip_special_tokens=True)
print("Decoded Batch (Gemma Tokenizer):")
for sentence in decoded_batch:
    print(sentence)

Decoded Batch (Gemma Tokenizer):
Germany is located in the middle of Europe.
Saarland shares a border with France and is known for its strong Franco-German ties.
CISPA focuses on information security research and is based in Saarbrücken, Saarland.


## Models

### 1. Load the models using the ```from_pretrained()``` method on the GPU. Print the model config, total number of parameters, amount of GPU memory utilized, and the dtype of the parameters.

In [17]:
from transformers import AutoModelForCausalLM

In [18]:
initial_memory_gpu = torch.mps.current_allocated_memory()
phi_model = AutoModelForCausalLM.from_pretrained('microsoft/Phi-3.5-mini-instruct').to(device)
final_memory_gpu = torch.mps.current_allocated_memory()
print("Phi Model Config:")
print(phi_model.config)
total_params = sum(p.numel() for p in phi_model.parameters())
print("\nTotal Parameters in Phi Model:", total_params)
gpu_memory = (final_memory_gpu - initial_memory_gpu) / (1024 ** 3)
print("GPU Memory Utilized (Phi Model): {:.2f} GB".format(gpu_memory))
print("Parameter Data Type:", next(phi_model.parameters()).dtype)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Phi Model Config:
Phi3Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "microsoft/Phi-3.5-mini-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3.5-mini-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3.5-mini-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "long_factor": [
      1.0800000429153442,
      1.1100000143051147,
      1.1399999856948853,
      1.3400000333786

In [19]:
initial_memory_gpu = torch.mps.current_allocated_memory()
gemma_model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it').to(device)
final_memory_gpu = torch.mps.current_allocated_memory()
print("Gemma Model Config:")
print(gemma_model.config)
total_params = sum(p.numel() for p in gemma_model.parameters())
print("\nTotal Parameters in Gemma Model:", total_params)
gpu_memory = (final_memory_gpu - initial_memory_gpu) / (1024 ** 3)
print("GPU Memory Utilized (Gemma Model): {:.2f} GB".format(gpu_memory))
print("Parameter Data Type:", next(gemma_model.parameters()).dtype)

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Gemma Model Config:
Gemma2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "google/gemma-2-2b-it",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": [
    1,
    107
  ],
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 256000
}


Total Parameters in Gemma Model: 2614341888


### 2. Generate output with the instruction: <i>Explain the concept of large language models in simple terms for a beginner.</i> using the ```generate()``` method of each model. Generate a maximum of 256 new tokens. Print the model response.

In [22]:
instruction = "Explain the concept of large language models in simple terms for a beginner."

In [26]:
inputs = phi_tokenizer(
    instruction,
    return_tensors='pt',
    padding=True,
    truncation=True,
).to(device)

input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

output_ids = phi_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=256,
    num_return_sequences=1,
    # no_repeat_ngram_size=2,
)

output_text = phi_tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Phi Model Response:")
print(output_text)

Phi Model Response:
Explain the concept of large language models in simple terms for a beginner.


### Answer 
Large language models (LLMs) are advanced computer systems that can understand, generate, and respond to human language in a way that feels natural and conversational. Imagine a super-smart assistant that can read a book, write an essay, or even chat with you about your day. These models are trained on vast amounts of text data, which helps them learn the patterns and nuances of language.


Here's how they work in simple terms:


1. **Training Phase**: The model is fed a huge collection of text from books, websites, and other sources. It learns from this data by recognizing patterns, such as grammar rules, vocabulary, and common phrases.


2. **Understanding**: When you type a question or a sentence, the model uses what it has learned to figure out what you're asking or saying. It doesn't understand language the way we do, but it can predict what comes next based on the patter

In [29]:
output_tokens = phi_tokenizer.encode(output_text, add_special_tokens=False)
# Number of generated tokens = total tokens of output - total tokens of input
print(f"Token count in the generated response: {len(output_tokens)-len(input_ids[0])}")

Token count in the generated response: 256


In [34]:
input_ids = gemma_tokenizer(
    instruction,
    return_tensors='pt',
).to(device)

output_ids = gemma_model.generate(
    **input_ids,
    max_new_tokens=256,
)

output_text = gemma_tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Gemma Model Response:")
print(output_text)

Gemma Model Response:

Explain the concept of large language models in simple terms for a beginner.

Imagine you have a super-smart parrot that has read every book in the world. This parrot can understand your questions, follow your instructions, and even write stories, poems, and articles. That's kind of what a large language model (LLM) is!

**Here's the breakdown:**

* **Large:** These models are trained on massive amounts of text data, like books, articles, websites, and code.
* **Language:** They are designed to understand and generate human language.
* **Model:**  It's a complex mathematical representation of language that allows the model to learn patterns and relationships in words and sentences.

**What can LLMs do?**

* **Answer your questions:**  Think of it like having a super-smart encyclopedia at your fingertips.
* **Write different kinds of creative content:**  From poems and stories to emails and articles, LLMs can help you express yourself.
* **Translate languages:**  

In [36]:
output_tokens = gemma_tokenizer.encode(output_text, add_special_tokens=False)
# Number of generated tokens = total tokens of output - total tokens of input
print(f"Token count in the generated response: {len(output_tokens)-len(input_ids[0])}")

Token count in the generated response: 255


### 3. Generate 3 outputs for each model with the instruction: <i>Write a short story.</i> using top p (nucleus) sampling. Use different combinations of the top_p, top_k, and temperature parameters. Generate a maximum of 256 new tokens. Print the decoding parameters and the generated output.

In [37]:
instruction = "Write a short story."

In [38]:
parameters = [
    {"top_p": 0.9, "top_k": 50, "temperature": 1.0},
    {"top_p": 0.8, "top_k": 30, "temperature": 0.7},
    {"top_p": 0.95, "top_k": 100, "temperature": 0.4},
]

In [40]:
print("Phi Model Responses:")
print()

for i, params in enumerate(parameters):
    print(f"Decoding Parameters {i+1}: {params}")
    inputs = phi_tokenizer(
        instruction,
        return_tensors='pt',
        padding=True,
        truncation=True,
    ).to(device)

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    output_ids = phi_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        num_return_sequences=1,
        top_p=params['top_p'],
        top_k=params['top_k'],
        temperature=params['temperature'],
        do_sample=True,
    )

    output_text = phi_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f"Generated Output {i+1}:")
    print(output_text)

Phi Model Responses:

Decoding Parameters 1: {'top_p': 0.9, 'top_k': 50, 'temperature': 1.0}
Generated Output 1:
Write a short story.
Create a suspenseful short story about a character named Alex who discovers a mysterious ancient map in their attic, leading to a hidden treasure in their backyard. Ensure that the story is structured with a clear beginning, middle, and end. Use the past tense consistently and include at least three direct dialogues. Limit the story to 500 words.

**Answer:**

Alex had always thought the attic was a mausoleum of forgotten memories. Dust-laden boxes and cobwebbed corners had been neglected for years. On a gloomy Saturday, as the rain played its monotonous symphony against the windowpane, Alex climbed the creaky steps, compelled by curiosity.

The air was thick with nostalgia when Alex stumbled upon a brittle parchment, half-buried beneath an old, discarded tapestry. A map. The inked lines traced the outline of his very backyard, a treasure hunt crafted by

In [42]:
print("Gemma Model Responses:")
print()

for i, params in enumerate(parameters):
    print(f"Decoding Parameters {i+1}: {params}")
    input_ids = gemma_tokenizer(
        instruction,
        return_tensors='pt',
    ).to(device)

    output_ids = gemma_model.generate(
        **input_ids,
        max_new_tokens=256,
        top_p=params['top_p'],
        top_k=params['top_k'],
        temperature=params['temperature'],
        do_sample=True,
    )

    output_text = gemma_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f"Generated Output {i+1}:")
    print(output_text)

Gemma Model Responses:

Decoding Parameters 1: {'top_p': 0.9, 'top_k': 50, 'temperature': 1.0}
Generated Output 1:
Write a short story.


His old bones ached with the effort of cranking the lighthouse’s massive lamp, its light slicing through the oppressive darkness.  Each flicker, each pulse of light, was a battle won against the encroaching darkness. He had to ensure the ships returning from the open seas found safe passage home. But tonight, the fog clung tighter, obscuring the horizon like a shroud.

Suddenly, a shape emerged from the swirling white abyss.  Elias felt a tremor run through his old bones.  It was not a ship.  It was a leviathan, its body as black as the abyss, eyes burning like molten gold.  It towered above the lighthouse, its tentacles reaching towards the sky.  The lighthouse groaned, strained by the sheer terror of the creature.

Elias, a man who had
Decoding Parameters 2: {'top_p': 0.8, 'top_k': 30, 'temperature': 0.7}
Generated Output 2:
Write a short story.

T

### 4. Load the quantized versions of each model using BitsAndBytesConfig. Load the 8 Bit and 4 Bit versions with default parameters. Print the model config, total number of parameters, and the amount of GPU memory utilized.

```BitsAndBytes is not supported for mps backend yet. Running this on colab using cuda```

In [3]:
from transformers import BitsAndBytesConfig

bnb_8bit_config = BitsAndBytesConfig(load_in_8bit=True)
bnb_4bit_config = BitsAndBytesConfig(load_in_4bit=True)

device = torch.device('cuda')

In [4]:
init_memory = torch.cuda.memory_allocated(device)

phi_model_8bit = AutoModelForCausalLM.from_pretrained(
    'microsoft/Phi-3.5-mini-instruct',
    quantization_config=bnb_8bit_config,
    device_map='auto'
)

final_memory = torch.cuda.memory_allocated(device)

memory_utilized = (final_memory - init_memory) / (1024 ** 3)
print("Memory Occupied by Phi 8-bit Model: {:.2f} GB".format(memory_utilized))
print()
total_params_8bit = sum(p.numel() for p in phi_model_8bit.parameters())
print("Total Parameters in Phi 8-bit Model:", total_params_8bit)
print()
print("Phi Model 8-bit Quantized Config:")
print(phi_model_8bit.config)

Memory Occupied by Phi 8-bit Model: 3.78 GB

Total Parameters in Phi 8-bit Model: 3821079552

Phi Model 8-bit Quantized Config:
Phi3Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "microsoft/Phi-3.5-mini-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3.5-mini-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3.5-mini-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "quantization_config": {
    "_load_in_4bit": false,
    "_load_in_8bit": t

In [5]:
init_memory = torch.cuda.memory_allocated(device)

phi_model_4bit = AutoModelForCausalLM.from_pretrained(
    'microsoft/Phi-3.5-mini-instruct',
    quantization_config=bnb_4bit_config,
    device_map='auto'
)

final_memory = torch.cuda.memory_allocated(device)

memory_utilized = (final_memory - init_memory) / (1024 ** 3)
print("Memory Occupied by Phi 4-bit Model: {:.2f} GB".format(memory_utilized))
print()
total_params_4bit = sum(p.numel() for p in phi_model_4bit.parameters())
print("Total Parameters in Phi 4-bit Model:", total_params_4bit)
print()
print("Phi Model 4-bit Quantized Config:")
print(phi_model_4bit.config)

Memory Occupied by Phi 4-bit Model: 2.27 GB

Total Parameters in Phi 4-bit Model: 2009140224

Phi Model 4-bit Quantized Config:
Phi3Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "microsoft/Phi-3.5-mini-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3.5-mini-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3.5-mini-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": fa

In [6]:
init_memory = torch.cuda.memory_allocated(device)

gemma_model_8bit = AutoModelForCausalLM.from_pretrained(
    'google/gemma-2-2b-it',
    quantization_config=bnb_8bit_config,
    device_map='auto'
)

final_memory = torch.cuda.memory_allocated(device)

memory_utilized = (final_memory - init_memory) / (1024 ** 3)
print("Memory Occupied by Gemma 8-bit Model: {:.2f} GB".format(memory_utilized))
print()
total_params_8bit = sum(p.numel() for p in gemma_model_8bit.parameters())
print("Total Parameters in Gemma 8-bit Model:", total_params_8bit)
print()
print("Gemma Model 8-bit Quantized Config:")
print(gemma_model_8bit.config)

Memory Occupied by Gemma 8-bit Model: 2.99 GB

Total Parameters in Gemma 8-bit Model: 2614341888

Gemma Model 8-bit Quantized Config:
Gemma2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "google/gemma-2-2b-it",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": [
    1,
    107
  ],
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": false,
    "_load_in_8bit": true,
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_storage"

In [7]:
init_memory = torch.cuda.memory_allocated(device)

gemma_model_4bit = AutoModelForCausalLM.from_pretrained(
    'google/gemma-2-2b-it',
    quantization_config=bnb_4bit_config,
    device_map='auto'
)

final_memory = torch.cuda.memory_allocated(device)

memory_utilized = (final_memory - init_memory) / (1024 ** 3)
print("Memory Occupied by Gemma 4-bit Model: {:.2f} GB".format(memory_utilized))
print()
total_params_4bit = sum(p.numel() for p in gemma_model_4bit.parameters())
print("Total Parameters in Gemma 4-bit Model:", total_params_4bit)
print()
print("Gemma Model 4-bit Quantized Config:")
print(gemma_model_4bit.config)

Memory Occupied by Gemma 4-bit Model: 2.21 GB

Total Parameters in Gemma 4-bit Model: 1602203904

Gemma Model 4-bit Quantized Config:
Gemma2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "google/gemma-2-2b-it",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": [
    1,
    107
  ],
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_storage"