In [4]:
pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


model llm architecture

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Qwen2Embedding(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(Qwen2Embedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, input_ids):
        return self.embedding(input_ids)

class Qwen2SdpaAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(Qwen2SdpaAttention, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads)

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        return attn_output

class Qwen2MLP(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super(Qwen2MLP, self).__init__()
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)

    def forward(self, x):
        x = F.gelu(self.fc1(x))
        x = self.fc2(x)
        return x

class Qwen2RMSNorm(nn.Module):
    def __init__(self, embed_dim):
        super(Qwen2RMSNorm, self).__init__()
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        return self.norm(x)

class Qwen2DecoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, mlp_hidden_dim):
        super(Qwen2DecoderLayer, self).__init__()
        self.attention = Qwen2SdpaAttention(embed_dim, num_heads)
        self.mlp = Qwen2MLP(embed_dim, mlp_hidden_dim)
        self.norm1 = Qwen2RMSNorm(embed_dim)
        self.norm2 = Qwen2RMSNorm(embed_dim)

    def forward(self, x):
        x = x + self.attention(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x

class Qwen2Model(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, mlp_hidden_dim, num_layers):
        super(Qwen2Model, self).__init__()
        self.embedding = Qwen2Embedding(vocab_size, embed_dim)
        self.layers = nn.ModuleList([Qwen2DecoderLayer(embed_dim, num_heads, mlp_hidden_dim) for _ in range(num_layers)])
        self.norm = Qwen2RMSNorm(embed_dim)

    def forward(self, input_ids):
        x = self.embedding(input_ids)
        for layer in self.layers:
            x = layer(x)
        x = self.norm(x)
        return x

class Qwen2ForCausalLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, mlp_hidden_dim, num_layers):
        super(Qwen2ForCausalLM, self).__init__()
        self.model = Qwen2Model(vocab_size, embed_dim, num_heads, mlp_hidden_dim, num_layers)
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, input_ids):
        x = self.model(input_ids)
        logits = self.lm_head(x)
        return logits

# Hyperparameters (these should match those in the provided summary)
vocab_size = 50257
embed_dim = 768
num_heads = 12
mlp_hidden_dim = 3072
num_layers = 28

# Instantiate the model
model = Qwen2ForCausalLM(vocab_size, embed_dim, num_heads, mlp_hidden_dim, num_layers)




prep dataset

In [18]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

# Define a simple custom dataset
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, vocab):
        self.texts = texts
        self.tokenizer = tokenizer
        self.vocab = vocab

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        token_ids = [self.vocab[token] for token in tokens]
        return torch.tensor(token_ids, dtype=torch.long)

# Sample texts for training
texts = [
    "Hello, how are you?",
    "I am a language model.",
    "This is a simple example.",
    "hello haris hota.",
    "haris hota lives in germany.",
    "PyTorch makes it easy to build models."
]

tokenizer = get_tokenizer("basic_english")
vocab = build_vocab_from_iterator(map(tokenizer, texts), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])

# Create the dataset
dataset = TextDataset(texts, tokenizer, vocab)


In [19]:
def collate_batch(batch):
    batch = [item for item in batch if len(item) > 0]
    batch = pad_sequence(batch, padding_value=vocab["<pad>"], batch_first=True)
    return batch, batch

batch_size = 2
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)


In [20]:
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"])
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop
num_epochs = 10
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        input_ids, target_ids = batch
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids)

        # Shift targets to align with outputs
        shift_logits = outputs[..., :-1, :].contiguous()
        shift_labels = target_ids[..., 1:].contiguous()

        # Compute loss
        loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")


Epoch 1, Loss: 10.9488
Epoch 2, Loss: 5.4977
Epoch 3, Loss: 3.9952
Epoch 4, Loss: 3.4679
Epoch 5, Loss: 2.3098
Epoch 6, Loss: 1.6067
Epoch 7, Loss: 1.1605
Epoch 8, Loss: 0.8305
Epoch 9, Loss: 0.6417
Epoch 10, Loss: 0.4773


generate response

In [21]:
import torch.nn.functional as F

def greedy_decode(model, input_ids, max_length, vocab):
    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            next_token_logits = outputs[:, -1, :]
            next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)
            input_ids = torch.cat([input_ids, next_token_id], dim=-1)
            if next_token_id == vocab["<pad>"]:
                break
    return input_ids


In [22]:
def decode_tokens(token_ids, vocab):
    reverse_vocab = {v: k for k, v in vocab.get_stoi().items()}
    return ' '.join([reverse_vocab[token_id.item()] for token_id in token_ids if token_id.item() != vocab["<pad>"]])


In [23]:
# Example input text
input_text = "Hello, how are you?"

# Tokenize and convert to token IDs
input_tokens = tokenizer(input_text)
input_ids = torch.tensor([vocab[token] for token in input_tokens], dtype=torch.long).unsqueeze(0)

# Generate response
max_length = 50  # Adjust as needed
generated_ids = greedy_decode(model, input_ids, max_length, vocab)

# Decode generated token IDs back to text
response = decode_tokens(generated_ids[0], vocab)
print("Response:", response)


Response: hello , how are you ? it easy to build models . you ? it easy to build models . you ? it easy to build models . you ? it easy to build models . you ? it easy to build models . you ? it easy to build models . you ? it easy


In [24]:
# Example input text
input_text = "haris hota lives"

# Tokenize and convert to token IDs
input_tokens = tokenizer(input_text)
input_ids = torch.tensor([vocab[token] for token in input_tokens], dtype=torch.long).unsqueeze(0)

# Generate response
max_length = 50  # Adjust as needed
generated_ids = greedy_decode(model, input_ids, max_length, vocab)

# Decode generated token IDs back to text
response = decode_tokens(generated_ids[0], vocab)
print("Response:", response)


Response: haris hota lives in germany . you ? it easy to build models . you ? it easy to build models . you ? it easy to build models . you ? it easy to build models . you ? it easy to build models . you ? it easy to build models


load qen model to get summary

In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-1.5B-Instruct")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [1]:
pip install -i https://pypi.org/simple/ bitsandbytes

Looking in indexes: https://pypi.org/simple/


In [2]:
pip install accelerate



In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("harry85/finetuned-TinyLLAMA-own-data-07")
model = AutoModelForCausalLM.from_pretrained("harry85/finetuned-TinyLLAMA-own-data-07")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Unused kwargs: ['quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/101M [00:00<?, ?B/s]

Some weights of the model checkpoint at harry85/finetuned-TinyLLAMA-own-data-07 were not used when initializing LlamaForCausalLM: ['base_model.model.model.layers.0.mlp.down_proj.lora_A.weight', 'base_model.model.model.layers.0.mlp.down_proj.lora_B.weight', 'base_model.model.model.layers.0.mlp.gate_proj.lora_A.weight', 'base_model.model.model.layers.0.mlp.gate_proj.lora_B.weight', 'base_model.model.model.layers.0.mlp.up_proj.lora_A.weight', 'base_model.model.model.layers.0.mlp.up_proj.lora_B.weight', 'base_model.model.model.layers.0.self_attn.k_proj.lora_A.weight', 'base_model.model.model.layers.0.self_attn.k_proj.lora_B.weight', 'base_model.model.model.layers.0.self_attn.o_proj.lora_A.weight', 'base_model.model.model.layers.0.self_attn.o_proj.lora_B.weight', 'base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight', 'base_model.model.model.layers.0.self_attn.q_proj.lora_B.weight', 'base_model.model.model.layers.0.self_attn.v_proj.lora_A.weight', 'base_model.model.model.layers.0.

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

In [5]:
# Print the model summary layer by layer
from torchinfo import summary
print(summary(model, depth=2))

Layer (type:depth-idx)                                  Param #
Qwen2ForCausalLM                                        --
├─Qwen2Model: 1-1                                       --
│    └─Embedding: 2-1                                   233,373,696
│    └─ModuleList: 2-2                                  1,310,339,072
│    └─Qwen2RMSNorm: 2-3                                1,536
├─Linear: 1-2                                           233,373,696
Total params: 1,777,088,000
Trainable params: 1,777,088,000
Non-trainable params: 0


In [6]:
print(summary(model, depth=3))


Layer (type:depth-idx)                                  Param #
Qwen2ForCausalLM                                        --
├─Qwen2Model: 1-1                                       --
│    └─Embedding: 2-1                                   233,373,696
│    └─ModuleList: 2-2                                  --
│    │    └─Qwen2DecoderLayer: 3-1                      46,797,824
│    │    └─Qwen2DecoderLayer: 3-2                      46,797,824
│    │    └─Qwen2DecoderLayer: 3-3                      46,797,824
│    │    └─Qwen2DecoderLayer: 3-4                      46,797,824
│    │    └─Qwen2DecoderLayer: 3-5                      46,797,824
│    │    └─Qwen2DecoderLayer: 3-6                      46,797,824
│    │    └─Qwen2DecoderLayer: 3-7                      46,797,824
│    │    └─Qwen2DecoderLayer: 3-8                      46,797,824
│    │    └─Qwen2DecoderLayer: 3-9                      46,797,824
│    │    └─Qwen2DecoderLayer: 3-10                     46,797,824
│    │    └─Qwen2Deco

In [7]:
print(summary(model, depth=4))

Layer (type:depth-idx)                                  Param #
Qwen2ForCausalLM                                        --
├─Qwen2Model: 1-1                                       --
│    └─Embedding: 2-1                                   233,373,696
│    └─ModuleList: 2-2                                  --
│    │    └─Qwen2DecoderLayer: 3-1                      --
│    │    │    └─Qwen2SdpaAttention: 4-1                5,507,072
│    │    │    └─Qwen2MLP: 4-2                          41,287,680
│    │    │    └─Qwen2RMSNorm: 4-3                      1,536
│    │    │    └─Qwen2RMSNorm: 4-4                      1,536
│    │    └─Qwen2DecoderLayer: 3-2                      --
│    │    │    └─Qwen2SdpaAttention: 4-5                5,507,072
│    │    │    └─Qwen2MLP: 4-6                          41,287,680
│    │    │    └─Qwen2RMSNorm: 4-7                      1,536
│    │    │    └─Qwen2RMSNorm: 4-8                      1,536
│    │    └─Qwen2DecoderLayer: 3-3                      --


In [8]:
print(summary(model, depth=4))

Layer (type:depth-idx)                                  Param #
Qwen2ForCausalLM                                        --
├─Qwen2Model: 1-1                                       --
│    └─Embedding: 2-1                                   233,373,696
│    └─ModuleList: 2-2                                  --
│    │    └─Qwen2DecoderLayer: 3-1                      --
│    │    │    └─Qwen2SdpaAttention: 4-1                5,507,072
│    │    │    └─Qwen2MLP: 4-2                          41,287,680
│    │    │    └─Qwen2RMSNorm: 4-3                      1,536
│    │    │    └─Qwen2RMSNorm: 4-4                      1,536
│    │    └─Qwen2DecoderLayer: 3-2                      --
│    │    │    └─Qwen2SdpaAttention: 4-5                5,507,072
│    │    │    └─Qwen2MLP: 4-6                          41,287,680
│    │    │    └─Qwen2RMSNorm: 4-7                      1,536
│    │    │    └─Qwen2RMSNorm: 4-8                      1,536
│    │    └─Qwen2DecoderLayer: 3-3                      --
