In [1]:
# --- Standard library ---
import math
import sys

# Make ./src importable BEFORE any project-local import is attempted, so the
# cell also works on a fresh kernel if the local modules live there.
sys.path.append('./src')

# --- Third-party ---
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from accelerate import Accelerator
import wandb

# --- Project-local ---
import qwen
from qwen import load_qwen
from lora_skeleton import process_sequences, LoRALinear
from preprocessor import load_and_preprocess

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [2]:
# Quick sanity check: generate a continuation for a plain-text prompt with the
# model exactly as returned by `load_qwen` (no LoRA wrapping applied yet).
model, tokenizer = qwen.load_qwen()

text = 'What is the capital of Italy?'
tokenised_text = process_sequences([text], tokenizer)
# print(tokenised_text)

# Call the tokenizer itself rather than `encode`: it returns the attention
# mask alongside the input ids. Passing the mask explicitly avoids the
# "attention mask is not set and cannot be inferred" warning that `generate`
# emits when the pad token equals the eos token.
encoded = tokenizer(text, return_tensors="pt")
inputs = encoded["input_ids"]

predicitons = model.generate(inputs, attention_mask=encoded["attention_mask"])

print(tokenizer.decode(predicitons[0], skip_special_tokens=True))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


What is the capital of Italy? The capital of Italy is Rome. It is the largest city in Europe and the second-largest city in


In [3]:
import numpy as np

# Reload model and tokenizer, then wrap the attention projections with LoRA.
model, tokenizer = qwen.load_qwen()
lora_rank = 4

# The LoRA wrappers on q_proj and v_proj are the parts meant to be trained.
for layer in model.model.layers:
    attn = layer.self_attn
    for proj_name in ("q_proj", "v_proj"):
        wrapped = LoRALinear(getattr(attn, proj_name), r=lora_rank)
        setattr(attn, proj_name, wrapped)

# Turn the raw data file into train/validation text sequences.
train_texts, val_texts, norm_factor = load_and_preprocess("lotka_volterra_data.h5")

# Report how many sequences each split holds, then preview the first one.
for split in (train_texts, val_texts):
    print(np.shape(split))
print(train_texts[0][:60])

(800,)
(200,)
0.28,0.30;0.21,0.23;0.20,0.16;0.21,0.12;0.24,0.09;0.29,0.07;


In [4]:
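# NOTE: superseded draft of the training loop (batch_size = 4, no wandb
# logging); the active version is the un-commented training cell further down.
# max_ctx_length = 512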
# train_input_ids = process_sequences(
#     train_texts, tokenizer, max_ctx_length, stride=max_ctx_length // 2
# )
# val_input_ids = process_sequences(
#     val_texts, tokenizer, max_ctx_length, stride=max_ctx_length
# )

# batch_size = 4
# learning_rate = 1e-5

# optimizer = torch.optim.Adam(
#     (p for p in model.parameters() if p.requires_grad), lr=learning_rate
# )
# train_dataset = TensorDataset(train_input_ids)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


# # Prepare components with Accelerator
# accelerator = Accelerator()
# model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)

# model.train()
# steps = 0
# while steps < 10000:
#     progress_bar = tqdm(train_loader, desc=f"Steps {steps}")
#     for (batch,) in progress_bar:
#         optimizer.zero_grad()
#         outputs = model(batch, labels=batch)
#         loss = outputs.loss
#         accelerator.backward(loss)
#         optimizer.step()
#         steps += 1

#         progress_bar.set_postfix(loss=loss.item())
#         if steps > 10000:
#             break

# model.eval()


In [5]:
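# NOTE: superseded draft of the wandb-instrumented training loop (placeholder
# project name, batch_size = 4); the active version is the next cell.
# import wandb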
# from torch.utils.data import DataLoader, TensorDataset
# from tqdm import tqdm
# import torch
# from accelerate import Accelerator


# batch_size = 4
# learning_rate = 1e-5
# max_ctx_length = 512


# # Initialize wandb project
# wandb.init(project="your_project_name", config={
#     "batch_size": batch_size,
#     "learning_rate": learning_rate,
#     "max_ctx_length": max_ctx_length
# })

# # Prepare the data
# train_input_ids = process_sequences(train_texts, tokenizer, max_ctx_length, stride=max_ctx_length // 2)
# val_input_ids = process_sequences(val_texts, tokenizer, max_ctx_length, stride=max_ctx_length)


# optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), lr=learning_rate)

# train_dataset = TensorDataset(train_input_ids)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# # Prepare components with Accelerator
# accelerator = Accelerator()
# model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)

# # Set the model to training mode
# model.train()

# steps = 0
# while steps < 10000:
#     progress_bar = tqdm(train_loader, desc=f"Steps {steps}")
#     for (batch,) in progress_bar:
#         optimizer.zero_grad()
        
#         # Forward pass
#         outputs = model(batch, labels=batch)
#         loss = outputs.loss
        
#         # Backward pass
#         accelerator.backward(loss)
        
#         # Update the model weights
#         optimizer.step()
        
#         # Log the loss to WandB
#         wandb.log({"loss": loss.item(), "steps": steps})
        
#         steps += 1
#         progress_bar.set_postfix(loss=loss.item())
        
#         if steps > 10000:
#             break

# # Evaluate the model
# model.eval()

# # Close the wandb run
# wandb.finish()


In [None]:
# ---------------------------------------------------------------------------
# LoRA fine-tuning loop.
# Tokenizes the train/validation texts into fixed-length windows, trains the
# parameters that have requires_grad=True with Adam, and logs the loss to
# wandb after every optimizer step.
# ---------------------------------------------------------------------------
batch_size = 2
learning_rate = 1e-5
max_ctx_length = 512  # maximum context length passed to process_sequences
max_steps = 10000     # total number of optimizer steps to run

# Let Accelerate pick the device and use that single choice everywhere, so
# the model, optimizer and batches can never end up on different devices.
accelerator = Accelerator()
device = accelerator.device
if device.type == "mps":
    # Only touch the MPS allocator when MPS is actually the active backend.
    torch.mps.empty_cache()
print('Device:', device)

# Training windows overlap by half a context; validation windows do not.
train_input_ids = process_sequences(
    train_texts, tokenizer, max_ctx_length, stride=max_ctx_length // 2
)
val_input_ids = process_sequences(
    val_texts, tokenizer, max_ctx_length, stride=max_ctx_length
)

optimizer = torch.optim.Adam(
    (p for p in model.parameters() if p.requires_grad), lr=learning_rate
)
train_dataset = TensorDataset(train_input_ids)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Prepare components with Accelerator. This moves the model to
# `accelerator.device` and makes the loader yield batches on that device,
# so no manual `.to(device)` calls are needed afterwards.
model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)

# Initialize wandb project (config mirrors the variables above).
wandb.init(project="LORA_initital", config={
    "batch_size": batch_size,
    "learning_rate": learning_rate,
    "max_ctx_length": max_ctx_length,
})

model.train()
steps = 0
try:
    while steps < max_steps:
        progress_bar = tqdm(train_loader, desc=f"Steps {steps}")
        for (batch,) in progress_bar:
            optimizer.zero_grad()
            # The input ids double as labels for the language-modelling loss.
            outputs = model(batch, labels=batch)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()

            # Read the scalar once; each .item() forces a device sync.
            loss_value = loss.item()
            wandb.log({"loss": loss_value, "steps": steps})
            steps += 1

            progress_bar.set_postfix(loss=loss_value)
            # `>=` stops after exactly max_steps updates (the previous `>`
            # check allowed one extra step).
            if steps >= max_steps:
                break
finally:
    # Always leave the model in eval mode and close the wandb run, even if
    # training is interrupted or raises part-way through.
    model.eval()
    wandb.finish()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Current

Device: mps


Steps 0:   0%|          | 0/1600 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Steps 0:  13%|█▎        | 215/1600 [06:45<39:50,  1.73s/it, loss=0.736]  

In [1]:
import torch

# Report where the model's parameters live.
# NOTE: `model` must already exist in this kernel (run the loading/training
# cells first); on a fresh kernel this cell raises NameError.
param_device = next(model.parameters()).device

# Treat both CUDA and Apple MPS as "GPU"; checking `is_cuda` alone would
# report an MPS-resident model as being on the CPU.
if param_device.type in ("cuda", "mps"):
    print(f"Model is on GPU ({param_device})")
else:
    print("Model is on CPU")


NameError: name 'model' is not defined

NameError: name 'original_linear' is not defined