In [1]:
import torch
import multiprocessing
import os

# Set multiprocessing start method for CUDA compatibility
multiprocessing.set_start_method("spawn", force=True)

# Set CUDA_VISIBLE_DEVICES for specific GPUs if needed
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # Use GPUs 0 and 1

def device_count():
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

device_count()

# Check GPU memory before starting
print("Initial GPU memory usage:")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i} memory allocated: {torch.cuda.memory_allocated(i) / 1e6} MB")
    print(f"GPU {i} memory reserved: {torch.cuda.memory_reserved(i) / 1e6} MB")

# Clear CUDA cache
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

Number of GPUs available: 2
GPU 0: NVIDIA GeForce RTX 4090
GPU 1: NVIDIA GeForce RTX 4090
Initial GPU memory usage:
GPU 0 memory allocated: 0.0 MB
GPU 0 memory reserved: 0.0 MB
GPU 1 memory allocated: 0.0 MB
GPU 1 memory reserved: 0.0 MB


In [2]:
from accelerate import PartialState

def get_device():
    device_str = PartialState().process_index
    print(f"Using device: {device_str}")
    return device_str

device_str = get_device()

Using device: 0


In [3]:
hf_token = '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'

In [4]:
#from huggingface_hub import notebook_login
#notebook_login()

In [5]:
import torch
from peft import LoraConfig
from transformers import BitsAndBytesConfig

lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

modelName = "google/gemma-2-2b"

tokenizer = AutoTokenizer.from_pretrained(modelName, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(modelName, 
                                             quantization_config=bnb_config, 
                                             device_map="auto",
                                             #device_map={"":device_str}, 
                                             token=hf_token)

#print(f"Device map for training: {model.device_map}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
print("Device map of model layers:")
for name, param in model.named_parameters():
    print(f"{name} is on {param.device}")


Device map of model layers:
model.embed_tokens.weight is on cuda:0
model.layers.0.self_attn.q_proj.weight is on cuda:0
model.layers.0.self_attn.k_proj.weight is on cuda:0
model.layers.0.self_attn.v_proj.weight is on cuda:0
model.layers.0.self_attn.o_proj.weight is on cuda:0
model.layers.0.mlp.gate_proj.weight is on cuda:0
model.layers.0.mlp.up_proj.weight is on cuda:0
model.layers.0.mlp.down_proj.weight is on cuda:0
model.layers.0.input_layernorm.weight is on cuda:0
model.layers.0.pre_feedforward_layernorm.weight is on cuda:0
model.layers.0.post_feedforward_layernorm.weight is on cuda:0
model.layers.0.post_attention_layernorm.weight is on cuda:0
model.layers.1.self_attn.q_proj.weight is on cuda:0
model.layers.1.self_attn.k_proj.weight is on cuda:0
model.layers.1.self_attn.v_proj.weight is on cuda:0
model.layers.1.self_attn.o_proj.weight is on cuda:0
model.layers.1.mlp.gate_proj.weight is on cuda:0
model.layers.1.mlp.up_proj.weight is on cuda:0
model.layers.1.mlp.down_proj.weight is on 

In [8]:
from datasets import load_dataset
dataset = load_dataset("myzens/alpaca-turkish-combined", split="train")
dataset, dataset[0]

(Dataset({
     features: ['input', 'output', 'instruction'],
     num_rows: 82353
 }),
 {'input': '',
  'output': "Fransa'nın başkenti Paris'tir.",
  'instruction': "Fransa'nın başkenti nedir?"})

In [9]:
gemma_prompt = """<start_of_turn>user
{}: {}<end_of_turn>
<start_of_turn>model
{}<end_of_turn>"""
gemma_prompt

'<start_of_turn>user\n{}: {}<end_of_turn>\n<start_of_turn>model\n{}<end_of_turn>'

In [10]:
eos_token = tokenizer.eos_token
pad_token = tokenizer.pad_token
tokenizer.padding_side = "right"

eos_token, pad_token

('<eos>', '<pad>')

In [11]:
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = gemma_prompt.format(instruction, input, output) + eos_token
        texts.append(text)
    return { "text" : texts, }
pass

In [12]:
dataset = dataset.map(formatting_prompts_func, batched = True)
dataset

Dataset({
    features: ['input', 'output', 'instruction', 'text'],
    num_rows: 82353
})

In [13]:
print(dataset["text"][2])

<start_of_turn>user
Tek farklı olanı belirleyin.: Twitter, Instagram, Telegram<end_of_turn>
<start_of_turn>model
Telegram<end_of_turn><eos>


In [14]:
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=1024,
        return_tensors="pt"
    )
    # Labels are identical to input_ids for causal language modeling
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

print("Tokenizing dataset...")
dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
print("Dataset tokenized:", dataset[0])

Tokenizing dataset...
Dataset tokenized: {'input': '', 'output': "Fransa'nın başkenti Paris'tir.", 'instruction': "Fransa'nın başkenti nedir?", 'input_ids': [2, 106, 1645, 108, 21727, 29541, 235303, 68749, 20074, 235273, 1077, 91278, 7846, 235248, 107, 108, 106, 2516, 108, 21727, 29541, 235303, 68749, 20074, 235273, 1077, 7127, 235303, 6651, 235265, 107, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [15]:
from transformers import TrainingArguments

train_args = TrainingArguments(
    per_device_train_batch_size=4,  # Lowered batch size for 2B model in Jupyter
    gradient_accumulation_steps=4,  # Higher accumulation to simulate larger batch size
    warmup_steps=30,
    max_steps=500,
    #num_train_epochs=3,
    gradient_checkpointing=True,
    learning_rate=1e-4,
    fp16=True,
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    output_dir="outputs",    
    report_to="none",
    ddp_find_unused_parameters=False,  # For DDP compatibility
)

In [16]:
from transformers import DataCollatorForSeq2Seq
from trl import SFTTrainer

# Define a data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest",
    return_tensors="pt"
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=train_args,
    peft_config=lora_config,
    train_dataset=dataset,
    data_collator=data_collator,
    max_seq_length=1024,  # Adjusted for efficiency
    packing=False
)

trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


[2024-12-15 16:25:11,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
max_steps is given, it will override any value given in num_train_epochs


  0%|          | 0/500 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


{'loss': 14.5238, 'grad_norm': 20.342395782470703, 'learning_rate': 2.3333333333333336e-05, 'epoch': 0.0}
{'loss': 4.268, 'grad_norm': 7.5723795890808105, 'learning_rate': 5.666666666666667e-05, 'epoch': 0.0}
{'loss': 3.0381, 'grad_norm': 5.141354560852051, 'learning_rate': 9e-05, 'epoch': 0.01}
{'loss': 1.5589, 'grad_norm': 27.26141357421875, 'learning_rate': 9.851063829787235e-05, 'epoch': 0.01}
{'loss': 1.7459, 'grad_norm': 17.041549682617188, 'learning_rate': 9.638297872340426e-05, 'epoch': 0.01}
{'loss': 0.7791, 'grad_norm': 5.4085283279418945, 'learning_rate': 9.425531914893617e-05, 'epoch': 0.01}
{'loss': 0.6331, 'grad_norm': 3.4994218349456787, 'learning_rate': 9.212765957446809e-05, 'epoch': 0.01}
{'loss': 0.5889, 'grad_norm': 1.4355807304382324, 'learning_rate': 9e-05, 'epoch': 0.02}
{'loss': 0.5901, 'grad_norm': 10.499778747558594, 'learning_rate': 8.787234042553192e-05, 'epoch': 0.02}
{'loss': 0.5388, 'grad_norm': 5.884941101074219, 'learning_rate': 8.574468085106383e-05, '

TrainOutput(global_step=500, training_loss=0.9622677526473999, metrics={'train_runtime': 2815.9801, 'train_samples_per_second': 2.841, 'train_steps_per_second': 0.178, 'total_flos': 1.01550554873856e+17, 'train_loss': 0.9622677526473999, 'epoch': 0.09713924911360435})

In [18]:
trainer.save_model("gemma-2-2b-tr")

In [20]:
device = "cuda:0"

In [27]:
questions = [
    "<start_of_turn>user Sorunun cevabını doğru şekilde açıklar mısın?: Bir elmanın yarısı kaç eder?<end_of_turn><start_of_turn>model ",
    "<start_of_turn>user Bir metni İngilizceye çevir: Bugün hava çok güzel.<end_of_turn><start_of_turn>model ",
    "<start_of_turn>user Kendini tanıt ve ardından bana Türkçe öğrenmek için önerilerde bulun.<end_of_turn><start_of_turn>model ",
]

for question in questions:
    # Tokenize the question
    inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True).to(device)
    
    # Convert only floating-point tensors (like attention_mask) to FP16
    inputs = {key: value.half() if value.dtype == torch.float else value for key, value in inputs.items()}
    
    # Generate response
    outputs = model.generate(**inputs, max_new_tokens=64)
    
    # Decode and print the output
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))


  return fn(*args, **kwargs)


RuntimeError: Index put requires the source and destination dtypes match, got Half for the destination and Float for the source.