In [None]:
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops gradio sentencepiece bitsandbytes

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb, platform, gradio, warnings
from datasets import load_dataset
from trl import SFTTrainer
from huggingface_hub import notebook_login

In [None]:
def print_system_specs():
    """Print a short report of the CUDA devices, CPU and Python runtime in use.

    Everything is written to stdout; the function returns ``None``.
    The per-device section is printed only when ``torch.cuda.is_available()``
    is true, while the availability flag and device count are always printed.
    """
    # CUDA availability and device count are reported even on CPU-only hosts.
    is_cuda_available = torch.cuda.is_available()
    print("CUDA Available:", is_cuda_available)
    num_cuda_devices = torch.cuda.device_count()
    print("Number of CUDA devices:", num_cuda_devices)
    if is_cuda_available:
        for i in range(num_cuda_devices):
            # One section per visible device, queried by index.
            print(f"--- CUDA Device {i} ---")
            print("Name:", torch.cuda.get_device_name(i))
            print("Compute Capability:", torch.cuda.get_device_capability(i))
            print("Total Memory:", torch.cuda.get_device_properties(i).total_memory, "bytes")
    # Host details come from the standard-library `platform` module.
    print("--- CPU Information ---")
    print("Processor:", platform.processor())
    print("System:", platform.system(), platform.release())
    print("Python Version:", platform.python_version())
# Print the hardware/runtime report once, before any model is loaded.
print_system_specs()

In [None]:
# Base checkpoint that is loaded and fine-tuned in the cells below.
model_name = "meta-llama/Llama-2-7b-hf"

# Dataset name
dataset_name = "AmanMussa/kazakh-instruction-v1"

# Identifier under which the fine-tuned adapter is saved and later reloaded.
# Use the `namespace/name` form, not a browser URL: `save_pretrained` treats its
# argument as a directory path, so a URL would create a stray "https:/..." folder tree.
new_model = "AmanMussa/llama2-kaz"

In [None]:
# Interactive Hugging Face Hub login; later cells push the model and tokenizer to the Hub.
# NOTE(review): the recorded run of this cell ended in an AttributeError and the following
# cell installs ipywidgets — presumably the login widget depends on it; confirm, and if so
# install it before this cell runs.
notebook_login()

AttributeError: ignored

In [None]:
!pip install -U ipywidgets

Defaulting to user installation because normal site-packages is not writeable
Collecting ipywidgets
  Downloading ipywidgets-8.1.1-py3-none-any.whl (139 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.4/139.4 KB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting jupyterlab-widgets~=3.0.9
  Downloading jupyterlab_widgets-3.0.9-py3-none-any.whl (214 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.9/214.9 KB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting widgetsnbextension~=4.0.9
  Downloading widgetsnbextension-4.0.9-py3-none-any.whl (2.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.1 jupyterlab-widgets-3.0.9 widgetsnbextension-4.0.9


In [None]:
# Load the training split of the instruction dataset named in the config cell.
dataset = load_dataset(dataset_name, split="train")
# Show the first example's "text" field. Indexing the row first avoids pulling the
# entire "text" column into memory just to display a single item.
dataset[0]["text"]

Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/102M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nДені сау болу үшін үш кеңес беріңіз.\n\n### Output:\n1. Баланстық диетаны және жемістер мен көкөністердің көп мөлшерін қосқаныңызға көз жеткізіңіз.\n2. Денеңізді белсенді және күшті ұстау үшін үнемі жаттығу жасаңыз.\n3. Ұйқыға жетіп, ұйқы бойынша тұрақты кесте жасаңыз.'

In [None]:
# Quantise the base weights to 4-bit NF4, computing in float16, without double quantisation.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.float16,
    bnb_4bit_use_double_quant= False,
)
# device_map={"": 0} places the whole model on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0}
)
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad with <unk>, NOT with EOS. The default language-modeling collator replaces every
# label equal to pad_token_id with -100; if pad == eos, the EOS appended to each example
# is masked out of the loss and the model never learns to stop generating.
tokenizer.pad_token = tokenizer.unk_token
# Append EOS to every training example so the end of a response is part of the target.
tokenizer.add_eos_token = True
# Display the BOS/EOS flags as a sanity check.
tokenizer.add_bos_token, tokenizer.add_eos_token

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

(True, True)

In [None]:
# LoRA adapter settings: rank-16 updates with alpha 8 and 10% dropout, applied to the
# attention projections plus the MLP gate/up projections; bias terms are left untouched.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=8,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
    ],
)

In [None]:
# Trainer hyper-parameters, grouped by concern.
training_arguments = TrainingArguments(
    # --- run length and batching (8 samples x 2 accumulation steps per optimizer step) ---
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # --- optimizer and learning-rate schedule ---
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="linear",
    warmup_ratio=0.3,
    # --- mixed precision (both disabled) ---
    fp16=False,
    bf16=False,
    # --- checkpointing and logging ---
    output_dir="./results",
    save_steps=1000,
    logging_steps=30,
    report_to="wandb",
)

In [None]:
# Assemble the supervised fine-tuning trainer from the pieces built above.
# Examples are read from the dataset's "text" field, one per sequence (no packing),
# and no explicit maximum sequence length is passed.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=None,
    packing=False,
)



Map:   0%|          | 0/52201 [00:00<?, ? examples/s]

In [None]:
# Colab-only: mount Google Drive at /content/drive so the checkpoint directory
# referenced by the training cell is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Continue training from the checkpoint kept on the mounted Drive.
# The path is absolute (matching the '/content/drive' mount point) so it does not depend
# on the kernel's working directory. If the checkpoint is missing, say so and train from
# scratch instead of failing, so the notebook still runs top to bottom on a fresh machine.
resume_checkpoint = "/content/drive/MyDrive/checkpoint-2000"
if not os.path.isdir(resume_checkpoint):
    print(f"Checkpoint {resume_checkpoint} not found; training from scratch.")
    resume_checkpoint = None
trainer.train(resume_from_checkpoint=resume_checkpoint)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
2010,1.1901
2040,1.0367
2070,1.0505
2100,0.9427
2130,1.142
2160,0.9632
2190,1.0577
2220,1.0353
2250,0.9312
2280,1.114


Step,Training Loss
2010,1.1901
2040,1.0367
2070,1.0505
2100,0.9427
2130,1.142
2160,0.9632
2190,1.0577
2220,1.0353
2250,0.9312
2280,1.114




TrainOutput(global_step=3263, training_loss=0.3828128472157793, metrics={'train_runtime': 13086.8735, 'train_samples_per_second': 3.989, 'train_steps_per_second': 0.249, 'total_flos': 5.1749473473500774e+17, 'train_loss': 0.3828128472157793, 'epoch': 1.0})

In [None]:
# Save the trained model held by the trainer under `new_model`, then close the W&B run.
trainer.model.save_pretrained(new_model)
wandb.finish()
# Switch to inference settings: re-enable the cache that was turned off for training
# and put the model in eval mode.
model.config.use_cache = True
# The trailing ';' stops the notebook from printing the full module tree returned by eval().
model.eval();

VBox(children=(Label(value='0.002 MB of 0.021 MB uploaded\r'), FloatProgress(value=0.10323052988011348, max=1.…

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁
train/loss,█▅▅▂▇▃▅▄▂▆▂▅▄▂▆▃▄▄▂▆▄▄▁▅▂▄▃▁▅▂▄▃▁▆▂▃▃▁▅▃
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,1.0
train/global_step,3263.0
train/learning_rate,0.0
train/loss,0.975
train/total_flos,5.174947347350077e+17
train/train_loss,0.38281
train/train_runtime,13086.8735
train/train_samples_per_second,3.989
train/train_steps_per_second,0.249


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
   

In [None]:
# Reload the base checkpoint in float16 on GPU 0, attach the adapter saved under
# `new_model`, and fold it into the base weights in a single step.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    return_dict=True,
    device_map={"": 0},
)
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Fresh tokenizer for the merged model: pad on the right, reusing EOS as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Publish the merged model and its tokenizer to the Hub under this repo id.
# NOTE(review): this rebinds `new_model`, which earlier cells use as the adapter
# save/load path — re-running those cells afterwards would pick up this value instead.
new_model="AmanMussa/llama2-kazakh-7b"
model.push_to_hub(new_model)
tokenizer.push_to_hub(new_model)

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AmanMussa/llama2-kazakh-7b/commit/869a0cdae4c8b9faa52e75e2910c85435ea2e2ae', commit_message='Upload tokenizer', commit_description='', oid='869a0cdae4c8b9faa52e75e2910c85435ea2e2ae', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
def stream(user_prompt, max_new_tokens=500):
    """Generate a reply to ``user_prompt`` and stream it to stdout as it is produced.

    The instruction is wrapped in the same template as the fine-tuning examples
    (preamble, ``### Instruction:`` section, ``### Output:`` marker) and passed to the
    notebook-level ``model`` and ``tokenizer``.

    Args:
        user_prompt: The instruction text; surrounding whitespace is stripped.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None. The text (prompt included) is printed by the ``TextStreamer``.
    """
    system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n'
    # The training examples introduce the answer with "### Output:", so the inference
    # prompt must use that exact marker rather than "### Response:".
    B_INST, E_INST = "### Instruction:\n", "### Output:\n"

    prompt = f"{system_prompt}{B_INST}{user_prompt.strip()}\n\n{E_INST}"

    # Move the inputs to whichever device the model lives on instead of assuming cuda:0.
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    streamer = TextStreamer(tokenizer)

    # generate() also returns the token ids, but the streamer has already printed the
    # text, so the return value is discarded.
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)

In [None]:
# Smoke-test the merged model with a short English instruction.
stream("capital city of Japan")

<s> Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
capital city of Japan

### Response:
Токио - Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы Қарағанды Ұлы
