In [1]:
%%capture
# Install unsloth from the package index, then replace it with a fresh build
# taken from the GitHub repository (second line uninstalls and reinstalls).
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
from unsloth import FastLanguageModel
import torch

# Fine-tuning parameters
# max_seq_length is passed to from_pretrained and is also reused as the
# max_new_tokens cap inside generate_outputs.
max_seq_length = 2048
# NOTE(review): None presumably lets the loader pick the dtype itself — confirm
# against the unsloth documentation.
dtype = None
# Use 4bit quantization to reduce memory usage.
load_in_4bit = True

# Dataset format
# Three positional slots, filled in this order: instruction, input, response.
# formatting_prompts_func fills all three; generate_outputs leaves the response empty.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: none of the visible cells reads fourbit_models.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer with the parameters defined above.
# Both names are used by the later cells (saving, prompt formatting, generation).
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [5]:
from google.colab import drive

# Mount Google Drive under /content/drive before writing to it.
drive.mount('/content/drive')

model_name = "LLAMA_base_model"

# Save the model and then the tokenizer to the same folder on your Drive.
for artifact in (base_model, tokenizer):
    artifact.save_pretrained(f"/content/drive/My Drive/Fine_tuned_model/{model_name}")

if False:
  # Disabled snippet: reload the saved model from your Drive instead of the hub.
  from unsloth import FastLanguageModel
  base_model, tokenizer = FastLanguageModel.from_pretrained(
      model_name = f"/content/drive/My Drive/Fine_tuned_model/{model_name}",
      max_seq_length = max_seq_length,
      dtype = dtype,
      load_in_4bit = load_in_4bit,
  )

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Initialize EOS token
# formatting_prompts_func appends it to every rendered training text.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of records into full Alpaca-style training texts.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        dict: {"text": [...]} with one string per record, built by filling
        alpaca_prompt and appending EOS_TOKEN.
    """
    columns = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [
        alpaca_prompt.format(task, context, answer) + EOS_TOKEN
        for task, context, answer in columns
    ]
    return {"text": rendered}

from datasets import load_dataset

# Load the instruction dataset and add the rendered "text" column in one chain.
dataset = load_dataset(
    "iamtarun/python_code_instructions_18k_alpaca", split = "train"
).map(formatting_prompts_func, batched = True)

# 80/10/10 split: hold out 20% first, then cut the held-out part in half.
train_testvalid = dataset.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

final_datasets = dict(
    train = train_testvalid['train'],
    validation = test_valid['train'],
    test = test_valid['test'],
)

README.md:   0%|          | 0.00/905 [00:00<?, ?B/s]

(…)-00000-of-00001-8b6e212f3e1ece96.parquet:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18612 [00:00<?, ? examples/s]

Map:   0%|          | 0/18612 [00:00<?, ? examples/s]

In [None]:
# Extract the reference answers ('output') and the instructions from the
# training split into plain lists.
# The generation cell later replaces final_datasets['train'] with a dict of
# rows, so both forms are accepted here: a dict is read through .values(),
# while a Dataset is iterated row by row. Calling .values() unconditionally
# fails on a fresh kernel, where the split is still a Dataset.
train_split = final_datasets['train']
train_rows = train_split.values() if isinstance(train_split, dict) else train_split
chosen_field_list = [item["output"] for item in train_rows]
prompts_field_list = [item["instruction"] for item in train_rows]

In [None]:
import numpy as np
from transformers import TextStreamer
import re
import json
from google.colab import files, drive

# Running counter used in checkpoint file names; download_current_data_bag
# advances it by download_step on every save.
download_time = 0
# Number of generated records between two checkpoint saves in generate_outputs.
download_step = 100

# Remount Google Drive so checkpoints can be written under /content/drive.
drive.mount('/content/drive', force_remount=True)

def download_current_data_bag(rejected_field_list, drive_dir='/content/drive/My Drive/DPO_FT_Data/'):
    """Write the prompt/chosen/rejected triples collected so far as a JSON checkpoint.

    The triples are built by zipping the module-level prompts_field_list and
    chosen_field_list with rejected_field_list, so the checkpoint holds as many
    records as the shortest of the three. The same JSON text is written twice:
    to the current working directory and to drive_dir.

    Args:
        rejected_field_list: Generated responses, in the same order as
            prompts_field_list / chosen_field_list.
        drive_dir: Folder for the second copy. It is created if missing, so the
            first save no longer fails when the folder does not exist yet.

    Side effects:
        Advances the global download_time by download_step; the new value is
        used in the file name Data_DPO_FT_<download_time>.json.
    """
    import os

    global download_time

    data = [{"prompt": prompt, "chosen": chosen, "rejected": rejected}
            for prompt, chosen, rejected in zip(prompts_field_list, chosen_field_list, rejected_field_list)]

    download_time += download_step
    filename = f'Data_DPO_FT_{download_time}.json'

    # Serialise once and reuse the text for both copies.
    payload = json.dumps(data, indent=4)

    with open(filename, 'w') as json_file:
        json_file.write(payload)

    # Save to Google Drive
    os.makedirs(drive_dir, exist_ok=True)
    with open(os.path.join(drive_dir, filename), 'w') as f:
        f.write(payload)

# Function to extract the response part
def extract_response(generated_text):
    """Return the text that follows the "### Response:" marker.

    Every "<|end_of_text|>" token is removed first. Everything after the first
    "### Response:\\n" is then returned with surrounding whitespace stripped.
    The capture group accepts an empty tail, so a generation that produced no
    text yields "" instead of falling through and returning the whole prompt.

    Args:
        generated_text: Decoded model output, prompt included.

    Returns:
        str: The stripped response, or the cleaned input text unchanged when
        the marker is absent.
    """
    cleaned_text = generated_text.replace("<|end_of_text|>", "")

    # `*` rather than `+`: an empty response is still a valid match.
    match = re.search(r"### Response:\n([\s\S]*)", cleaned_text)

    if match:
        return match.group(1).strip()
    return cleaned_text

# Debug parameter: set to 1/True to keep only the first 10 training records
debug = 0

# Materialise the training split as {row index: record}; generate_outputs
# iterates over it through .values().
row_limit = 10 if debug == True else len(final_datasets['train'])
custom_dict = {idx: final_datasets['train'][idx] for idx in range(row_limit)}
final_datasets['train'] = custom_dict

# Prepare base_model for generation through unsloth's for_inference helper.
FastLanguageModel.for_inference(base_model)

# Progress counters read and updated by generate_outputs.
current_output = 0
total_output = len(final_datasets["train"])

# Function to generate one model response per record of a dataset
def generate_outputs(dataset, model, tokenizer):
    """Generate a response for every record and checkpoint the results periodically.

    Args:
        dataset: Mapping whose values are records with 'instruction' and
            'input' keys; only dataset.values() is used.
        model: Object exposing generate(**inputs, max_new_tokens=...).
        tokenizer: Callable that encodes a list of prompts and exposes decode().

    Returns:
        list: The extracted response text for each record, in iteration order.

    Side effects:
        Increments the global current_output and prints a progress line per
        record. After every download_step records it calls
        download_current_data_bag with all responses collected so far.
    """
    global current_output
    outputs = []

    infer_count = 0

    for item in dataset.values():
        # Leave the response slot empty so the model completes it.
        prompt = alpaca_prompt.format(item['instruction'], item['input'], "")
        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

        # Generate output from the model
        generated_outputs = model.generate(**inputs, max_new_tokens=max_seq_length)
        generated_text = tokenizer.decode(generated_outputs[0], skip_special_tokens=False)

        # Keep only the part after the response marker.
        outputs.append(extract_response(generated_text))

        current_output += 1
        print(f"Progress: {current_output}/{total_output}")

        infer_count += 1

        if infer_count == download_step:
            download_current_data_bag(outputs)
            infer_count = 0

    # NOTE(review): records generated after the last full batch of
    # download_step are returned but never checkpointed.
    return outputs

# Generate base-model outputs for the training split (not the test split).
# download_current_data_bag stores them as the "rejected" answers in its checkpoints.
base_test_outputs = generate_outputs(final_datasets['train'], base_model, tokenizer)

Mounted at /content/drive
Progress: 1/14889
Progress: 2/14889
Progress: 3/14889
Progress: 4/14889
Progress: 5/14889
Progress: 6/14889
Progress: 7/14889
Progress: 8/14889
Progress: 9/14889
Progress: 10/14889
Progress: 11/14889
Progress: 12/14889
Progress: 13/14889
Progress: 14/14889
Progress: 15/14889
Progress: 16/14889
Progress: 17/14889
Progress: 18/14889
Progress: 19/14889
Progress: 20/14889
Progress: 21/14889
Progress: 22/14889
Progress: 23/14889
Progress: 24/14889
Progress: 25/14889
Progress: 26/14889
Progress: 27/14889
Progress: 28/14889
Progress: 29/14889
Progress: 30/14889
Progress: 31/14889
Progress: 32/14889
Progress: 33/14889
Progress: 34/14889
Progress: 35/14889
Progress: 36/14889
Progress: 37/14889
Progress: 38/14889
Progress: 39/14889
Progress: 40/14889
Progress: 41/14889
Progress: 42/14889
Progress: 43/14889
Progress: 44/14889
Progress: 45/14889
Progress: 46/14889
Progress: 47/14889
Progress: 48/14889
Progress: 49/14889
Progress: 50/14889
Progress: 51/14889
Progress: 52/1

In [None]:
# Install the libraries used below to load the generated JSON and push it to the Hugging Face Hub.
!pip install datasets huggingface_hub

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00

In [None]:
from google.colab import drive

# Remount Google Drive to read a checkpoint written by download_current_data_bag.
drive.mount('/content/drive', force_remount=True)

# Checkpoint to publish; the numeric suffix is the download_time counter at save time.
file_path = '/content/drive/My Drive/DPO_FT_Data/Data_DPO_FT_2000.json'

Mounted at /content/drive


In [None]:
from datasets import load_dataset

# Read the JSON checkpoint; the json loader exposes its records under 'train'.
dataset = load_dataset('json', data_files=file_path)

# Hold out 10% of the records for validation (fixed seed for a repeatable split).
train_testvalid = dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset, validation_dataset = train_testvalid['train'], train_testvalid['test']

# Report the split sizes: train first, then validation.
for split in (train_dataset, validation_dataset):
    print(len(split))

1800
200


In [None]:
# Log in to the Hugging Face Hub interactively before pushing the dataset.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import DatasetDict

# Bundle the two splits under their hub split names.
dataset_hub = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset
})
# Upload both splits to the named dataset repository on the Hugging Face Hub.
dataset_hub.push_to_hub('quangduc1112001/python-code-DPO-fine-tune')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/365 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/quangduc1112001/python-code-DPO-fine-tune/commit/3a87c004b4d9487c282dfeeec2dfeda682a1b7db', commit_message='Upload dataset', commit_description='', oid='3a87c004b4d9487c282dfeeec2dfeda682a1b7db', pr_url=None, pr_revision=None, pr_num=None)