In [1]:
# Install the fine-tuning dependencies listed in fine_tune_llama_requirements.txt.
# NOTE(review): the install log below shows accelerate and transformers being built
# from the GitHub default branch, so re-running this cell later may install different
# code than the run recorded here -- consider pinning commits/versions.
!pip install -r fine_tune_llama_requirements.txt

Collecting accelerate@ git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to /tmp/pip-install-7y7iierk/accelerate_a7e618f06e004e54bd553755e5cf0a81
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git /tmp/pip-install-7y7iierk/accelerate_a7e618f06e004e54bd553755e5cf0a81
  Resolved https://github.com/huggingface/accelerate.git to commit 31fd2b1ad6b9c1cd1480568399a311b3caaf62dc
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting transformers@ git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-install-7y7iierk/transformers_3a4afcf1e7c549aca93b9f9ee16df646
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-install-7y7iierk/transform

In [2]:
import argparse
import bitsandbytes as bnb
import torch
import os
import yaml
import json
import csv
import pandas as pd

from datasets import Dataset
from functools import partial
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.utils.data import Dataset as PyTorchDataset
from random import randint
from tqdm import tqdm

# Notebook configuration: pick the model entry to fine-tune from config.yaml.
config_file = 'config.yaml'
selected_model = 'Llama-2-7b-hf'
with open(config_file, 'r') as file:
    config = yaml.safe_load(file)
model_config = config['models'].get(selected_model)

if not model_config:
    # Fail fast: every later cell needs model_name / source_directory /
    # destination_directory, so continuing would only surface as a confusing
    # NameError further down the notebook.
    raise KeyError(f"Model configuration for {selected_model} not found.")

# Hub identifier used as a fallback when the local checkpoint cannot be loaded.
model_name = model_config['model_name']
# Local checkpoint that fine-tuning starts from.
source_directory = model_config['source_directory']
# Where the merged, fine-tuned model is written at the end of the notebook.
destination_directory = model_config['destination_directory']
print(f"Model configuration for {selected_model} loaded successfully.")
print(f"model_name={model_name}")
print(f"source_directory={source_directory}")
print(f"destination_directory={destination_directory}")

Model configuration for Llama-2-7b-hf-c2t-2-004 loaded successfully.
model_name=
source_directory=/notebooks/models/fine-tuned/Llama-2-7b-hf-c2t-2-004
destination_directory=/notebooks/models/fine-tuned/Llama-2-7b-hf-c2t-2-005


In [3]:
def load_model(bnb_config):
    """Load the quantized causal language model and its tokenizer.

    The local checkpoint in ``source_directory`` is tried first. If that raises
    ``EnvironmentError`` the model and tokenizer are loaded from ``model_name``
    instead, authenticating with the token found in the ``HF_TOKEN``
    environment variable.

    :param bnb_config: quantization settings forwarded as ``quantization_config``.
    :return: ``(model, tokenizer)`` tuple.
    """
    n_gpus = torch.cuda.device_count()
    # Same memory budget for every visible GPU.
    max_memory = {i: f'{81920}MB' for i in range(n_gpus)}

    # Secrets must not live in the notebook source; read the access token from
    # the environment (None lets the library fall back to its own credentials).
    hf_token = os.environ.get("HF_TOKEN")

    def _load_from(location, **auth_kwargs):
        """Load model + tokenizer from ``location`` with the shared settings."""
        loaded_model = AutoModelForCausalLM.from_pretrained(
            location,
            quantization_config=bnb_config,
            device_map="auto",  # dispatch the model efficiently on the available resources
            max_memory=max_memory,
            **auth_kwargs,
        )
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            location,
            use_fast=False,
            add_eos_token=True,
            **auth_kwargs,
        )
        return loaded_model, loaded_tokenizer

    try:
        model, tokenizer = _load_from(source_directory)
        print("Local model loaded successfully")
    except EnvironmentError:
        model, tokenizer = _load_from(model_name, token=hf_token)
        print("Model loaded successfully")

    # NOTE(review): 18610 is a hard-coded vocabulary id used as the padding token
    # (instead of reusing the EOS token); confirm it is the intended token for
    # this tokenizer before changing the base model.
    tokenizer.pad_token_id = 18610

    return model, tokenizer

In [4]:
class ChartToTextDataset(PyTorchDataset):
    """Chart-to-text samples stored as parallel files under ``data_dir``.

    For every file in ``<data_dir>/data`` the matching title and caption are
    read from ``<data_dir>/titles`` and ``<data_dir>/captions`` (same file name
    with ``.csv`` replaced by ``.txt``).

    :param data_dir: root directory containing ``data``, ``titles`` and ``captions``.
    :param max_length: stored on the instance; not used by this class itself.
    """

    def __init__(self, data_dir, max_length=512):
        self.data_dir = data_dir
        self.max_length = max_length
        # os.listdir() returns entries in arbitrary order; sort so that the
        # index -> sample mapping is identical across runs and machines.
        self.data_files = sorted(os.listdir(os.path.join(data_dir, "data")))

    def __len__(self):
        return len(self.data_files)

    def __getitem__(self, idx):
        """Return ``{'input': title + table text, 'output': caption}`` for sample ``idx``."""
        file_name = self.data_files[idx]
        text_name = file_name.replace(".csv", ".txt")
        data_file = os.path.join(self.data_dir, "data", file_name)
        title_file = os.path.join(self.data_dir, "titles", text_name)
        caption_file = os.path.join(self.data_dir, "captions", text_name)

        # Flatten the table: cells joined by ", ", rows joined by "; ".
        # newline='' is what the csv module requires so quoted fields containing
        # line breaks are parsed correctly.
        with open(data_file, 'r', newline='') as f:
            rows = [", ".join(row) for row in csv.reader(f)]
        data = "; ".join(rows)

        with open(title_file, 'r') as f:
            title = f.read().strip()

        with open(caption_file, 'r') as f:
            caption = f.read().strip()

        input_text = title + "\n" + data

        return {
            'input': input_text,
            'output': caption
        }
    
class ChartSummDataset(PyTorchDataset):
    """Chart summarisation samples read from a single JSON file.

    Each record is expected to provide ``x_label``, ``y_label`` (an iterable of
    series names), ``data`` (label -> list of values), ``title`` and ``summary``.

    :param json_file: path of the JSON file holding the list of records.
    :param max_length: stored on the instance; not used by this class itself.
    """

    def __init__(self, json_file, max_length=512):
        self.json_file = json_file
        self.max_length = max_length
        self.data = self._load_data()

    def _load_data(self):
        """Parse ``self.json_file`` and return its decoded content."""
        with open(self.json_file, 'r') as handle:
            records = json.load(handle)
        return records

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{'input': title + one line per series, 'output': summary}``."""
        record = self.data[idx]
        x_name = record["x_label"]
        y_names = record["y_label"]
        series = record["data"]
        heading = record["title"]
        target = record["summary"]

        def _series_line(label):
            # "<label>: v1, v2, ...\n"
            values = ', '.join(map(str, series[label]))
            return f"{label}: {values}\n"

        # x axis first, then every y series in the order given by the record
        table_text = _series_line(x_name)
        table_text += "".join(_series_line(label) for label in y_names)

        return {
            'input': f"{heading}\n{table_text}",
            'output': target
        }

class OWIDDataset(PyTorchDataset):
    """Chart samples described by an index CSV plus one data CSV per chart.

    The index file has one row per chart with the columns
    ``type, topic, summary, title, subtitle, file_name``; the chart's table is
    read from ``<index dir>/csv/<file_name>.csv``.

    The class derives from the PyTorch ``Dataset`` like the other sample
    datasets in this notebook (the bare name ``Dataset`` refers to the Hugging
    Face ``datasets.Dataset``, which is not meant to be subclassed this way).

    :param info_file_path: path of the index CSV.
    :param max_length: stored on the instance; not used by this class itself.
    """

    def __init__(self, info_file_path, max_length=512):
        self.max_length = max_length
        self.info_file_path = info_file_path
        # Data CSVs live in a "csv" folder next to the index file.
        self.base_dir = os.path.dirname(info_file_path)
        self.items = self._load_info()

    def _load_info(self):
        """Read the index CSV and return one dict per chart.

        :raises ValueError: if a non-empty row does not have exactly six columns.
        """
        items = []
        with open(self.info_file_path, 'r', newline='') as file:
            csv_reader = csv.reader(file)
            # Skip the header row; the default keeps an empty file from raising
            # StopIteration.
            next(csv_reader, None)
            for row in csv_reader:
                if not row:
                    # csv.reader yields [] for blank lines (e.g. trailing newline)
                    continue
                item_type, topic, summary, title, subtitle, file_name = row
                items.append({
                    'type': item_type,
                    'topic': topic,
                    'summary': summary,
                    'title': title,
                    'subtitle': subtitle,
                    'file_name': file_name
                })
        return items

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        """Return ``{'input': header lines + flattened table, 'output': summary}``."""
        item = self.items[idx]
        data_file = os.path.join(self.base_dir, "csv", item['file_name'] + ".csv")

        # Flatten the table: cells joined by ", ", rows joined by "; ".
        with open(data_file, 'r', newline='') as file:
            rows = [", ".join(row) for row in csv.reader(file)]
        data = "; ".join(rows)

        title = f"Title: {item['title']}"
        subtitle = f"Subtitle: {item['subtitle']}"
        type_topic = f"Type: {item['type']}, Topic: {item['topic']}"
        summary = item['summary']

        # Input text layout: title, subtitle, type/topic, then the data table
        input_text = f"{title}\n{subtitle}\n{type_topic}\n{data}"

        return {
            'input': input_text,
            'output': summary
        }


class CombinedDataset(PyTorchDataset):
    """Concatenation of several map-style datasets behind a single index space.

    Index ``i`` addresses the datasets back to back in the order given.
    Negative indices count from the end of the combined sequence.

    The class derives from the PyTorch ``Dataset`` like the sample datasets it
    wraps (the bare name ``Dataset`` in this notebook is the Hugging Face
    ``datasets.Dataset``).

    :param datasets: iterable of objects supporting ``len()`` and ``[]``.
    """

    def __init__(self, datasets):
        # Materialize once so a generator argument is not exhausted by len().
        self.datasets = list(datasets)
        # Lengths are captured at construction time.
        self.lengths = [len(dataset) for dataset in self.datasets]

    def __len__(self):
        return sum(self.lengths)

    def __getitem__(self, idx):
        """Return item ``idx`` of the concatenation.

        :raises IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        if idx < 0:
            # Without this, any negative index passed the `idx < length` test
            # below and was silently served from the first dataset.
            idx += total
        if not 0 <= idx < total:
            raise IndexError("Index out of range in CombinedDataset")

        for dataset, length in zip(self.datasets, self.lengths):
            if idx < length:
                return dataset[idx]
            idx -= length
        raise IndexError("Index out of range in CombinedDataset")

In [5]:
def load_dataset(combined_dataset, start=0, end=None):
    """Materialize a slice of a map-style dataset as a ``datasets.Dataset``.

    :param combined_dataset: object supporting ``len()`` and integer indexing.
    :param start: first index to read (inclusive).
    :param end: index to stop at (exclusive); ``None`` means the full length.
    :return: dataset built from the collected samples via a pandas DataFrame.
    """
    if end is None:
        end = len(combined_dataset)

    # Pull every sample eagerly, with a progress bar.
    records = [
        combined_dataset[index]
        for index in tqdm(range(start, end), desc="Loading dataset")
    ]

    dataset = Dataset.from_pandas(pd.DataFrame(records))

    print(f'Number of prompts: {len(dataset)}')
    print(f'Column names are: {dataset.column_names}')

    return dataset

In [6]:
# def create_prompt_formats(sample):
#     blurb = "Below is the full content of a chart. Write a appropriate summary that reflects the meaning and trend of the chart."
#     input_context = f"### Chart content: {sample['input']}"
#     response = f"### Chart summary: {sample['output']}"
#     end = "### End"
    
#     parts = [part for part in [blurb, input_context, response, end] if part]

#     formatted_prompt = "\n\n".join(parts)
    
#     sample["text"] = formatted_prompt

#     return sample

# def create_prompt_formats(sample):
#     INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
#     INSTRUCTION_KEY = "### Instruction:"
#     INPUT_KEY = "### Input:"
#     RESPONSE_KEY = "### Response:"
#     END_KEY = "### End"
    
#     blurb = f"{INTRO_BLURB}"
#     instruction = f"{INSTRUCTION_KEY}\nFrom the input full content of a chart, write a summary that reflects the meaning and trend of the chart."
#     input_context = f"{INPUT_KEY}\n{sample['input']}"
#     response = f"{RESPONSE_KEY}\n{sample['output']}"
#     end = f"{END_KEY}"
    
#     parts = [part for part in [blurb, instruction, input_context, response, end] if part]

#     formatted_prompt = "\n\n".join(parts)
    
#     sample["text"] = formatted_prompt

#     return sample

def create_prompt_formats(sample):
    """Add the ``[INST]``-style training prompt to ``sample`` under the key ``text``.

    The sample is modified in place and also returned.

    :param sample: mapping with the keys ``input`` (chart content) and ``output`` (summary).
    :return: the same ``sample`` object, now with a ``text`` entry.
    """
    instruction = (
        "From the below input full content of a chart, "
        "write a summary that reflects the meaning and trend of the chart."
    )
    # NOTE(review): the four spaces before "Chart content:" are part of the
    # emitted prompt; keep them so the text stays identical to what earlier
    # runs produced.
    sample["text"] = (
        f"<s>[INST] {instruction}\n"
        f"    Chart content: {sample['input']} [/INST] {sample['output']}"
    )
    return sample

In [7]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """Return the maximum sequence length advertised by ``model.config``.

    The attributes ``n_positions``, ``max_position_embeddings`` and
    ``seq_length`` are checked in that order; the first truthy value wins.
    When none is set, 1024 is used.

    :param model: object exposing a ``config`` attribute.
    :return: the detected length, or the 1024 default.
    """
    model_config = model.config
    candidate_attrs = ("n_positions", "max_position_embeddings", "seq_length")

    # Lazily probe the attributes and stop at the first truthy one.
    detected = next(
        (
            value
            for value in (getattr(model_config, attr, None) for attr in candidate_attrs)
            if value
        ),
        None,
    )

    if detected:
        print(f"Found max lenth: {detected}")
        return detected

    fallback = 1024
    print(f"Using default max length: {fallback}")
    return fallback


def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the ``text`` column of a batch.

    :param batch: mapping with a ``text`` entry holding the prompts.
    :param tokenizer: callable tokenizer.
    :param max_length: forwarded as ``max_length``; truncation is enabled.
    :return: whatever the tokenizer returns for the batch.
    """
    prompts = batch["text"]
    encoded = tokenizer(prompts, max_length=max_length, truncation=True)
    return encoded


# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: "AutoTokenizer", max_length: int, seed, dataset: "Dataset"):
    """Format and tokenize ``dataset`` so it is ready for training.

    Steps: add the prompt text to every sample, print one random sample as a
    sanity check, tokenize in batches, drop samples whose token count is not
    below ``max_length``, then shuffle.

    :param tokenizer (AutoTokenizer): model tokenizer
    :param max_length (int): maximum number of tokens to emit from the tokenizer
    :param seed: seed passed to ``dataset.shuffle``
    :param dataset: dataset with ``input`` and ``output`` columns
    :return: tokenized, filtered and shuffled dataset
    """

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Show one random formatted sample. randint() is inclusive on both ends, so
    # the upper bound must be len - 1 (len itself is one past the last index).
    if len(dataset) > 0:
        print(dataset[randint(0, len(dataset) - 1)])

    # Tokenize in batches and drop the raw 'input', 'output' and 'text' columns
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["input", "output", "text"],
    )

    # Keep only samples strictly shorter than max_length (tokenization truncates
    # to max_length, so this drops everything that hit the limit)
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset

In [8]:
def create_bnb_config():
    """Build the bitsandbytes quantization settings used to load the model.

    :return: ``BitsAndBytesConfig`` with 4-bit NF4 loading, double quantization
        enabled and bfloat16 as the compute dtype.
    """
    quantization_options = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    return BitsAndBytesConfig(**quantization_options)

In [9]:
def create_peft_config(modules):
    """
    Build the LoRA (parameter-efficient fine-tuning) configuration.
    :param modules: names of the modules LoRA adapters are attached to
    :return: ``LoraConfig`` for causal-LM fine-tuning
    """
    return LoraConfig(
        task_type="CAUSAL_LM",
        target_modules=modules,
        r=16,              # rank of the low-rank update matrices
        lora_alpha=64,     # scaling factor applied to the update
        lora_dropout=0.1,  # dropout probability on the LoRA layers
        bias="none",
    )

In [10]:
def find_all_linear_names(model):
    """Collect the leaf names of all 4-bit linear submodules of ``model``.

    Every submodule that is a ``bnb.nn.Linear4bit`` contributes the last
    component of its dotted name; ``lm_head`` is excluded from the result.

    :param model: object exposing ``named_modules()``.
    :return: list of unique module names (order not guaranteed).
    """
    linear_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_cls)
    }

    # needed for 16-bit: the output head must not receive adapters
    leaf_names.discard('lm_head')
    return list(leaf_names)

In [11]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Print the total and trainable parameter counts of ``model``.

    :param model: object exposing ``named_parameters()``.
    :param use_4bit: when true, the trainable count is halved before printing.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        # Integer division keeps the count an int; a float here makes the
        # ",d" format below raise ValueError.
        trainable_params //= 2

    # Guard against a model without parameters (division by zero).
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

In [12]:
# Quantization settings used when loading the model.
bnb_config = create_bnb_config()

# Load model and tokenizer (load_model tries the local checkpoint first).
model, tokenizer = load_model(bnb_config)

# Maximum sequence length used when tokenizing the datasets below;
# read from the model config, with a default chosen inside get_max_length.
max_length = get_max_length(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Local model loaded successfully
Found max lenth: 4096


In [13]:
# Locations of the OWID train/validation index files come from the
# `datasets -> training -> owid` section of the loaded config.
training_dataset_config = config['datasets']['training']
owid_train_dataset = OWIDDataset(training_dataset_config['owid']['train'])
owid_validation_dataset = OWIDDataset(training_dataset_config['owid']['validation'])

# Materialize each split through load_dataset; CombinedDataset wraps a list so
# further sources can be appended to either split.
train_dataset = load_dataset(CombinedDataset([owid_train_dataset]))
validation_dataset = load_dataset(CombinedDataset([owid_validation_dataset]))

Loading dataset: 100%|██████████| 5356/5356 [00:00<00:00, 55326.71it/s]

Number of prompts: 5356
Column names are: ['input', 'output']





In [14]:
# A fresh shuffle seed is drawn on every run; print it so a particular run can
# be reproduced later by assigning the same value to `seed`.
seed = randint(0, 10000)
print(f"Shuffle seed: {seed}")
train_dataset = preprocess_dataset(tokenizer, max_length, seed, train_dataset)
validation_dataset = preprocess_dataset(tokenizer, max_length, seed, validation_dataset)

Preprocessing dataset...


Map:   0%|          | 0/5356 [00:00<?, ? examples/s]

{'input': 'Lithuania - Dentistry personnel\nYear: 1995, 1996, 1997, 1998, 1999, 2010, 2011, 2012, 2013, 2014, 2015\nDentistry personnel: 0.48, 0.474, 0.602, 0.637, 0.656, 0.786, 0.81, 0.89, 0.904, 0.915, 0.919\n', 'output': 'In 2015 , dentistry personnel for Lithuania was 0.9 number per thousand population . Dentistry personnel of Lithuania increased from 0.5 number per thousand population in 1996 to 0.9 number per thousand population in 2015 growing at an average annual rate of 7.97 % .', 'text': '<s>[INST] From the below input full content of a chart, write a summary that reflects the meaning and trend of the chart.\n    Chart content: Lithuania - Dentistry personnel\nYear: 1995, 1996, 1997, 1998, 1999, 2010, 2011, 2012, 2013, 2014, 2015\nDentistry personnel: 0.48, 0.474, 0.602, 0.637, 0.656, 0.786, 0.81, 0.89, 0.904, 0.915, 0.919\n [/INST] In 2015 , dentistry personnel for Lithuania was 0.9 number per thousand population . Dentistry personnel of Lithuania increased from 0.5 number p

Map:   0%|          | 0/5356 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5356 [00:00<?, ? examples/s]

In [15]:
def train(model, tokenizer, train_dataset, validation_dataset, output_dir, eval_steps=50, save_steps=50):
    """Fine-tune ``model`` with LoRA adapters and save the result to ``output_dir``.

    The model is prepared for k-bit training, wrapped with a PEFT/LoRA config
    targeting all 4-bit linear layers, trained with the Hugging Face ``Trainer``,
    evaluated on ``validation_dataset`` and finally saved with
    ``save_pretrained``.

    :param model: quantized base model to fine-tune.
    :param tokenizer: tokenizer used by the language-modeling data collator.
    :param train_dataset: tokenized training dataset.
    :param validation_dataset: tokenized evaluation dataset.
    :param output_dir: directory the final model is saved to (intermediate
        Trainer checkpoints go to ``outputs``).
    :param eval_steps: run evaluation every this many optimizer steps.
    :param save_steps: save a checkpoint every this many optimizer steps; must
        be a multiple of ``eval_steps`` because ``load_best_model_at_end`` is on.
    """
    # Apply preprocessing to the model to prepare it by
    # 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # 2 - Using the prepare_model_for_kbit_training method from PEFT
    model = prepare_model_for_kbit_training(model)

    # Get lora module names
    modules = find_all_linear_names(model)

    # Create PEFT config for these modules and wrap the model to PEFT
    peft_config = create_peft_config(modules)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)

    # Training parameters
    trainer = Trainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        args=TrainingArguments(
            per_device_train_batch_size=8,
            gradient_accumulation_steps=4,
            warmup_steps=2,
            # max_steps=1000,
            num_train_epochs=2,
            learning_rate=2e-5,
            # NOTE(review): the quantization config computes in bfloat16 while
            # training uses fp16 mixed precision; confirm this is intended.
            fp16=True,
            logging_steps=1,
            output_dir="outputs",
            optim="paged_adamw_8bit",
            evaluation_strategy="steps",
            # Without an explicit value eval_steps falls back to logging_steps,
            # i.e. a full validation pass after every single step.
            eval_steps=eval_steps,
            save_strategy="steps",
            # The default save interval (500 steps) can exceed the whole run,
            # in which case no checkpoint exists for load_best_model_at_end.
            save_steps=save_steps,
            save_total_limit=3,  # keep only the most recent / best checkpoints
            load_best_model_at_end=True,  # reload the best checkpoint (by eval_loss) when training ends
            metric_for_best_model="eval_loss",
        ),
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )

    model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

    ### SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
    # Verifying the datatypes before training: parameter count and share per dtype
    dtype_counts = {}
    for _, param in model.named_parameters():
        dtype_counts[param.dtype] = dtype_counts.get(param.dtype, 0) + param.numel()
    total = sum(dtype_counts.values())
    for dtype, count in dtype_counts.items():
        print(dtype, count, count / total)

    # Launch training
    print("Training...")

    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

    ###

    val_metrics = trainer.evaluate(eval_dataset=validation_dataset)
    print("Validation metrics:", val_metrics)

    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Drop this function's references and release cached GPU memory before the
    # weights are merged. This only removes the local names: a caller that still
    # holds the model object must delete its own reference to actually free it.
    del model
    del trainer
    torch.cuda.empty_cache()
    
    
# Directory train() saves the fine-tuned PEFT model to; the merge cell below
# reloads the model from this same path.
output_dir = "tmp/llama2/final_checkpoint"
train(model, tokenizer, train_dataset, validation_dataset, output_dir)

all params: 3,540,389,888 || trainable params: 39,976,960 || trainable%: 1.1291682911958425
torch.float32 302387200 0.08541070604255438
torch.uint8 3238002688 0.9145892939574456
Training...


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
1,0.7132
2,0.733
3,0.7262
4,0.84
5,0.8548
6,0.7975
7,0.8124
8,0.7654
9,0.782
10,0.8522


***** train metrics *****
  epoch                    =        1.99
  total_flos               = 517730587GF
  train_loss               =      0.7508
  train_runtime            =  1:48:07.89
  train_samples_per_second =       1.651
  train_steps_per_second   =       0.051
{'train_runtime': 6487.8987, 'train_samples_per_second': 1.651, 'train_steps_per_second': 0.051, 'total_flos': 5.559089851645624e+17, 'train_loss': 0.7507595837473156, 'epoch': 1.99}
Saving last checkpoint of the model...


In [16]:
# Access tokens must not be committed with the notebook: read the Hugging Face
# token from the environment (None falls back to the library's own credentials).
hf_token = os.environ.get("HF_TOKEN")

# Reload the fine-tuned PEFT model in bfloat16 and fold the LoRA weights into
# the base model.
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16, token=hf_token)
model = model.merge_and_unload()

output_merged_dir = destination_directory
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(output_merged_dir, safe_serialization=True)

# save tokenizer next to the merged weights for easy inference
tokenizer = AutoTokenizer.from_pretrained(
    'meta-llama/Llama-2-7b-hf',
    token=hf_token,
    use_fast=False,
    add_eos_token=True
)
# Same hard-coded padding token id as used when loading the model for training.
tokenizer.pad_token_id = 18610
tokenizer.save_pretrained(output_merged_dir)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

('/notebooks/models/fine-tuned/Llama-2-7b-hf-c2t-2-005/tokenizer_config.json',
 '/notebooks/models/fine-tuned/Llama-2-7b-hf-c2t-2-005/special_tokens_map.json',
 '/notebooks/models/fine-tuned/Llama-2-7b-hf-c2t-2-005/tokenizer.model',
 '/notebooks/models/fine-tuned/Llama-2-7b-hf-c2t-2-005/added_tokens.json')

In [17]:
# tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf', token=os.environ.get("HF_TOKEN"))
# tokenizer.save_pretrained('/notebooks/models/fine-tuned/Llama-2-7b-chat-hf-005')