In [1]:
import json, random
from datasets import load_dataset
from demo_class import FlexFlowDemo

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def create_datasets(dataset_size=10, inference_file_path='inference_dataset.json', finetuning_file_path='finetuning_dataset.json'):
    """Create the inference and finetuning datasets from databricks-dolly-15k.

    Rows are read from the "train" split of
    https://huggingface.co/datasets/databricks/databricks-dolly-15k.
    Only rows whose category contains 'open_qa' or 'closed_qa' and whose
    context is empty are kept; each kept row is stored as the single string
    "<instruction> <response>".

    The finetuning file receives every collected string as an indented JSON
    list. The inference file receives a JSON list holding only the first
    collected string (an empty list when nothing was collected).

    Keyword arguments:
    dataset_size -- the number of prompts to collect before stopping
    inference_file_path -- the file in which to save the inference data
    finetuning_file_path -- the file in which to save the finetuning data
    """
    dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
    data = []
    for row in dataset:
        if len(data) == dataset_size:
            break
        is_qa = "open_qa" in row['category'] or "closed_qa" in row['category']
        # `not row['context']` rather than `len(...) == 0`: a missing (None)
        # context is treated like an empty one instead of raising TypeError.
        if is_qa and not row['context']:
            data.append(row['instruction'] + " " + row['response'])
    # Explicit encoding keeps the output independent of the platform locale.
    with open(inference_file_path, 'w', encoding='utf-8') as file:
        # Only the first prompt is used for inference.
        json.dump(data[:1], file)
    with open(finetuning_file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=2, separators=(',', ': '))

In [3]:
# Runtime and resource settings handed to FlexFlowDemo further down.
configs_dict = dict(
    num_gpus=1,
    memory_per_gpu=8192,
    zero_copy_memory_per_node=12000,
    num_cpus=4,
    legion_utility_processors=4,
    data_parallelism_degree=1,
    tensor_parallelism_degree=1,
    pipeline_parallelism_degree=1,
    offload=False,
    offload_reserve_space_size=8 * 1024,  # 8GB
    use_4bit_quantization=False,
    use_8bit_quantization=False,
    enable_peft=True,
    peft_activation_reserve_space_size=1024,  # 1GB
    peft_weight_reserve_space_size=1024,  # 1GB
    profiling=False,
    inference_debugging=False,
    fusion=False,
    seed=42,
)

# Model identifiers and the input/output files used by the demo.
model_configs = dict(
    base_model="JackFram/llama-160m",
    inference_peft_model_id="goliaro/llama-160m-lora",
    finetuning_peft_model_id="goliaro/llama-160m-lora",
    cache_path="",
    refresh_cache=False,
    full_precision=True,
    # The three entries below are relative paths.
    prompt="inference_dataset.json",
    finetuning_dataset="finetuning_dataset.json",
    output_file="peft_demo.txt",
)

# Fold the model settings into configs_dict so one dictionary carries everything.
configs_dict.update(model_configs)

In [6]:
# Seed Python's `random` module with the seed stored in the shared config.
random.seed(configs_dict["seed"])

# Write the inference and finetuning prompt files at the paths named in the config.
create_datasets(inference_file_path=configs_dict["prompt"], 
                finetuning_file_path=configs_dict["finetuning_dataset"])

demo = FlexFlowDemo(configs_dict)

demo.initialize_flexflow()
demo.start_server()
try:
    demo.generate_finetuning()
finally:
    # Stop the server started above even when finetuning raises, so a failed
    # run does not leave it running in the notebook kernel.
    demo.stop_server()

Number of datapoints: 10
[0 - 7fe968e43280]    0.401319 {3}{Mapper}: Enabled Control Replication Optimizations.
[0 - 7fe968e43280]    0.401630 {3}{Mapper}: Enabled Control Replication Optimizations.
[0 - 7fe968e43280]    0.401717 {3}{Mapper}: Enabled Control Replication Optimizations.
[0 - 7fe968e43280]    0.401789 {3}{Mapper}: Enabled Control Replication Optimizations.
[0 - 7fe968e43280]    0.401857 {3}{Mapper}: Enabled Control Replication Optimizations.
[0 - 7fe968e43280]    0.569370 {3}{flexflow_c}: [FFConfig] new 0x8381f90
[0 - 7fe968e43280]    0.569650 {3}{flexflow_c}: [RequestManager] get 0xaaa00a0
[0 - 7fe968e43280]    0.569733 {3}{flexflow_c}: [RequestManager] set max_requests_per_batch 1
[0 - 7fe968e43280]    0.569759 {3}{flexflow_c}: [RequestManager] set max_tokens_per_batch 128
[0 - 7fe968e43280]    0.569822 {3}{flexflow_c}: [RequestManager] set max_sequence_length 256
[0 - 7fe968e43280]    0.569904 {3}{flexflow_c}: [RequestManager] set_enable_peft_finetuning 1
workSpaceSize