<a href="https://colab.research.google.com/github/gulabpatel/InstructModels/blob/main/Alpaca/DataMaker_for_Alpaca_style_custom_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Code walkthrough video: https://www.youtube.com/watch?v=ivXcInXR5jo

In [None]:
!pip -q install openai

In [None]:
!git clone https://github.com/tatsu-lab/stanford_alpaca.git

In [3]:
# Later cells use paths relative to the repository root (e.g. ./prompt.txt, ./seed_tasks.jsonl),
# so make the clone the working directory. Run this cell only once per kernel session.
%cd stanford_alpaca

/content/stanford_alpaca


## Alpaca Data Creator


<img src="https://github.com/tatsu-lab/stanford_alpaca/blob/main/assets/parse_analysis.png?raw=1" alt="example image" width="600">

In [None]:
!pip install -r requirements.txt

In [5]:
import os
from getpass import getpass

import openai

# Never store a real key in the notebook: take it from the environment and
# prompt (without echoing) only when it is not set.
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')
# Use the same value for the in-process client; the environment variable is
# what processes launched from this notebook will see.
openai.api_key = os.environ['OPENAI_API_KEY']

## Data Generation

In [6]:
"""
batch_selfinstruct_generate.py
run:
python -m generate_instruction generate_instruction_following_data \
  --output_dir ./ \
  --num_instructions_to_generate 10 \
  --model_name="text-davinci-003" \
"""

'\nbatch_selfinstruct_generate.py\nrun:\npython -m generate_instruction generate_instruction_following_data   --output_dir ./   --num_instructions_to_generate 10   --model_name="text-davinci-003" '

In [7]:
import time
import json
import os
import random
import re
import string
from functools import partial
from multiprocessing import Pool
import numpy as np
import tqdm
from rouge_score import rouge_scorer
import utils
import fire

In [8]:
def encode_prompt(prompt_instructions, prompt_path="./prompt.txt"):
    """Encode multiple prompt instructions into a single string.

    The result is the text of the prompt template file followed by one numbered
    ``Instruction`` / ``Input`` / ``Output`` section per task, each introduced
    by a ``###`` separator, and finally an open ``N. Instruction:`` header for
    the model to continue from.

    Args:
        prompt_instructions: iterable of dicts with string values under the
            keys ``"instruction"``, ``"input"`` and ``"output"``.
        prompt_path: path of the prompt template file. Defaults to
            ``./prompt.txt``, the location used before this parameter existed.

    Returns:
        The assembled prompt string.
    """
    # Read through a context manager so the file handle is closed immediately.
    with open(prompt_path) as prompt_file:
        prompt = prompt_file.read() + "\n"

    # Tracks how many examples were written; stays 0 for an empty iterable so the
    # trailing header below is still well defined.
    count = 0
    for count, task_dict in enumerate(prompt_instructions, start=1):
        # Collapse runs of whitespace and drop any trailing colon(s).
        instruction = re.sub(r"\s+", " ", task_dict["instruction"]).strip().rstrip(":")
        # An empty input is spelled out explicitly with the "<noinput>" marker.
        task_input = task_dict["input"]
        task_input = "<noinput>" if task_input.lower() == "" else task_input
        output = task_dict["output"]
        prompt += "###\n"
        prompt += f"{count}. Instruction: {instruction}\n"
        prompt += f"{count}. Input:\n{task_input}\n"
        prompt += f"{count}. Output:\n{output}\n"
    prompt += "###\n"
    # Leave the next numbered instruction open for the model to complete.
    prompt += f"{count + 1}. Instruction:"
    return prompt

def post_process_gpt3_response(num_prompt_instructions, response):
    """Parse one completion into a list of filtered instruction records.

    The completion text is prefixed with the ``N. Instruction:`` header that
    ended the prompt, split on ``###`` and each piece is parsed as a numbered
    ``Instruction`` / ``Input`` / ``Output`` triple. Pieces that do not parse,
    or whose instruction fails one of the filters below, are dropped.

    Args:
        num_prompt_instructions: number of examples that were in the prompt;
            generated examples are expected to be numbered from
            ``num_prompt_instructions + 1`` upwards.
        response: ``None`` or a mapping indexed with ``"text"`` and
            ``"finish_reason"``.

    Returns:
        A list of dicts with the keys ``"instruction"``, ``"input"`` and
        ``"output"``; empty when ``response`` is ``None``.
    """
    if response is None:
        return []

    # Keywords that are not suitable for language models. Built once per call
    # rather than once per parsed example.
    blacklist = [
        "image",
        "images",
        "graph",
        "graphs",
        "picture",
        "pictures",
        "file",
        "files",
        "map",
        "maps",
        "draw",
        "plot",
        "go to",
        "video",
        "audio",
        "music",
        "flowchart",
        "diagram",
    ]

    raw_instructions = f"{num_prompt_instructions+1}. Instruction:" + response["text"]
    raw_instructions = re.split("###", raw_instructions)
    instructions = []
    for idx, inst in enumerate(raw_instructions):
        # if the decoding stops due to length, the last example is likely truncated so we discard it
        if idx == len(raw_instructions) - 1 and response["finish_reason"] == "length":
            continue
        idx += num_prompt_instructions + 1
        # Raw f-string: "\." and "\s" are regex escapes, not Python string escapes.
        splitted_data = re.split(rf"{idx}\.\s+(Instruction|Input|Output):", inst)
        # One capturing group and three headers give exactly 7 pieces:
        # [prefix, label, instruction, label, input, label, output].
        if len(splitted_data) != 7:
            continue
        inst = splitted_data[2].strip()
        task_input = splitted_data[4].strip()
        task_input = "" if task_input.lower() == "<noinput>" else task_input
        output = splitted_data[6].strip()
        # filter out too short or too long instructions
        if len(inst.split()) <= 3 or len(inst.split()) > 150:
            continue
        # filter based on keywords that are not suitable for language models.
        if any(find_word_in_string(word, inst) for word in blacklist):
            continue
        # We found that the model tends to add "write a program" to some existing instructions, which leads to a lot of such instructions.
        # And it's a bit confusing whether the model needs to write a program or directly output the result.
        # Here we filter them out.
        # Note this is not a comprehensive filtering for all programming instructions.
        if inst.startswith("Write a program"):
            continue
        # filter those starting with punctuation
        if inst[0] in string.punctuation:
            continue
        # filter those starting with non-english character
        if not inst[0].isascii():
            continue
        instructions.append({"instruction": inst, "input": task_input, "output": output})
    return instructions


def find_word_in_string(w, s):
    """Search ``s`` for ``w`` as a whole word, ignoring case.

    ``w`` is matched literally: it is escaped before being placed between
    ``\\b`` word boundaries, so characters such as ``.`` or ``+`` in the word
    are not interpreted as regular-expression syntax.

    Args:
        w: the word or phrase to look for.
        s: the string to search.

    Returns:
        The ``re.Match`` for the first occurrence (group 1 is the matched
        text), or ``None`` when there is none.
    """
    return re.search(r"\b({0})\b".format(re.escape(w)), s, flags=re.IGNORECASE)

def generate_instruction_following_data(
    output_dir="./",
    seed_tasks_path="./seed_tasks.jsonl",
    num_instructions_to_generate=100,
    model_name="text-davinci-003",
    num_prompt_instructions=3,
    request_batch_size=5,
    temperature=1.0,
    top_p=1.0,
    num_cpus=16,
):
    """Generate machine-written instruction data from seed tasks and save it to ``regen.json``.

    Each request batch builds ``request_batch_size`` prompts, every one from
    ``num_prompt_instructions`` randomly sampled seed tasks, sends them through
    ``utils.openai_completion`` and parses the completions with
    ``post_process_gpt3_response``. A parsed instruction is kept only when its
    highest ROUGE-L F-measure against the seed and previously kept instructions
    is at most 0.7. The accumulated data is written to
    ``<output_dir>/regen.json`` after every batch; if that file already exists
    its contents are loaded first and count towards the target.

    Because a whole batch is processed before the loop condition is rechecked,
    the final number of instructions can exceed ``num_instructions_to_generate``.

    Args:
        output_dir: directory for ``regen.json``; created if missing.
        seed_tasks_path: JSON Lines file; each record is read for
            ``"instruction"`` and ``"instances"[0]["input"/"output"]``.
        num_instructions_to_generate: stop requesting once at least this many
            machine-generated instructions have been collected.
        model_name: passed to ``utils.openai_completion``.
        num_prompt_instructions: seed examples sampled into each prompt.
        request_batch_size: prompts sent per request batch.
        temperature: sampling temperature for decoding.
        top_p: nucleus-sampling parameter for decoding.
        num_cpus: worker processes used for the ROUGE comparisons.

    Returns:
        None. Results are persisted to ``regen.json``.
    """
    # Close the seed file promptly and tolerate blank lines in the JSONL.
    with open(seed_tasks_path, "r") as seed_file:
        seed_tasks = [json.loads(line) for line in seed_file if line.strip()]
    seed_instruction_data = [
        {"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]}
        for t in seed_tasks
    ]
    print(f"Loaded {len(seed_instruction_data)} human-written seed instructions")

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "regen.json")
    request_idx = 0
    # load the LM-generated instructions
    machine_instruction_data = []
    if os.path.exists(output_path):
        machine_instruction_data = utils.jload(output_path)
        print(f"Loaded {len(machine_instruction_data)} machine-generated instructions")

    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

    # first we tokenize all the seed instructions and generated machine instructions
    all_instructions = [d["instruction"] for d in seed_instruction_data] + [
        d["instruction"] for d in machine_instruction_data
    ]
    all_instruction_tokens = [scorer._tokenizer.tokenize(inst) for inst in all_instructions]

    # One worker pool and one progress bar for the whole run: starting a fresh
    # pool for every candidate instruction is needlessly expensive, and the
    # context managers guarantee both are cleaned up even if a request fails.
    with Pool(num_cpus) as pool, tqdm.tqdm(total=num_instructions_to_generate) as progress_bar:
        if machine_instruction_data:
            progress_bar.update(len(machine_instruction_data))

        # now let's generate new instructions!
        while len(machine_instruction_data) < num_instructions_to_generate:
            request_idx += 1

            batch_inputs = []
            for _ in range(request_batch_size):
                # only sampling from the seed tasks
                prompt_instructions = random.sample(seed_instruction_data, num_prompt_instructions)
                prompt = encode_prompt(prompt_instructions)
                batch_inputs.append(prompt)
            decoding_args = utils.OpenAIDecodingArguments(
                temperature=temperature,
                n=1,
                max_tokens=3072,  # hard-code to maximize the length. the requests will be automatically adjusted
                top_p=top_p,
                # NOTE(review): "20." is listed twice; kept unchanged so the request is identical.
                stop=["\n20", "20.", "20."],
            )
            request_start = time.time()
            results = utils.openai_completion(
                prompts=batch_inputs,
                model_name=model_name,
                batch_size=request_batch_size,
                decoding_args=decoding_args,
                logit_bias={"50256": -100},  # prevent the <|endoftext|> token from being generated
            )
            request_duration = time.time() - request_start

            process_start = time.time()
            instruction_data = []
            for result in results:
                new_instructions = post_process_gpt3_response(num_prompt_instructions, result)
                instruction_data += new_instructions

            total = len(instruction_data)
            keep = 0
            for instruction_data_entry in instruction_data:
                # computing similarity with the pre-tokenized instructions
                new_instruction_tokens = scorer._tokenizer.tokenize(instruction_data_entry["instruction"])
                rouge_scores = pool.map(
                    partial(rouge_scorer._score_lcs, new_instruction_tokens),
                    all_instruction_tokens,
                )
                rouge_scores = [score.fmeasure for score in rouge_scores]
                # Reject near-duplicates of anything already in the pool.
                if max(rouge_scores) > 0.7:
                    continue
                keep += 1
                # Only kept entries need the ten closest existing instructions
                # (most similar first).
                most_similar_instructions = {
                    all_instructions[i]: rouge_scores[i] for i in np.argsort(rouge_scores)[-10:][::-1]
                }
                instruction_data_entry["most_similar_instructions"] = most_similar_instructions
                instruction_data_entry["avg_similarity_score"] = float(np.mean(rouge_scores))
                machine_instruction_data.append(instruction_data_entry)
                # Later candidates in the same batch are compared against this one too.
                all_instructions.append(instruction_data_entry["instruction"])
                all_instruction_tokens.append(new_instruction_tokens)
                progress_bar.update(1)
            process_duration = time.time() - process_start
            print(f"Request {request_idx} took {request_duration:.2f}s, processing took {process_duration:.2f}s")
            print(f"Generated {total} instructions, kept {keep} instructions")
            # Persist after every batch so an interrupted run can be resumed.
            utils.jdump(machine_instruction_data, output_path)

## The Data

In [9]:
import json

# Relative to the repository root (the working directory after `%cd stanford_alpaca`),
# like the other paths in this notebook, instead of a Colab-only absolute path.
file_path = "./alpaca_data.json"
with open(file_path, 'r') as json_file:
    data = json.load(json_file)
# Show a single record rather than dumping the whole dataset.
data[245]

{'instruction': 'What is the scientific name for a beaver?',
 'input': '',
 'output': 'The scientific name for a beaver is Castor canadensis.'}

## Seed Tasks

In [11]:
seed_tasks_path="./seed_tasks.jsonl"

# One JSON object per line; the context manager closes the file once it is read,
# and blank lines are skipped instead of failing to parse.
with open(seed_tasks_path, "r") as seed_file:
    seed_tasks = [json.loads(line) for line in seed_file if line.strip()]

# Keep only the instruction and the first instance of every task.
seed_instruction_data = [
    {"instruction": t["instruction"], "input": t["instances"][0]["input"], "output": t["instances"][0]["output"]}
    for t in seed_tasks
]
print(f"Loaded {len(seed_instruction_data)} human-written seed instructions")

Loaded 175 human-written seed instructions


In [12]:
seed_tasks[12]

{'id': 'seed_task_12',
 'name': 'explain_behavior',
 'instruction': "Explain human's behavior.",
 'instances': [{'input': 'Behavior: cry.',
   'output': 'There could be many reasons why a person might cry. They could be feeling sad, scared, angry, or frustrated. Sometimes people cry when they are happy or relieved. There is no one answer to why people behave the way they do.'}],
 'is_classification': False}

## Creating a new set of seed tasks

In [13]:
new_seed_tasks = seed_tasks[:3]

In [14]:
new_seed_tasks

[{'id': 'seed_task_0',
  'name': 'breakfast_suggestion',
  'instruction': "Is there anything I can eat for a breakfast that doesn't include eggs, yet includes protein, and has roughly 700-1000 calories?",
  'instances': [{'input': '',
    'output': 'Yes, you can have 1 oatmeal banana protein shake and 4 strips of bacon. The oatmeal banana protein shake may contain 1/2 cup oatmeal, 60 grams whey protein powder, 1/2 medium banana, 1tbsp flaxseed oil and 1/2 cup watter, totalling about 550 calories. The 4 strips of bacon contains about 200 calories.'}],
  'is_classification': False},
 {'id': 'seed_task_1',
  'name': 'antonym_relation',
  'instruction': 'What is the relation between the given pairs?',
  'instances': [{'input': 'Night : Day :: Right : Left',
    'output': 'The relation between the given pairs is that they are opposites.'}],
  'is_classification': False},
 {'id': 'seed_task_2',
  'name': 'one_sentence_description',
  'instruction': 'Generate a one-sentence description for ea

{  
    'id': 'seed_task_1',  
  'name': 'antonym_relation',  
  'instruction': 'What is the relation between the given pairs?',  
  'instances': [{'input': 'Night : Day :: Right : Left',  
    'output': 'The relation between the given pairs is that they are opposites.'}],  
  'is_classification': False}

In [15]:
json_new_tasks = '''
[
    {
        "id": "new_seed_task_3",
        "name": "vegetarian_dinner_options",
        "instruction": "What are some vegetarian dinner options with high protein content?",
        "instances": [{"input": "",
                      "output": "Some vegetarian dinner options with high protein content include lentil soup, chickpea curry, tofu stir-fry, tempeh tacos, and quinoa stuffed peppers."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_4",
        "name": "refund_policy_explanation",
        "instruction": "Can you explain your refund policy?",
        "instances": [{"input": "",
                      "output": "Our refund policy allows customers to request a refund within 30 days of purchase if they are unsatisfied with the product or service. To be eligible for a refund, the product must be in its original condition and packaging, and services must not have been completed."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_5",
        "name": "troubleshoot_wifi_connection",
        "instruction": "I'm having trouble connecting to my Wi-Fi network. What should I do?",
        "instances": [{"input": "",
                      "output": "First, check if your device is within range of your Wi-Fi router. Then, verify that you have entered the correct network name and password. If the issue persists, try restarting both your device and the router. If you still cannot connect, contact your internet service provider for further assistance."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_6",
        "name": "change_password_instructions",
        "instruction": "How do I change my password?",
        "instances": [{"input": "",
                      "output": "To change your password, log in to your account, go to the settings or account management section, and find the option to change your password. Enter your current password, then enter your new password twice for confirmation. Save your changes to update your password."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_7",
        "name": "lost_package_inquiry",
        "instruction": "My package hasn't arrived yet. What should I do?",
        "instances": [{"input": "",
                      "output": "First, check your order confirmation email or account for the tracking number and delivery date estimate. If the delivery date has passed, contact the shipping carrier with your tracking number for an update. If the carrier cannot resolve the issue, please contact our customer support team for assistance."}],
        "is_classification": false
    },
    {
        "id": "new_seed_task_8",
        "name": "modify_order",
        "instruction": "I need to modify my order. How can I do that?",
        "instances": [{"input": "",
                      "output": "To modify your order, please contact our customer support team as soon as possible with your order number and the changes you would like to make. Please note that modifications may not be possible if the order has already been processed or shipped."}],
        "is_classification": false
    }
]
'''


In [16]:
new_seed_tasks = json.loads(json_new_tasks)
print(new_seed_tasks)

[{'id': 'new_seed_task_3', 'name': 'vegetarian_dinner_options', 'instruction': 'What are some vegetarian dinner options with high protein content?', 'instances': [{'input': '', 'output': 'Some vegetarian dinner options with high protein content include lentil soup, chickpea curry, tofu stir-fry, tempeh tacos, and quinoa stuffed peppers.'}], 'is_classification': False}, {'id': 'new_seed_task_4', 'name': 'refund_policy_explanation', 'instruction': 'Can you explain your refund policy?', 'instances': [{'input': '', 'output': 'Our refund policy allows customers to request a refund within 30 days of purchase if they are unsatisfied with the product or service. To be eligible for a refund, the product must be in its original condition and packaging, and services must not have been completed.'}], 'is_classification': False}, {'id': 'new_seed_task_5', 'name': 'troubleshoot_wifi_connection', 'instruction': "I'm having trouble connecting to my Wi-Fi network. What should I do?", 'instances': [{'in

In [17]:
# Write the custom tasks as JSON Lines: one serialized task object per line.
with open('new_seed_tasks.jsonl', 'w') as jsonl_file:
    for task in new_seed_tasks:
        jsonl_file.write(json.dumps(task) + '\n')

In [20]:
!mkdir new_tasks

In [21]:
# Run the generator through the repository's command-line entry point, using the
# custom seed file written above and ./new_tasks/ as the output directory.
!python -m generate_instruction generate_instruction_following_data --output_dir ./new_tasks/ --seed_tasks_path ./new_seed_tasks.jsonl --num_instructions_to_generate 5 --num_prompt_instructions 3 --request_batch_size 2 --num_cpus 4

Loaded 6 human-written seed instructions
  0% 0/5 [00:00<?, ?it/s]
prompt_batches:   0% 0/1 [00:00<?, ?it/s][A
prompt_batches: 100% 1/1 [00:39<00:00, 39.64s/it]
27it [00:40,  4.76it/s]Request 1 took 39.64s, processing took 1.24s
Generated 27 instructions, kept 27 instructions
27it [00:40,  1.51s/it]


In [22]:
# Call the notebook-defined function directly. If ./new_tasks/regen.json already
# holds at least `num_instructions_to_generate` entries, they are loaded and no
# new requests are made.
generation_kwargs = dict(
    output_dir="./new_tasks/",
    seed_tasks_path="./new_seed_tasks.jsonl",
    num_instructions_to_generate=5,
    model_name="text-davinci-003",
    num_prompt_instructions=3,
    request_batch_size=2,
    temperature=1.0,
    top_p=1.0,
    num_cpus=4,
)
generate_instruction_following_data(**generation_kwargs)

Loaded 6 human-written seed instructions
Loaded 27 machine-generated instructions


27it [00:00, 12525.85it/s]           


In [23]:
import json

# Relative to the repository root (the working directory), matching the
# `./new_tasks/` output directory used above instead of a Colab-only absolute path.
file_path = "./new_tasks/regen.json"

with open(file_path, 'r') as json_file:
    generated_records = json.load(json_file)

# Drop the ROUGE bookkeeping fields so only instruction / input / output remain.
similarity_fields = ('most_similar_instructions', 'avg_similarity_score')
data = [
    {key: value for key, value in record.items() if key not in similarity_fields}
    for record in generated_records
]

print(data)

[{'instruction': 'Classify this statement as a fact or an opinion: "Psychology is the scientific study of the mind and behavior."', 'input': 'Psychology is the scientific study of the mind and behavior.', 'output': 'This statement is a fact.'}, {'instruction': 'Write five adjectives to describe a laptop', 'input': '', 'output': 'A laptop can be described as powerful, portable, lightweight, versatile, and durable.'}, {'instruction': 'I want to rent an apartment. What do I need to do?', 'input': '', 'output': "To rent an apartment, you will need to make sure you meet the qualifications, including having a good credit score and sufficient income. You will also need to apply with the rental agency and prepare a rental application and documents proving your identity and income. Finally, you may need to pay a security deposit or first month's rent before you can move in."}, {'instruction': 'Convert 84 Fahrenheit to Celsius', 'input': '', 'output': '84 Fahrenheit is equivalent to 28.888888888

In [24]:
len(data)

27

In [25]:
data[16]

{'instruction': 'Generate a new username for me',
 'input': '',
 'output': 'GekyDim13'}

In [26]:
data[3]

{'instruction': 'Convert 84 Fahrenheit to Celsius',
 'input': '',
 'output': '84 Fahrenheit is equivalent to 28.88888888888889 Celsius.'}

In [27]:
data[5]

{'instruction': 'Write a rejection letter for a job candidate',
 'input': 'Dear John, \n\nWe appreciate your application to ABC Company. After careful consideration, we have decided not to move forward with your candidacy at this time.',
 'output': 'We appreciate your enthusiasm and qualifications, but unfortunately we have decided to move forward with another candidate. We wish you all the best in your future endeavors and thank you for your interest in ABC Company.'}