In [1]:
import json
import os
import urllib

In [2]:
def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode('utf-8')
        
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text_data)
    
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    return data

In [3]:
# instruction fine-tuning dataset based on the book

file_path = 'instruction-data.json'
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path, url)
print(f'Total Entries: {len(data)}')

Total Entries: 1100


In [4]:
print(f'Example:\n{data[50]}')

Example:
{'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [None]:
print(f'Example:\n{data[999]}')  # "input" field may be empty in the JSON

Example:
{'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


### Prompt style

In [6]:
def format_input(entry):
    # alpaca style

    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # input_text will be empty if entry['input'] is empty
    input_text = f"\n\n### Input:\n{entry['input']}" if entry['input'] else ""

    return instruction_text + input_text

In [7]:
model_input = format_input(data[50])
desired_response = f'\n\n### Response:\n{data[50]['output']}'
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


In [8]:
# empty "input"
model_input = format_input(data[999])
desired_response = f'\n\n### Response:\n{data[50]['output']}'
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is an antonym of 'complicated'?

### Response:
The correct spelling is 'Occasion.'


### Train/Val/Test Splits

In [9]:
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

print(f'Train length: {len(train_data)}')
print(f'Val length: {len(val_data)}')
print(f'Test length: {len(test_data)}')

Train length: 935
Val length: 55
Test length: 110
