## 7.2 Preparing a dataset for supervised instruction fine-tuning


In [1]:
import json
import os
import urllib
import urllib.request


def download_and_load_file(file_path, url):
    """Load a JSON dataset from ``file_path``, downloading it first if needed.

    If nothing exists at ``file_path``, the resource at ``url`` is fetched,
    decoded as UTF-8 and written to ``file_path``. The file is then parsed
    as JSON.

    Args:
        file_path: Local path of the cached JSON file.
        url: Location to download the JSON text from when the cache is absent.

    Returns:
        The object produced by ``json.load`` for the file's contents.
    """
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    # Read with the same encoding the cache is written in; relying on the
    # platform default encoding can mis-decode non-ASCII text.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)


# Local cache location for the instruction dataset, and the remote copy that
# is fetched when the cache file does not exist yet.
file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path, url)
print(f"Number of entries: {len(data)}")

Number of entries: 1100


In [3]:
import pprint


# Pretty-print the first entry to inspect the fields of a single record.
pprint.pp(data[0])

{'instruction': 'Evaluate the following phrase by transforming it into the '
                'spelling given.',
 'input': 'freind --> friend',
 'output': 'The spelling of the given phrase "freind" is incorrect, the '
           'correct spelling is "friend".'}


Instruction fine-tuning involves training a model on datasets like the one above.

To do so, each entry is first converted into text that follows a fixed format (called a prompt style), and the model is then trained on that formatted text.

## Apply Alpaca prompt style template
```
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Evaluate the following phrase by transforming it into the spelling given.

### Input:
freind --> friend

### Response:
The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".
```

## Apply Phi-3 prompt style template
```
<|user|>
Evaluate the following phrase by transforming it into the spelling given: "freind --> friend" 

<|assistant|>
The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".
```


In [4]:
def format_input(entry):
    """Build the Alpaca-style prompt (without the response) for one entry.

    Args:
        entry: Mapping with an "instruction" string and an optional "input"
            string. A missing or empty "input" omits the input section.

    Returns:
        The prompt text: the fixed preamble, the "### Instruction:" section
        and, when input text is present, the "### Input:" section.
    """
    # Single quotes inside the f-strings keep this valid on Python versions
    # older than 3.12, which reject reusing the enclosing quote character.
    instruction_text = (
        "Below is an instruction that describes a task.\n"
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # Some instructions carry no extra input, e.g.
    # "What is an antonym of 'complicated'?"; skip the section for those.
    extra_input = entry.get("input")
    input_text = f"\n\n### Input:\n{extra_input}" if extra_input else ""

    return instruction_text + input_text

In [6]:
# Assemble the complete training text for the first entry: the prompt from
# format_input followed by the response section holding the target output.
# Single quotes inside the f-string keep this valid before Python 3.12.
model_input = format_input(data[0])
desired_response = f"\n\n### Response:\n{data[0]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task.
Write a response that appropriately completes the request.

### Instruction:
Evaluate the following phrase by transforming it into the spelling given.

### Input:
freind --> friend

### Response:
The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".


In [7]:
# Sequential (unshuffled) split: 85% training, 10% test, remainder validation.
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

# The three slices must partition the dataset: nothing lost, nothing duplicated.
assert len(val_data) == val_portion
assert len(train_data) + len(test_data) + len(val_data) == len(data)

print(f"Training set length: {len(train_data)}")
print(f"Validation set length: {len(val_data)}")
print(f"Test set length: {len(test_data)}")

Training set length: 935
Validation set length: 55
Test set length: 110


## 7.3 Organizing data into training batches

In [8]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    """Dataset of instruction entries that are encoded once at construction.

    Each entry is rendered as the prompt from ``format_input`` followed by a
    "### Response:" section containing ``entry["output"]``; the resulting text
    is passed to ``tokenizer.encode`` and the result is stored.

    Args:
        data: Iterable of entries accepted by ``format_input`` that also
            provide an "output" field. Must support ``len()``.
        tokenizer: Object exposing ``encode(text)``; presumably returns a
            list of token ids (not verified here).
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            # Single quotes inside the f-string keep this valid before
            # Python 3.12, which rejects reusing the enclosing quote.
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(tokenizer.encode(full_text))

    def __getitem__(self, index):
        """Return the encoded text stored at position ``index``."""
        return self.encoded_texts[index]

    # Earlier revisions misspelled the method above, so ``dataset[i]`` never
    # reached it; the old name stays callable for any direct callers.
    __get_item__ = __getitem__

    def __len__(self):
        """Return the number of entries in the dataset."""
        return len(self.data)