In [9]:
import os
import json
import re

import tiktoken

import pandas as pd
import numpy as np

from openai import OpenAI

In [5]:
os.environ['OPENAI_API_KEY'] = ''

In [6]:
client = OpenAI()

In [11]:
# Pricing and default n_epochs estimate
MAX_TOKENS = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

# Estimate the number of tokens that will be charged for during training
def estimate_tokens(dataset, assistant_tokens):
    # Set the initial number of epochs to the target epochs
    n_epochs = TARGET_EPOCHS

    # Get the number of examples in the dataset
    n_train_examples = len(dataset)

    # If the examples total is less than the minimum target
    # adjust the epochs to ensure we have enough examples for
    # training
    if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
        n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
    
    # If the  number of examples is more than the maximum target
    # adjust the  epochs to ensure we don't exceed the maximum 
    # for training
    elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
        n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

    # Calculate the total number of tokens in the dataset
    n_billing_tokens_in_dataset = sum(min(MAX_TOKENS, length) for length in assistant_tokens)

    # Print the total token count that will be charged during training
    print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")

    # Print the default number of epochs for training
    print(f"You will train for {n_epochs} epochs on this dataset")

    # Print the total number of tokens that will be charged during training
    print(f"You will be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

    # If the total token count exceeds the maximum tokens, print a warning 
    if n_billing_tokens_in_dataset > MAX_TOKENS:
        print(f"WARNING: Your dataset contains examples longer than 4K tokens by {n_billing_tokens_in_dataset - MAX_TOKENS} tokens.")
        print("You will be charged for the full length of these examples during training, but only the first 4K tokens will be used for training.")

# Print the number of tokens in the messages
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

# print the number of tokens in the assistant messages
def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

# Print the distribution of values
def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

# Leer JSON

In [2]:
dataset = None
with open('emojis.jsonl', 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

In [4]:
print(dataset[0])

{'messages': [{'role': 'system', 'content': "You're a chatbot that only responds with emojis!"}, {'role': 'user', 'content': 'I just passed my driving test!'}, {'role': 'assistant', 'content': '(party)'}]}


In [12]:
encoding = tiktoken.get_encoding("cl100k_base")

file = 'emojis.jsonl'
with open(file, "r", encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f]

total_tokens = []
assistant_tokens = []

for ex in dataset:
    messages = ex.get("messages", {})
    total_tokens.append(num_tokens_from_messages(messages))
    assistant_tokens.append(num_assistant_tokens_from_messages(messages))

print_distribution(total_tokens, "total tokens")
print_distribution(assistant_tokens, "assistant tokens")
estimate_tokens(dataset, assistant_tokens)
print(f"Processing file completed: {file}")


#### Distribution of total tokens:
min / max: 31, 50
mean / median: 36.838312829525485, 37.0
p5 / p95: 34.0, 40.0

#### Distribution of assistant tokens:
min / max: 2, 10
mean / median: 4.050966608084359, 4.0
p5 / p95: 3.0, 6.0
Dataset has ~2305 tokens that will be charged for during training
You will train for 3 epochs on this dataset
You will be charged for ~6915 tokens
Processing file completed: emojis.jsonl


# Upload and finetune

In [13]:
# Upload the training and validation dataset files to Azure OpenAI with the SDK.
file = client.files.create(file=open(file, "rb"), purpose="fine-tune")

print("Training file ID:", file.id)
print("Training file name:", file.filename)

Training file ID: file-HGB2NrCGDueg6xhv6asQX1
Training file name: emojis.jsonl


In [17]:
API_VERSION = '2023-09-15-preview'

ft = client.fine_tuning.jobs.create(
    training_file="file-HGB2NrCGDueg6xhv6asQX1",
    model="gpt-3.5-turbo-1106",
    hyperparameters={
        "n_epochs":3
    },
    suffix="emoji"
)
print("Finetuning job ID:", ft.id)

Finetuning job ID: ftjob-EzA3GXhVl4S4js5BDwcKo194


In [24]:
# List all the FT jobs
ft_jobs = client.fine_tuning.jobs.list()

for ft_job in ft_jobs:
    print(ft_job.id, ft_job.status)

ftjob-EzA3GXhVl4S4js5BDwcKo194 succeeded
