In [2]:
# Reference: https://beta.openai.com/docs/guides/fine-tuning

"""
Currently there is no fee for fine-tuning the model itself, however we will be introducing a fee for this starting December 8th 2021 at 00:00 UTC. After this time, trained tokens will be billed at 50% the base model inference rate per 1k tokens (trained tokens = tokens in file * epochs).

We will also begin supporting unlimited monthly fine-tunes and file sizes on December 8th, but until then there is a limit of 10 fine-tuning runs per month and data sets are limited to 2.5M tokens (a file size that's roughly 80-100MB).
"""

# Note: fine-tuning is currently subject to the limits quoted above (runs per month and tokens per file)

import openai

from fp_dataset_artifacts.utils import init_openai
from datasets import list_datasets, load_dataset, list_metrics, load_metric

# Project helper that prepares the `openai` client before any API call
# (presumably sets the API key — confirm in fp_dataset_artifacts.utils).
init_openai()

# Load SNLI through Hugging Face `datasets`; the displayed result is a
# DatasetDict with train / validation / test splits.
data = load_dataset('snli')
data

Reusing dataset snli (/home/x/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
})

In [5]:
"""Fine tuning expects the following format:

{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
...

"""

# Converter from the integer class id stored in the 'label' column to its
# string name; used by the helpers below to spell out labels.
int2label = data['train'].features['label'].int2str

def to_example(data, include_label=True):
    """Render one dataset row as a block of ``Feature: value`` lines.

    Args:
        data: Mapping of feature name to value. Features are emitted in
            the mapping's own key order.
        include_label: When true, the ``label`` value is converted with
            the module-level ``int2label`` and capitalized; when false the
            label line is still emitted but its value is left empty.

    Returns:
        The rendered lines joined by newlines (no trailing newline).
    """
    def render(feature):
        value = data[feature]
        if feature == 'label':
            # Blank out the answer for prompts; otherwise show the class name.
            value = int2label(value).capitalize() if include_label else ''
        return f'{feature.capitalize()}: {value}'

    return '\n'.join(map(render, data.keys()))


def format_jsonl(x):
    """Build the prompt/completion record for one labelled row.

    Args:
        x: A dataset row; must contain an integer ``label`` that
            ``int2label`` can convert.

    Returns:
        A dict with ``prompt`` (the row rendered by ``to_example`` with the
        label value left blank) and ``completion`` (the capitalized label
        name).
    """
    # NOTE(review): the prompt ends with "Label: " (trailing space) and the
    # completion has no leading space — check this against the fine-tuning
    # data-format guidance before changing it, since any inference prompts
    # must match the training format.
    prompt = to_example(x, include_label=False)
    completion = int2label(x['label']).capitalize()
    return {'prompt': prompt, 'completion': completion}
    

def build_jsonl(data, filename):
    """Write the labelled rows of ``data`` to ``filename`` as JSON Lines.

    Args:
        data: Iterable of rows, each with an integer ``label``.
        filename: Destination path; an existing file is overwritten.

    Rows whose ``label`` is negative are skipped. Every remaining row is
    converted with ``format_jsonl`` and written as one JSON object per line.
    """
    import json

    with open(filename, 'w') as out:
        out.writelines(
            json.dumps(format_jsonl(row)) + '\n'
            for row in data
            if row['label'] >= 0  # negative label: row is not exported
        )

# Export the train and validation splits as JSONL files in the working
# directory; these are the files uploaded for fine-tuning.
build_jsonl(data['train'], 'snli_finetune_train.jsonl')
build_jsonl(data['validation'], 'snli_finetune_validation.jsonl')

In [6]:
# Upload the full training file for fine-tuning.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns, instead of staying open for the life of the kernel.
with open("snli_finetune_train.jsonl") as train_fh:
    train_upload = openai.File.create(file=train_fh, purpose="fine-tune")

# Display the API response; its "id" is what FineTune.create takes as training_file.
train_upload

<File file id=file-zESrbwb1mh3p2OjVzhcsKmH4 at 0x7f1fda351f40> JSON: {
  "bytes": 98207620,
  "created_at": 1638409664,
  "filename": "snli_finetune_train.jsonl",
  "id": "file-zESrbwb1mh3p2OjVzhcsKmH4",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}

In [7]:
# Upload the validation file for fine-tuning.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns, instead of staying open for the life of the kernel.
with open("snli_finetune_validation.jsonl") as validation_fh:
    validation_upload = openai.File.create(file=validation_fh, purpose="fine-tune")

# Display the API response; its "id" is what FineTune.create takes as validation_file.
validation_upload

<File file id=file-QC8NTdPW56lhbaDtuxvofGLj at 0x7f203a616810> JSON: {
  "bytes": 1822513,
  "created_at": 1638409667,
  "filename": "snli_finetune_validation.jsonl",
  "id": "file-QC8NTdPW56lhbaDtuxvofGLj",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}

In [13]:
# Start a fine-tune of the "curie" base model on the full training file.
# The two file ids are copied from the upload responses shown in the earlier
# cells, so they are presumably only valid for those specific uploads.
# The job list printed further down reports this job as failed because the
# training file exceeds the token limit.
openai.FineTune.create(
    training_file="file-zESrbwb1mh3p2OjVzhcsKmH4",
    validation_file="file-QC8NTdPW56lhbaDtuxvofGLj",
    model="curie",
    n_epochs=4, # default
    # Request classification metrics on the validation file; the class count
    # is presumably the number of distinct SNLI labels — confirm.
    compute_classification_metrics=True,
    classification_n_classes=3
)

<FineTune fine-tune id=ft-dUn3kTdJRe33NcVQlngm2L1n at 0x7f1fda35c900> JSON: {
  "created_at": 1638409797,
  "events": [
    {
      "created_at": 1638409797,
      "level": "info",
      "message": "Created fine-tune: ft-dUn3kTdJRe33NcVQlngm2L1n",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "classification_n_classes": 3,
    "compute_classification_metrics": true,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.1,
    "use_packing": null
  },
  "id": "ft-dUn3kTdJRe33NcVQlngm2L1n",
  "model": "curie",
  "object": "fine-tune",
  "organization_id": "org-5AE307Eg4rc5EAoEA2S2bwkH",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 98207620,
      "created_at": 1638409664,
      "filename": "snli_finetune_train.jsonl",
      "id": "file-zESrbwb1mh3p2OjVzhcsKmH4",
      "object": "file",
      "purpose": "fine-tune",
      "status": "error",
  

In [23]:
# The full training file has too many tokens for the current limit, so only
# about 10% of the training set can be used. Compute how many rows that is.
int(len(data['train']) * 0.1)

55015

In [26]:
# Draw a reproducible random ~10% subset of the training split.
# - A fixed seed makes the shuffle (and so the uploaded sample) repeatable
#   across kernel restarts; an unseeded shuffle picks different rows each run.
# - The sample size is derived from the split length rather than hard-coded
#   (it evaluates to 55015 for the 550152-row training split).
# - `range` is passed directly; no need to build a list of indices first.
SAMPLE_SEED = 42
sample_size = int(len(data['train']) * 0.1)
sample_data = data['train'].shuffle(seed=SAMPLE_SEED).select(range(sample_size))
sample_data

Loading cached shuffled indices for dataset at /home/x/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-81adece7851b63ef.arrow


Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 55015
})

In [27]:
# Export the sampled training rows to JSONL, then upload that file.
build_jsonl(sample_data, 'snli_finetune_train_sample.jsonl')

# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns, instead of staying open for the life of the kernel.
with open("snli_finetune_train_sample.jsonl") as sample_fh:
    sample_upload = openai.File.create(file=sample_fh, purpose="fine-tune")

# Display the API response; its "id" is what FineTune.create takes as training_file.
sample_upload

<File file id=file-pD1lpBznzyfCVq8U8rOV58RE at 0x7f1fda16e270> JSON: {
  "bytes": 9805852,
  "created_at": 1638410623,
  "filename": "snli_finetune_train_sample.jsonl",
  "id": "file-pD1lpBznzyfCVq8U8rOV58RE",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}

In [28]:
# Try creating a fine-tune with the smaller (sampled) training file.
# The training file id is copied from the sample upload response above; the
# validation file and all hyperparameters are the same as in the first attempt.
openai.FineTune.create(
    training_file="file-pD1lpBznzyfCVq8U8rOV58RE",
    validation_file="file-QC8NTdPW56lhbaDtuxvofGLj",
    model="curie",
    n_epochs=4, # default
    compute_classification_metrics=True,
    classification_n_classes=3
)

<FineTune fine-tune id=ft-liaJqXchRH2a0b7InmAZuvEG at 0x7f1fda0caf90> JSON: {
  "created_at": 1638410655,
  "events": [
    {
      "created_at": 1638410655,
      "level": "info",
      "message": "Created fine-tune: ft-liaJqXchRH2a0b7InmAZuvEG",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "classification_n_classes": 3,
    "compute_classification_metrics": true,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.1,
    "use_packing": null
  },
  "id": "ft-liaJqXchRH2a0b7InmAZuvEG",
  "model": "curie",
  "object": "fine-tune",
  "organization_id": "org-5AE307Eg4rc5EAoEA2S2bwkH",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 9805852,
      "created_at": 1638410623,
      "filename": "snli_finetune_train_sample.jsonl",
      "id": "file-pD1lpBznzyfCVq8U8rOV58RE",
      "object": "file",
      "purpose": "fine-tune",
      "status": "proc

In [29]:
# List the fine-tune jobs to check their status; a failed job's entry
# carries the reason in its training file's "status_details".
openai.FineTune.list()

<OpenAIObject list at 0x7f1fda351450> JSON: {
  "data": [
    {
      "created_at": 1638409797,
      "fine_tuned_model": null,
      "hyperparams": {
        "batch_size": null,
        "classification_n_classes": 3,
        "compute_classification_metrics": true,
        "learning_rate_multiplier": null,
        "n_epochs": 4,
        "prompt_loss_weight": 0.1,
        "use_packing": null
      },
      "id": "ft-dUn3kTdJRe33NcVQlngm2L1n",
      "model": "curie",
      "object": "fine-tune",
      "organization_id": "org-5AE307Eg4rc5EAoEA2S2bwkH",
      "result_files": [],
      "status": "failed",
      "training_files": [
        {
          "bytes": 98207620,
          "created_at": 1638409664,
          "filename": "snli_finetune_train.jsonl",
          "id": "file-zESrbwb1mh3p2OjVzhcsKmH4",
          "object": "file",
          "purpose": "fine-tune",
          "status": "error",
          "status_details": "The file contains 20821169 tokens and exceeds our 3000000 token limit. 

In [36]:
# Fetch the current state of the sampled-data fine-tune job (id taken from
# the create response above), including its events and resolved hyperparameters.
openai.FineTune.retrieve('ft-liaJqXchRH2a0b7InmAZuvEG')

<FineTune fine-tune id=ft-liaJqXchRH2a0b7InmAZuvEG at 0x7f1fda0ceb80> JSON: {
  "created_at": 1638410655,
  "events": [
    {
      "created_at": 1638410655,
      "level": "info",
      "message": "Created fine-tune: ft-liaJqXchRH2a0b7InmAZuvEG",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1638410659,
      "level": "info",
      "message": "Fine-tune enqueued. Queue number: 0",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1638410663,
      "level": "info",
      "message": "Fine-tune started",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": 2,
    "classification_n_classes": 3,
    "compute_classification_metrics": true,
    "learning_rate_multiplier": 0.2,
    "n_epochs": 4,
    "prompt_loss_weight": 0.1,
    "use_packing": true
  },
  "id": "ft-liaJqXchRH2a0b7InmAZuvEG",
  "model": "curie",
  "object": "fine-tune",
  "organization_id": "org-5AE307Eg4rc5EAoEA2S2bwkH",
  "result

In [41]:
# Fetch the event log of fine-tune job ft-liaJqXchRH2a0b7InmAZuvEG to follow
# training progress (epoch completions are reported as events).
openai.FineTune.list_events('ft-liaJqXchRH2a0b7InmAZuvEG')

<OpenAIObject list at 0x7f1fda0ceef0> JSON: {
  "data": [
    {
      "created_at": 1638410655,
      "level": "info",
      "message": "Created fine-tune: ft-liaJqXchRH2a0b7InmAZuvEG",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1638410659,
      "level": "info",
      "message": "Fine-tune enqueued. Queue number: 0",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1638410663,
      "level": "info",
      "message": "Fine-tune started",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1638411178,
      "level": "info",
      "message": "Completed epoch 1/4",
      "object": "fine-tune-event"
    }
  ],
  "object": "list"
}