In [170]:
import pandas as pd
import openai
from mcts_main import grid_search
from sequence_generator import make_possible_terms
import numpy as np
import math

In [171]:
# Load the raw nterms=2 training pairs. Each row is split on the literal "],"
# that closes the prompt list, so the prompt keeps its opening "[" but loses
# its closing "]"; the file has no header row, hence the explicit names.
df = pd.read_csv(
    'data/train/2/2.csv',
    sep='],',
    names=['prompt', 'completion'],
    engine='python',
)

In [172]:
# Preview the first five raw prompt/completion pairs before any cleaning.
df.head(5)

Unnamed: 0,prompt,completion
0,"[2, 3, 1, 0, 0, 0, 0","[False, False, False, False, False, False, Fal..."
1,"[3, 1, 2, 9, 1, 4, 81","[False, False, False, False, False, True, Fals..."
2,"[4, 16, 36, 64, 100, 144, 196","[False, False, True, True, False, False, False..."
3,"[1, 2, 2, 8, 32, 80, 416","[False, False, False, False, True, False, Fals..."
4,"[1, 3, 1, 5, 7, 5, 9","[True, False, False, False, False, False, Fals..."


In [173]:
# Remove the opening "[" that the "]," split leaves on every prompt.
# Stripping by character (instead of slicing off position 0) makes this cell
# safe to re-run: a second execution is a no-op rather than eating the first
# digit of each sequence.
df['prompt'] = df['prompt'].str.lstrip('[')

In [174]:
# Turn "[False, ..., True]" into "False, ..., True <EOS>".
# The brackets are stripped by content and any marker left by a previous run
# is removed first, so re-executing the cell does not chop off real
# characters or stack a second " <EOS>" onto the completion.
df['completion'] = (
    df['completion']
    .str.replace(r' <EOS>$', '', regex=True)
    .str.strip('[]')
    + ' <EOS>'
)

In [175]:
# Write the cleaned pairs as JSON Lines (one {"prompt", "completion"} record
# per line); the fine-tuning data-preparation cell reads this file.
df.to_json(
    "data/train/2/2.jsonl",
    orient='records',
    lines=True,
)

In [116]:
# Run OpenAI's data-preparation tool on the JSONL written above. It analyses
# the prompt/completion pairs and reports formatting suggestions (see output).
# NOTE(review): -q presumably auto-accepts those suggestions and writes the
# *_prepared_*.jsonl files used for fine-tuning - confirm against the CLI help.
!openai tools fine_tunes.prepare_data -f data/train/2/2.jsonl -q

Analyzing...

- Your file contains 800 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more det

In [8]:
# Start a fine-tune of the base `davinci` model using the prepared nterms=3
# training (-t) and validation (-v) files.
# NOTE(review): the saved output below mentions '2_prepared_train.jsonl', so it
# appears to come from an earlier run against the nterms=2 files - confirm
# before relying on it.
!openai api fine_tunes.create -t "data/train/3/3_prepared_train.jsonl" -v "data/train/3/3_prepared_valid.jsonl" -m davinci

Found potentially duplicated files with name '2_prepared_train.jsonl', purpose 'fine-tune' and size 89749 bytes
file-D8NwYn4Q83Md5tcTV7ypoUrL
file-QvBbTiiQnKfYKgiQWblzvwib
Enter file ID to reuse an already uploaded file, or an empty string to upload this file anyway: ^C



In [77]:
# nterms=2, davinci, 4 epochs, davinci:ft-personal-2022-11-29-06-40-46
# nterms=2, curie, 4 epochs, curie:ft-personal-2022-11-30-08-22-34
# nterms=2, babbage, 4 epochs, babbage:ft-personal-2022-11-30-08-32-24
# nterms=2, ada, 4 epochs, ada:ft-personal-2022-11-30-08-42-32

# nterms=3, davinci, 4 epochs, davinci:ft-personal-2022-11-30-08-52-38
# nterms=3, curie, 4 epochs, curie:ft-personal-2022-11-30-09-15-04
# nterms=3, babbage, 4 epochs, babbage:ft-personal-2022-11-30-09-02-00
# nterms=3, ada, 4 epochs, ada:ft-personal-2022-11-30-09-12-47

In [168]:
def evaluate_gpt3(ft_model=None, nterms=2, eval_train=False):
    """Evaluate a fine-tuned completion model on the prepared test prompts.

    Reads the prompts from ``data/test/{nterms}/{nterms}_prepared.jsonl``,
    requests greedy (temperature 0) completions in batches of 20, turns each
    completion into a boolean mask over ``make_possible_terms()``, and scores
    the selected terms against the prompt's sequence with ``grid_search``.
    Prints the mean RMSE and the fraction of prompts whose RMSE is exactly 0.

    Args:
        ft_model: Identifier of the fine-tuned model to query. Required.
        nterms: Selects which test file is loaded (used in the file path).
        eval_train: Currently unused; kept so existing callers keep working.

    Raises:
        ValueError: If ``ft_model`` is falsy, or if the test file contains no
            prompts (previously this surfaced as a ZeroDivisionError after a
            NaN mean had been printed).
    """
    if not ft_model:
        raise ValueError('Must provide a fine-tuned model')
    test_dir = 'data/test/'
    test_file = test_dir + f'{nterms}/{nterms}_prepared.jsonl'
    test_data = pd.read_json(test_file, orient='records', lines=True)['prompt'].tolist()
    n = len(test_data)
    if n == 0:
        raise ValueError(f'No test prompts found in {test_file}')
    terms = np.array(make_possible_terms())
    batch_size = 20  # prompts sent per Completion request
    rmses = []
    correct_cnt = 0
    for start in range(0, n, batch_size):
        # Slicing past the end is safe, so the final short batch needs no
        # special case.
        batch = test_data[start:start + batch_size]
        choices = openai.Completion.create(
            model=ft_model,
            prompt=batch,
            stop=[' <EOS>'],
            max_tokens=30,
            temperature=0,
        )['choices']
        for choice in choices:
            # Pair each completion with its prompt through the choice's own
            # `index` field; the order of `choices` is not guaranteed to
            # follow the order of the prompts in a batched request.
            prompt = batch[choice['index']]
            # The completion is a ", "-separated list of True/False flags,
            # one per candidate term.
            pred_mask = np.array(choice['text'].strip().split(', ')) == 'True'
            pred_terms = terms[pred_mask]
            # Drop the last three characters of the prompt before parsing the
            # integers - presumably the separator appended during data
            # preparation; TODO confirm against the prepared file.
            seq_list = np.array(prompt[:-3].split(', ')).astype(int)
            rmse = grid_search(seq_list, pred_terms)
            rmses.append(rmse)
            if rmse == 0:
                correct_cnt += 1
    print('Mean RMSE on test data:', np.mean(rmses))
    # Note: the printed value is a fraction in [0, 1], not a percentage.
    print('Percentage of examples solved perfectly:', correct_cnt/n)

In [169]:
# Score a fine-tuned davinci model on the nterms=3 test set.
# NOTE(review): this model id is the one recorded for nterms=2 in the model
# list; the nterms=3 davinci model there is
# davinci:ft-personal-2022-11-30-08-52-38. Confirm the cross-evaluation is
# intentional.
evaluate_gpt3(
    ft_model='davinci:ft-personal-2022-11-29-06-40-46',
    nterms=3,
)

Mean RMSE on test data: 90.38050943280014
Percentage of examples solved perfectly: 0.075
