In [None]:
import json
import openai
import datetime
import obsidiantools.api as otools
from pathlib import Path
import os

In [None]:
# The vault root is taken to be three directory levels above the current
# working directory; obsidiantools then connects to it and gathers note text.
wkd = Path(os.getcwd()).parent.parent.parent
vault = otools.Vault(wkd).connect(show_nested_tags=True).gather()

# Keep only the values of the readable-text index (presumably one text per
# note -- confirm against the obsidiantools docs); the keys are not needed.
corpus = list(vault.readable_text_index.values())



In [None]:
# Output file is stamped with today's date (DD_MM_YYYY).
out_file_name = f"./fine_tune_{datetime.date.today().strftime('%d_%m_%Y')}.jsonl"

# Open the file once, in "w" mode, so that re-running this cell rebuilds the
# file instead of appending a second copy of every record to it.
with open(out_file_name, "w") as outfile:
    for note_text in corpus:
        # One JSON object per line: empty prompt, note text as completion.
        # The trailing " END" marker matches the "END" stop sequence that the
        # sampling cell passes to the completion endpoint.
        record = {
            "prompt": "",
            "completion": f" {note_text} END",
        }
        outfile.write(json.dumps(record) + "\n")

In [None]:
# Preview the first three lines of the generated file, pretty-printed by jq.
!head -n3 $out_file_name | jq .

In [None]:
# Only prints (does not run) the data-preparation command for the generated
# file. A later cell reads a "*_prepared.jsonl" file, presumably the output of
# running this command manually -- confirm.
!echo "execute: openai tools fine_tunes.prepare_data -f $(pwd)/$out_file_name"

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Fixed seed so that Restart & Run All reproduces the same train/test split.
SPLIT_SEED = 42

# Common path prefix shared by the prepared file and both split files.
prepared_base = out_file_name.replace('.jsonl', '')

# Read the prepared records with a context manager so the handle is closed.
with open(f"{prepared_base}_prepared.jsonl") as prepared_file:
    prepared_lines = prepared_file.read().splitlines()

# Hold out 10% of the lines as the test/validation split.
train, test = train_test_split(
    prepared_lines, test_size=0.1, random_state=SPLIT_SEED)

with open(f"{prepared_base}_prepared_train.jsonl", "w") as outfile:
    outfile.write("\n".join(train))
with open(f"{prepared_base}_prepared_test.jsonl", "w") as outfile:
    outfile.write("\n".join(test))

In [None]:
# Preview the first three lines of the training split, pretty-printed by jq.
!head -n3 {out_file_name.replace('.jsonl', '')}_prepared_train.jsonl | jq .

In [None]:
from openai.wandb_logger import WandbLogger
import wandb
import re


def _env_value(env_text, key):
    """Return the double-quoted value assigned to ``key`` in ``.env``-style text.

    Args:
        env_text: Full contents of the ``.env`` file.
        key: Variable name to look up; an assignment of the form
            ``KEY="value"`` is expected.

    Returns:
        The text between the quotes of the first matching assignment.

    Raises:
        KeyError: If no ``KEY="..."`` assignment is present.
    """
    matches = re.findall(re.escape(key) + r'="(.*)"', env_text)
    if not matches:
        raise KeyError(f'{key}="..." not found in .env')
    return matches[0]


# Read the secrets file with a context manager so the handle is closed.
with open(".env", "r") as env_file:
    values = env_file.read()

wandb_key = _env_value(values, "WANDB_KEY")
wandb.login(key=wandb_key, relogin=True)
openai.api_key = _env_value(values, "OPENAI_API_KEY")
openai.organization = _env_value(values, "OPENAI_ORGANIZATION")

In [None]:
# Paths of the train/test splits written by the splitting cell.
train_path = f"{out_file_name.replace('.jsonl', '')}_prepared_train.jsonl"
valid_path = f"{out_file_name.replace('.jsonl', '')}_prepared_test.jsonl"

# Upload both splits. The context managers close the local handles once each
# upload call has returned (assumes File.create is a blocking call in the
# installed openai client -- confirm).
with open(train_path, "rb") as train_handle:
    train_file = openai.File.create(
        file=train_handle,
        purpose="fine-tune"
    )
with open(valid_path, "rb") as valid_handle:
    valid_file = openai.File.create(
        file=valid_handle,
        purpose="fine-tune"
    )

# Start a fine-tune of the "curie" base model on the uploaded files.
ft = openai.FineTune.create(
    training_file=train_file["id"],
    validation_file=valid_file["id"],
    model="curie",
)

In [None]:
# Sync the fine-tune job created above to the "obsidian-openai" W&B project.
fine_tune_id = ft["id"]
WandbLogger.sync(id=fine_tune_id, project="obsidian-openai", tags=["generation"])

In [None]:
import re
import openai
import datetime
values = open(".env", "r").read()
openai.api_key = re.findall(r"OPENAI_API_KEY=\"(.*)\"", values)[0]
openai.organization = re.findall(r"OPENAI_ORGANIZATION=\"(.*)\"", values)[0]
model = "curie:ft-personal-2022-09-17-07-49-44"
outputs = []
async def comp():
    response = openai.Completion.create(
      model=model,
      prompt="",
      temperature=0.7,
      max_tokens=256,
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0,
      stop=["END"]
    )
    outputs.append(response["choices"][0]["text"])
import asyncio
# generate 100 samples into a file
await asyncio.gather(*[comp() for _ in range(10)])
  
with open(f"./{model}_{datetime.date.today().strftime('%d_%m_%Y')}.jsonl", "w") as outfile:
  outfile.write(("\n"+("-"*5)+"\n").join(outputs))