In [105]:
import os
import openai
from llama_index import ServiceContext
from llama_index.llms import OpenAI

# Set up env.
1. Set up the OpenAI API key.
2. Move your PDF document into the `./data` directory and set `doc` to its path.

In [106]:
# Paste your OpenAI API key between the quotes, or leave it empty to reuse a key
# that is already exported as OPENAI_API_KEY in your shell.
# Do not commit or share a notebook that contains a real key.
api_key = ""  # key
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key
# Reading with .get() means the empty placeholder above no longer overwrites a
# key that was already present in the environment.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
doc = ""  # path to the PDF document (the instructions above place it under ./data)

In [107]:
from pathlib import Path
from llama_hub.file.pdf.base import PDFReader
from llama_hub.file.unstructured.base import UnstructuredReader

In [128]:
# Read the PDF located at `doc` with llama_hub's PDFReader; the result is kept in
# `docs0` and iterated over in the next cell.
loader = PDFReader()
pdf_path = Path(doc)
docs0 = loader.load_data(file=pdf_path)

In [129]:
from llama_index import Document

# Merge the content of every loaded document into one string (separated by blank
# lines) and wrap it in a single Document tagged with the book title.
loaded_texts = [loaded.get_content() for loaded in docs0]
doc_text = "\n\n".join(loaded_texts)
title = "Book Title"
metadata = dict(book_title=title)  # metadata is passed along to the LLM at prompt time
docs = [Document(text=doc_text, metadata=metadata)]

To make sure the Python code can read the PDF, run the cell below. You should see the document's content printed as text.

In [None]:
# Sanity check: show only the beginning of the extracted text. Printing the whole
# document would flood the cell output and bloat the saved notebook.
preview_chars = 2000  # raise this if you need to inspect more of the text
print(docs[0].get_content()[:preview_chars])

Feel free to use different models and to adjust the temperature.

In [131]:
from llama_index.callbacks import CallbackManager

# A single callback manager (created with an empty handler list) is shared by
# both service contexts below.
callback_manager = CallbackManager([])


def _service_context_for(model_name):
    """Return a ServiceContext backed by the given OpenAI model at temperature 0.3."""
    llm = OpenAI(model=model_name, temperature=0.3)
    return ServiceContext.from_defaults(llm=llm, callback_manager=callback_manager)


gpt_35_context = _service_context_for("gpt-3.5-turbo-0613")
gpt_4_context = _service_context_for("gpt-4-0613")

# Generating Dataset
### Chunking PDF data
This step splits your PDF into a list of text chunks (nodes). The total number of inference calls will be roughly the number of nodes (which depends on the length of the document) × the number of questions per chunk.

### Synthetic data
After the original text is chunked, we create a set of query/response pairs based on the chunk texts.
Use either GPT-3.5 or GPT-4.

In [132]:
from llama_index.evaluation import DatasetGenerator
from llama_index.node_parser import SimpleNodeParser

In [None]:
# Split the document(s) into nodes with the parser's default settings, then
# print the node list followed by how many nodes were produced.
node_parser = SimpleNodeParser.from_defaults()
nodes = node_parser.get_nodes_from_documents(docs)
print(nodes, len(nodes), sep="\n")

In [None]:
from llama_index import SummaryIndex
import json

# Number of questions requested from the LLM for each chunk; the same value is
# used below as the cap on how many generated questions are kept per chunk.
num_questions_per_chunk = 6
question_gen_query = (
    "You are an Expert Conversation Creator. Your task is to setup "
    f"an examination and queries on the topic of {title}. Using the provided context, "
    f"formulate {num_questions_per_chunk} queries that captures an important fact from the "
    "context. \n"
    "You MUST obey the following criteria:\n"
    "- Restrict the question to the context information provided.\n"
    "- Query must be something a novice who is new and uninitiated to the topic will ask.\n"
    "- Do NOT create a question that cannot be answered from the context.\n"
    # The trailing "\n" was missing here, which glued this bullet to the next one.
    "- Phrase the question in a way that is easy for someone to also have asked, ie. searchable on Google\n"
    "- Phrase the question so that it does NOT refer to specific context. "
    'For instance, do NOT put phrases like "given provided context" or "in this work" in the question, '
    "because if the question is asked elsewhere it wouldn't be provided specific context. Replace these terms "
    "with specific details.\n"
    "BAD questions:\n"
    "What did the author do in his childhood\n"
    "What were the main findings in this report\n\n"
    "GOOD questions:\n"
    "What did Barack Obama do in his childhood\n"
    "What were the main findings in the original Transformers paper by Vaswani et al.\n\n"
    "Generate the questions below:\n"
)

# Make sure the output directory exists before opening the file for writing.
os.makedirs("data", exist_ok=True)

# For every node: generate questions, answer each one against that same node, and
# append one {"query", "response"} JSON object per line to data/qa_pairs.jsonl.
# The `with` block closes (and flushes) the file even if an API call raises mid-run.
with open("data/qa_pairs.jsonl", "w") as fp:
    for idx, node in enumerate(nodes):
        dataset_generator = DatasetGenerator(
            [node],
            question_gen_query=question_gen_query,
            service_context=gpt_35_context,
            metadata_mode="all",
        )
        # Keep at most as many questions as the prompt asks for
        # (this used to be a hard-coded 10 that disagreed with the prompt).
        node_questions_0 = dataset_generator.generate_questions_from_nodes(
            num=num_questions_per_chunk
        )
        print(f"[Node {idx}] Generated questions:\n {node_questions_0}")

        # The index and query engine depend only on the node, so build them once
        # per node instead of once per question.
        index = SummaryIndex([node], service_context=gpt_35_context)
        query_engine = index.as_query_engine()

        # for each question, get a response
        for question in node_questions_0:
            response = query_engine.query(question)
            out_dict = {"query": question, "response": str(response)}
            print(f"[Node {idx}] Outputs: {out_dict}")
            fp.write(json.dumps(out_dict) + "\n")

# Convert Synthetic Data to OpenAI Messages
OpenAI fine-tuning requires the data to follow the OpenAI chat message format:
[system_message, user_input, assistant_output]

In [139]:
# Convert each {"query", "response"} line of data/qa_pairs.jsonl into one
# {"messages": [system, user, assistant]} line of data/qa_pairs_openai.jsonl.
qa_pairs_path = "data/qa_pairs.jsonl"
openai_jsonl_path = "data/qa_pairs_openai.jsonl"

# Every training example shares the same system message, so build it once.
system_prompt = {
    "role": "system",
    "content": f"You are a helpful assistant to help me answer questions on the topic of {title}",
}

# Both files are closed when the block exits, so every line is flushed to disk
# before the fine-tuning step reads the output file. Previously neither handle
# was closed, which could leave the output truncated.
with open(qa_pairs_path, "r") as fp, open(openai_jsonl_path, "w") as out_fp:
    for line in fp:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        qa_pair = json.loads(line)
        user_prompt = {"role": "user", "content": qa_pair["query"]}
        assistant_prompt = {"role": "assistant", "content": qa_pair["response"]}
        out_dict = {
            "messages": [system_prompt, user_prompt, assistant_prompt],
        }
        out_fp.write(json.dumps(out_dict) + "\n")

In [140]:
from llama_index.finetuning import OpenAIFinetuneEngine


# Fine-tuning a New vs. an Existing Model
The parameter below uses GPT-3.5 Turbo, which is currently supported for fine-tuning. Once you have a fine-tuned model ID from the following steps, you can replace this parameter with that model ID to train the model further on new datasets.

In [141]:
# OpenAI currently supports fine-tuning only for 3.5 Turbo. Once the following steps
# give you a fine-tuned model ID, use it as the base model here to train further.
base_model = "gpt-3.5-turbo"
training_data_path = "data/qa_pairs_openai.jsonl"
finetune_engine = OpenAIFinetuneEngine(base_model, training_data_path)


In [None]:
# Start the fine-tuning run with the base model and training file given to `finetune_engine`.
# NOTE(review): presumably this uploads the JSONL file and creates an OpenAI
# fine-tuning job — confirm against the llama_index documentation.
finetune_engine.finetune()

In [None]:
# Show the engine's current job as the cell output.
# NOTE(review): presumably re-running this cell refreshes the job status — confirm.
finetune_engine.get_current_job()


# Post Fine-tune
Once the job is done, you should receive an email update, and you should then be able to get the corresponding model ID.

In [144]:
# Fetch the fine-tuned model from the engine and keep it in `ft_model`.
# NOTE(review): presumably this only succeeds after the fine-tuning job has finished — confirm.
ft_model = finetune_engine.get_finetuned_model()

In [None]:
# Display the fine-tuned model object as the cell output.
ft_model