In [None]:
import os
from langchain.llms import OpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.indexes.vectorstore import VectorstoreIndexCreator, VectorStoreIndexWrapper
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
import openai
import json
import time


# Take the OpenAI key from the environment rather than storing it in the
# notebook; when OPENAI_API_KEY is unset this falls back to the empty string
# that was assigned here before.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
# Root directory under which every persisted vector index is stored.
local_persist_path = './vector_store'

Experimental setup

In [None]:
# --- Input document and naming ----------------------------------------------
# PDF that the experiments in this notebook ask questions about.
main_file = '/root/LLM/LangChain/E-1-RM22-12-000.pdf'
# Name prefixes for the main-only setup and for the setup with assisting files.
file_head = 'FERC'
assist_file_head = 'FERC_assist'

# --- Experiment switches ------------------------------------------------------
# Setups tried so far: (1) no switch on, (2) prompt engineering only,
# (3) prompt engineering plus assisting files.
use_PE = True     # wrap every question in the engineered prompt
assist = False    # index the assisting PDFs together with the main file
fewshot = False   # add the few-shot text (fs_prompt) to the engineered prompt

RAG

In [None]:
def get_index_path(index_name):
    """Return the directory of the index called ``index_name`` under ``local_persist_path``."""
    index_dir = os.path.join(local_persist_path, index_name)
    return index_dir

# save separately
def load_pdf_and_save_to_index(file_path, index_name):
    """Index a single PDF and persist the resulting vector store.

    Args:
        file_path: Path of the PDF to load.
        index_name: Name of the index; ``get_index_path`` turns it into the
            directory the vector store is persisted to.
    """
    loader = PyPDFLoader(file_path)
    index_creator = VectorstoreIndexCreator(
        vectorstore_kwargs={'persist_directory': get_index_path(index_name)}
    )
    # Pass a list, not a set literal: from_loaders is declared to take a list
    # of loaders, and the multi-file variant already calls it that way.
    index = index_creator.from_loaders([loader])
    index.vectorstore.persist()

# save together
def load_multiple_pdf_and_save_to_index(file_paths, index_name):
    """Index several PDFs into one shared vector store and persist it.

    Args:
        file_paths: Iterable of PDF paths; one ``PyPDFLoader`` is created per path.
        index_name: Name of the index; ``get_index_path`` turns it into the
            directory the vector store is persisted to.
    """
    loaders = []
    for pdf_path in file_paths:
        loaders.append(PyPDFLoader(pdf_path))
    print(loaders)

    persist_dir = get_index_path(index_name)
    index_creator = VectorstoreIndexCreator(vectorstore_kwargs={'persist_directory': persist_dir})
    combined_index = index_creator.from_loaders(loaders)
    combined_index.vectorstore.persist()

def load_index(index_name):
    """Open the persisted Chroma store named ``index_name`` and wrap it for querying.

    Returns:
        A ``VectorStoreIndexWrapper`` around the Chroma store found at
        ``get_index_path(index_name)``, using ``OpenAIEmbeddings`` as the
        embedding function.
    """
    store = Chroma(
        persist_directory=get_index_path(index_name),
        embedding_function=OpenAIEmbeddings(),
    )
    return VectorStoreIndexWrapper(vectorstore=store)

In [None]:
# Directory holding the assisting PDFs and the file names found in it.
root = '/root/LLM/LangChain/assist_files/'
names = os.listdir(root)
# Assisting PDFs that are indexed together with the main file when `assist` is on.
assist_files = [
    '/root/LLM/LangChain/assist_files/Chapter-10---Distribution-System_2017_Electrical-Power-Systems.pdf',
    '/root/LLM/LangChain/assist_files/Chapter-24---Renewable-Energy-Sources_2017_Electrical-Power-Systems.pdf',
    '/root/LLM/LangChain/assist_files/Chapter-20---Economic-Operation-of-Power-System_2017_Electrical-Power-System.pdf'
]

# Choose what to ingest and the index it goes into from the experiment
# switches, using the configured name prefixes instead of a repeated literal.
if assist:
    index_name = assist_file_head
    file_paths = [main_file] + assist_files
else:
    index_name = file_head
    file_path = main_file

# Ingesting into an existing persist directory re-embeds every page and appends
# the same chunks to the store again, so build the index only when it is
# missing or empty. Delete the directory to force a rebuild.
index_dir = get_index_path(index_name)
if os.path.isdir(index_dir) and os.listdir(index_dir):
    print('Index {!r} already exists at {}; skipping ingestion.'.format(index_name, index_dir))
elif assist:
    load_multiple_pdf_and_save_to_index(file_paths, index_name)
else:
    load_pdf_and_save_to_index(file_path, index_name)

Assistance (only 1–3 assisting files are supported)

It seems that adding too many assisting files degrades performance.

In [None]:
# Load the persisted index that matches the current setup. The index name is
# taken from the configuration variables (assist_file_head / file_head), so it
# follows the name the ingestion step uses for the assisted index instead of
# relying on a separate literal.
index = load_index(assist_file_head if assist else file_head)

Prompt engineering

In [None]:
# Persona that opens the engineered prompt.
style = 'experienced'
role = 'policy maker'
field = 'power system'
skill = 'summarize long text'
character_prompt = f'You are a {style} {role} in {field}, who is good at {skill}. '

# Fixed fragments placed before the question.
augmentation_prompt = 'Let us think step by step. '
question_prompt = 'Now, pay attention! My question is: '

# Fragments appended after the question.
include = 'as many technical details as possible. '
include_prompt = f'In your answer, you should include {include}'

# Output-format instruction; currently disabled. Alternative that was tried:
# format_prompt = 'Your answer should be like this format: Answer: ...'
format_prompt = ''

request_prompt = 'Please do not copy! If applicable, you should not include too much original content in the file, unless the original content is strongly needed.'

Few-shot (not available yet)

In [None]:
# Few-shot examples to splice into the prompt; left empty for now.
fs_prompt = ""

QA experiments

In [None]:
# Questions about the document as a whole.
general_question_list = [
    'Please summarize this file in 200 words.',
    'What is the structure of this file?',
    'Please give me several key words strongly related to this file.',
    'Please give me several key words strongly related to the content in this file. Note that, do not select words without meaning.',
]

# Questions about one specific topic of the document.
technical_question_list = [
    'What is Synchronization?',
    'What is the Synchronization in this file?',
    'What is the Synchronization in this file? Please explain it in 200 words.',
    'Please summarize the main content of Synchronization (200) part in this file.',
    'Please give an example in real-world application of the synchronization mentioned in this file.',
    'Please tell me, in Reliability Standards in this file, which standard is the most useful one?',
]

In [None]:
# RAG experiment, general questions: build each prompt from the experiment
# switches, query the index, and dump every question/answer pair to one JSON file.

# The switches do not change inside the loop, so derive the output name once.
# Tags: PE = prompt engineering, FS = few-shot (only together with PE),
# AS = assisting files, e.g. "output_files/<head>_PE_AS_general_output.json".
name_parts = [assist_file_head]
if use_PE:
    name_parts.append('PE')
    if fewshot:
        name_parts.append('FS')
if assist:
    name_parts.append('AS')
name_parts.append('general_output.json')
file_name = "output_files/{}".format('_'.join(name_parts))

# Create the output folder before any query is sent, so a missing directory
# cannot discard the answers after all the slow queries have already run.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

data = {}
for i, q in enumerate(general_question_list, start=1):
    if use_PE:
        # The few-shot text is only spliced in when `fewshot` is on; this also
        # gives the PE + few-shot setup a prompt, which it previously lacked.
        few_shot_part = fs_prompt if fewshot else ''
        prompt = character_prompt + augmentation_prompt + few_shot_part + question_prompt + q + include_prompt + format_prompt + request_prompt
    else:
        prompt = q

    result = index.query_with_sources(prompt, chain_type='map_reduce')
    data.update({"question{}".format(i): result['question'], "answer{}".format(i): result['answer']})
    # Pause between queries (60 s), as before.
    time.sleep(60)

with open(file_name, "w") as outfile:
    json.dump(data, outfile, indent=2)

In [None]:
# RAG experiment, technical questions: build each prompt from the experiment
# switches, query the index, and dump every question/answer pair to one JSON file.

# The switches do not change inside the loop, so derive the output name once.
# Tags: PE = prompt engineering, FS = few-shot (only together with PE),
# AS = assisting files, e.g. "output_files/<head>_PE_AS_technical_output.json".
name_parts = [assist_file_head]
if use_PE:
    name_parts.append('PE')
    if fewshot:
        name_parts.append('FS')
if assist:
    name_parts.append('AS')
name_parts.append('technical_output.json')
file_name = "output_files/{}".format('_'.join(name_parts))

# Create the output folder before any query is sent, so a missing directory
# cannot discard the answers after all the slow queries have already run.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

data = {}
for i, q in enumerate(technical_question_list, start=1):
    if use_PE:
        # The few-shot text is only spliced in when `fewshot` is on.
        few_shot_part = fs_prompt if fewshot else ''
        prompt = character_prompt + augmentation_prompt + few_shot_part + question_prompt + q + include_prompt + format_prompt + request_prompt
    else:
        prompt = q

    result = index.query_with_sources(prompt, chain_type='map_reduce')
    data.update({"question{}".format(i): result['question'], "answer{}".format(i): result['answer']})
    # Pause between queries (60 s), as before.
    time.sleep(60)

with open(file_name, "w") as outfile:
    json.dump(data, outfile, indent=2)

Ask the LLM directly

In [None]:
# Transformation chain
from langchain.chains import TransformChain, LLMChain, SimpleSequentialChain
from langchain.llms import OpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate, LLMChain

# Load the main PDF once; the path comes from the experiment configuration so
# this cell follows the same document as the indexing cells.
loader = PyPDFLoader(main_file)
pages = loader.load()

# Splitter producing chunks of at most 2000 characters with 100 characters of
# overlap, measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap = 100,
    length_function = len,
)

# Split the pages that are already in memory; calling loader.load_and_split()
# here would parse the PDF a second time before doing this same split.
pages_split = text_splitter.split_documents(pages)

def transform_func(inputs: dict) -> dict:
    """Keep only the first three blank-line-separated paragraphs of ``inputs["text"]``.

    Args:
        inputs: Mapping with the full text under the key ``"text"``.

    Returns:
        A dict holding the shortened text under ``"output_text"``.
    """
    leading_paragraphs = inputs["text"].split("\n\n")[:3]
    return {"output_text": "\n\n".join(leading_paragraphs)}

# Step 1 of the chain: shorten the incoming text with transform_func.
transform_chain = TransformChain(
    input_variables=["text"], output_variables=["output_text"], transform=transform_func
)

# Concatenate the first chunks of the document into the text handed to the
# chain. Slicing, rather than indexing 0..19, keeps this working when the
# document splits into fewer than 20 chunks.
max_chunks = 20
paragraph = ''.join(chunk.page_content for chunk in pages_split[:max_chunks])
time.sleep(30)

In [None]:
# Direct-ask experiment, general questions: the text in `paragraph` is passed
# through the transform chain and an LLM chain, without querying the index.

# NOTE(review): this name carries no tag for the prompt setup and is the same
# name the retrieval run writes when use_PE and assist are both off; confirm
# that overwriting is intended.
file_name = "output_files/{}_general_output.json".format(assist_file_head)

# Create the output folder before any request is sent, so a missing directory
# cannot discard the answers after all the slow requests have already run.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

# One LLM client is enough for every question.
direct_llm = OpenAI()

data = {}
for i, q in enumerate(general_question_list):
    # Step 2: prompt template whose only placeholder is the shortened text.
    if use_PE:
        template = character_prompt + augmentation_prompt + fs_prompt + question_prompt + q + "{output_text}." + include_prompt + format_prompt + request_prompt
    else:
        template = q + "{output_text}."
    prompt = PromptTemplate(input_variables=["output_text"], template=template)
    llm_chain = LLMChain(llm=direct_llm, prompt=prompt)

    # Step 3: run transform -> LLM as one sequential chain.
    sequential_chain = SimpleSequentialChain(chains=[transform_chain, llm_chain])
    result = sequential_chain.run(paragraph)
    # The stored "question" is the full template, not just the bare question.
    data.update({"question{}".format(i): template, "answer{}".format(i): result})
    time.sleep(60)

with open(file_name, "w") as outfile:
    json.dump(data, outfile, indent=2)

In [None]:
# Direct-ask experiment, technical questions: the text in `paragraph` is passed
# through the transform chain and an LLM chain, without querying the index.

# NOTE(review): this name carries no tag for the prompt setup and is the same
# name the retrieval run writes when use_PE and assist are both off; confirm
# that overwriting is intended.
file_name = "output_files/{}_technical_output.json".format(assist_file_head)

# Create the output folder before any request is sent, so a missing directory
# cannot discard the answers after all the slow requests have already run.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

# One LLM client is enough for every question.
direct_llm = OpenAI()

data = {}
for i, q in enumerate(technical_question_list):
    # Step 2: prompt template whose only placeholder is the shortened text.
    if use_PE:
        template = character_prompt + augmentation_prompt + fs_prompt + question_prompt + q + "{output_text}." + include_prompt + format_prompt + request_prompt
    else:
        template = q + "{output_text}."
    prompt = PromptTemplate(input_variables=["output_text"], template=template)
    llm_chain = LLMChain(llm=direct_llm, prompt=prompt)

    # Step 3: run transform -> LLM as one sequential chain.
    sequential_chain = SimpleSequentialChain(chains=[transform_chain, llm_chain])
    result = sequential_chain.run(paragraph)
    # The stored "question" is the full template, not just the bare question.
    data.update({"question{}".format(i): template, "answer{}".format(i): result})
    time.sleep(60)

with open(file_name, "w") as outfile:
    json.dump(data, outfile, indent=2)