In [1]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS
import tiktoken
# Get your API keys from openai, you will need to create an account. 
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Access the OPENAI_API_KEY environment variable
openai_api_key = os.environ.get("OPENAI_API_KEY")
openai_api_type = os.environ.get("OPENAI_API_TYPE")
openai_api_base = os.environ.get("OPENAI_API_BASE")
openai_api_version = os.environ.get("OPENAI_API_VERSION")

print(f"Open AI key {openai_api_key}")
print(f"Open AI type {openai_api_type}")
print(f"Open AI base {openai_api_base}")
print(f"Open AI version {openai_api_version}")

Open AI key d38039a975024e7294c342b208e8051c
Open AI type azure
Open AI base https://caztonaieast.openai.azure.com/
Open AI version 2023-03-15-preview


In [3]:
# location of the pdf file/files. 

def get_pdf_text(pdf_docs):
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

raw_text=get_pdf_text(['docs/RestrictAct.pdf', 'docs/Orca.pdf'])
# reader = PdfReader('docs/RestrictAct.pdf')

# reader

# # read data from the file and put them into a variable called raw_text
# raw_text = ''
# for i, page in enumerate(reader.pages):
#     text = page.extract_text()
#     if text:
#         raw_text += text

# # raw_text

raw_text



'II \n118 THCONGRESS \n1STSESSION  S. 686 \nTo authorize the Secretary of Commerce to review and prohibit certain trans-\nactions between persons in the United States and foreign adversaries, and for other purposes. \nIN THE SENATE OF THE UNITED STATES \nMARCH 7, 2023 \nMr. W ARNER (for himself, Mr. T HUNE , Ms. B ALDWIN , Mrs. F ISCHER , Mr. \nMANCHIN , Mr. M ORAN , Mr. B ENNET , Mr. S ULLIVAN , Mrs. G ILLIBRAND , \nMs. C OLLINS , Mr. H EINRICH , Mr. R OMNEY , and Mrs. C APITO ) intro-\nduced the following bill; which was read twice and referred to the Com-mittee on Commerce, Science, and Transportation \nA BILL \nTo authorize the Secretary of Commerce to review and pro-\nhibit certain transactions between persons in the United States and foreign adversaries, and for other purposes. \nBe it enacted by the Senate and House of Representa- 1\ntives of the United States of America in Congress assembled, 2\nSECTION 1. SHORT TITLE. 3\nThis Act may be cited as the ‘‘Restricting the Emer- 4\n

In [4]:
# We need to split the text that we read into smaller chunks so that during information retreival we don't hit the token size limits. 

text_splitter = CharacterTextSplitter(        
    separator = "\n",
    chunk_size = 1000,
    chunk_overlap  = 200,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

len(texts)

texts[0]



'II \n118 THCONGRESS \n1STSESSION  S. 686 \nTo authorize the Secretary of Commerce to review and prohibit certain trans-\nactions between persons in the United States and foreign adversaries, and for other purposes. \nIN THE SENATE OF THE UNITED STATES \nMARCH 7, 2023 \nMr. W ARNER (for himself, Mr. T HUNE , Ms. B ALDWIN , Mrs. F ISCHER , Mr. \nMANCHIN , Mr. M ORAN , Mr. B ENNET , Mr. S ULLIVAN , Mrs. G ILLIBRAND , \nMs. C OLLINS , Mr. H EINRICH , Mr. R OMNEY , and Mrs. C APITO ) intro-\nduced the following bill; which was read twice and referred to the Com-mittee on Commerce, Science, and Transportation \nA BILL \nTo authorize the Secretary of Commerce to review and pro-\nhibit certain transactions between persons in the United States and foreign adversaries, and for other purposes. \nBe it enacted by the Senate and House of Representa- 1\ntives of the United States of America in Congress assembled, 2\nSECTION 1. SHORT TITLE. 3\nThis Act may be cited as the ‘‘Restricting the Emer- 4'

In [5]:
texts[-1]

'on Learning Representations , 2023.\n[35]Reiichiro Nakano, Jacob Hilton, Suchir Balaji, Jeff Wu, Long Ouyang, Christina Kim, Christo-\npher Hesse, Shantanu Jain, Vineet Kosaraju, William Saunders, et al. Webgpt: Browser-assisted\nquestion-answering with human feedback. arXiv preprint arXiv:2112.09332 , 2021.\n[36]Auto-gpt: An autonomous gpt-4 experiment. https://github.com/Significant-Gravitas/\nAuto-GPT , 2023. [Online; accessed 13-May-2023].\n[37]Prometheus: Building the new bing. https://blogs.bing.com/search-quality-insights/\nfebruary-2023/Building-the-New-Bing , 2023. [Online; accessed 4-June-2023].\n[38]Binfeng Xu, Zhiyuan Peng, Bowen Lei, Subhabrata Mukherjee, Yuchen Liu, and Dongkuan Xu.\nRewoo: Decoupling reasoning from observations for efficient augmented language models, 2023.\n51'

In [6]:
# Download embeddings from OpenAI
embeddings_model = "CaztonEmbedAda2"
tokenizer = tiktoken.get_encoding("cl100k_base")

# add embeddings model to it and then create a vector store
embeddings = OpenAIEmbeddings(
    deployment = embeddings_model,
    chunk_size = 1)


docsearch = FAISS.from_texts(texts, embeddings)

docsearch


<langchain.vectorstores.faiss.FAISS at 0x7fe92903f0a0>

In [7]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

gpt3_model = "CaztonDavinci3"

chain = load_qa_chain(OpenAI(engine=gpt3_model, temperature=0), chain_type="stuff")

query = "who are the authors of the article?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)




                    engine was transferred to model_kwargs.
                    Please confirm that engine is what you intended.


' Subhabrata Mukherjee, Arindam Mitra, Ganesh Jawahar, Sahaj Agarwal, Hamid Palangi, Ahmed Awadallah.'

In [8]:
# Start an infinite loop to continuously ask questions
while True:
    # Prompt the user to enter a question
    query = input(" Question (or type 'exit' to quit): ")
    
    # Check if the user wants to exit the loop
    if query.lower() == 'exit':
        break

    # Perform similarity search using the query
    docs = docsearch.similarity_search(query)
    
    # Run the question-answering chain
    response = chain.run(input_documents=docs, question=(query))
    
    # Print the response
    print(response)

# Exit message
print("Exiting the question-answering loop.")

 Subhabrata Mukherjee, Arindam Mitra, Ganesh Jawahar, Sahaj Agarwal, Hamid Palangi, and Ahmed Awadallah.
 The key takeaways from this research are that GPT-4 prefers longer responses over shorter ones, and has a bias in the order of the candidate responses. Additionally, the research suggests that auto-evaluation measures may overestimate the abilities of smaller models compared to LFMs, as the former are much weaker in comprehension and reasoning skills. Finally, the research proposes explanation tuning, which augments query-response pairs with detailed responses from GPT-4 that explain the reasoning process of the teacher as it generates the response.


InvalidRequestError: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.