In [None]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu

In [6]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

from dotenv import load_dotenv

# Load environment variables from the .env file so that later cells can read
# settings such as OPENAI_API_KEY from os.environ instead of hardcoding them.
load_dotenv()


True

In [13]:
# Get your API keys from openai, you will need to create an account.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os
import warnings

# The key is read from the environment (see load_dotenv() in the cell above), so it
# never has to be typed into the notebook. To set it by hand instead, uncomment:
# os.environ["OPENAI_API_KEY"] =  "sk-*****"
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Flag a missing key here instead of letting it show up as a less obvious failure
# in a later cell. Never display the key itself: cell output is saved with the notebook.
if not openai_api_key:
    warnings.warn(
        "OPENAI_API_KEY is not set; add it to your .env file or to the environment."
    )


'sk-*****'

In [None]:
# # connect your Google Drive
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)
# root_dir = "/content/gdrive/My Drive/"

Mounted at /content/gdrive


In [15]:
# Open the PDF to be indexed; the path is relative to the notebook's working directory.
reader = PdfReader("docs/RestrictAct.pdf")

In [16]:
# Display the reader object to confirm it was created.
reader

<PyPDF2._reader.PdfReader at 0x12f266dd0>

In [17]:
# Extract the text of every page and concatenate it, in page order, into raw_text.
# Pages for which extract_text() returns nothing are skipped. No separator is
# inserted between pages, matching how the text was joined before.
page_texts = [page.extract_text() for page in reader.pages]
raw_text = ''.join(text for text in page_texts if text)

In [None]:
# raw_text

In [18]:
# Preview the first 100 characters of the extracted text as a sanity check.
raw_text[:100]

'II \n118 THCONGRESS \n1STSESSION  S. 686 \nTo authorize the Secretary of Commerce to review and prohibi'

In [19]:
# Split the extracted text into smaller, overlapping chunks so that information
# retrieval does not hit the token size limits.
text_splitter = CharacterTextSplitter(
    separator="\n",        # split on line breaks
    chunk_size=1000,       # target chunk size, measured with length_function
    chunk_overlap=200,     # overlap between neighbouring chunks
    length_function=len,   # len() on a str counts characters
)
texts = text_splitter.split_text(raw_text)

In [20]:
# Number of chunks produced by the splitter.
len(texts)

88

In [21]:
# Inspect the first chunk.
texts[0]

'II \n118 THCONGRESS \n1STSESSION  S. 686 \nTo authorize the Secretary of Commerce to review and prohibit certain trans-\nactions between persons in the United States and foreign adversaries, and for other purposes. \nIN THE SENATE OF THE UNITED STATES \nMARCH 7, 2023 \nMr. W ARNER (for himself, Mr. T HUNE , Ms. B ALDWIN , Mrs. F ISCHER , Mr. \nMANCHIN , Mr. M ORAN , Mr. B ENNET , Mr. S ULLIVAN , Mrs. G ILLIBRAND , \nMs. C OLLINS , Mr. H EINRICH , Mr. R OMNEY , and Mrs. C APITO ) intro-\nduced the following bill; which was read twice and referred to the Com-mittee on Commerce, Science, and Transportation \nA BILL \nTo authorize the Secretary of Commerce to review and pro-\nhibit certain transactions between persons in the United States and foreign adversaries, and for other purposes. \nBe it enacted by the Senate and House of Representa- 1\ntives of the United States of America in Congress assembled, 2\nSECTION 1. SHORT TITLE. 3\nThis Act may be cited as the ‘‘Restricting the Emer- 4'

In [23]:
# Inspect the second chunk; compare with the first to see the overlap between chunks.
texts[1]

'Be it enacted by the Senate and House of Representa- 1\ntives of the United States of America in Congress assembled, 2\nSECTION 1. SHORT TITLE. 3\nThis Act may be cited as the ‘‘Restricting the Emer- 4\ngence of Security Threats that Risk Information and Com-5\nmunications Technology Act’’ or the ‘‘RESTRICT Act’’. 6\nSEC. 2. DEFINITIONS. 7\nIn this Act: 8\nVerDate Sep 11 2014 03:49 Mar 21, 2023 Jkt 039200 PO 00000 Frm 00001 Fmt 6652 Sfmt 6201 E:\\BILLS\\S686.IS S686pbinns on DSKJLVW7X2PROD with $$_JOB2 \n•S 686 IS(1) C LASSIFIED NATIONAL SECURITY INFORMA - 1\nTION .—The term ‘‘classified national security infor- 2\nmation’’ means information that has been deter- 3\nmined pursuant to Executive Order 13526 (50 4\nU.S.C. 3161 note; relating to classified national se- 5\ncurity information) or any predecessor or successor 6\norder, to require protection against unauthorized 7\ndisclosure, and is marked to indicate such classified 8\nstatus if in documentary form. 9'

In [22]:
# Create the OpenAI embeddings object that is passed to FAISS in the next cell.
# NOTE(review): this line only constructs the object; presumably the API key is
# picked up from the OPENAI_API_KEY environment variable — confirm.
embeddings = OpenAIEmbeddings()

In [24]:
# Build a FAISS vector store from the text chunks using the embeddings object.
docsearch = FAISS.from_texts(texts, embeddings)

In [25]:
# Display the vector store object to confirm it was created.
docsearch

<langchain.vectorstores.faiss.FAISS at 0x14daba4d0>

In [26]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [27]:
# Build the question-answering chain on top of an OpenAI LLM, using the "stuff" chain type.
chain = load_qa_chain(OpenAI(), chain_type="stuff")

In [28]:
# Look up the chunks most similar to the question, then have the chain answer from them.
query = "who are the authors of the article?"
docs = docsearch.similarity_search(query)
chain.run(
    question=query,
    input_documents=docs,
)

' Mr. Warner, Mr. Thune, Ms. Baldwin, Mrs. Fischer, Mr. Manchin, Mr. Moran, Mr. Bennet, Mr. Sullivan, Mrs. Gillibrand, Ms. Collins, Mr. Heinrich, Mr. Romney, and Mrs. Capito.'

In [29]:
# Look up the chunks most similar to the question, then have the chain answer from them.
query = "what is the key takeaway from the article?"
docs = docsearch.similarity_search(query)
chain.run(
    question=query,
    input_documents=docs,
)

' The key takeaway from the article is that the Secretary may designate any foreign government or regime as a foreign adversary if they are found to be engaging in a long-term pattern or serious instances of conduct significantly adverse to the national security of the United States.'

In [30]:
# Look up the chunks most similar to the question, then have the chain answer from them.
# The question text previously read "by misused"; the typo is corrected so the model
# receives a well-formed question. Re-run the cell to refresh the saved answer.
query = "Can this act be misused?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' No, this act cannot be misused. Any misuse of this act is an unlawful act and can be punished by a court as a contempt. The Attorney General may bring an action in an appropriate district court of the United States for appropriate relief against any person who violates this Act or any regulation, order, direction, mitigation measure, prohibition, or other authorization or directive issued under this Act.'

In [None]:
# Look up the chunks most similar to the question, then have the chain answer from them.
# NOTE(review): this cell has no execution count and its saved output talks about a
# model training dataset, not the bill loaded above — it looks stale; re-run or remove.
query = "what was the size of the training dataset?"
docs = docsearch.similarity_search(query)
chain.run(
    question=query,
    input_documents=docs,
)

' The final training dataset contains 437,605 prompt-generation pairs.'

In [None]:
# Look up the chunks most similar to the question, then have the chain answer from them.
# NOTE(review): this cell has no execution count and its saved output describes a
# language model, not the bill loaded above — it looks stale; re-run or remove.
query = "How is this different from other models?"
docs = docsearch.similarity_search(query)
chain.run(
    question=query,
    input_documents=docs,
)

' This model is different from other models because it is based on LLaMA, it is licensed only for research purposes, and it is trained on a dataset of post-processed examples. It also has a TSNE visualization of the final training data, and a zoomed-in view to show generations related to personal health and wellness.'

In [None]:
# Ask something the document is not expected to cover, to see how the chain responds.
query = "What is Google Bard?"
docs = docsearch.similarity_search(query)
chain.run(
    question=query,
    input_documents=docs,
)

" I don't know."