In [29]:
!pip install apify-client
!pip install faiss-cpu
!pip install faiss-gpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp312-cp312-win_amd64.whl.metadata (3.8 kB)
Downloading faiss_cpu-1.8.0-cp312-cp312-win_amd64.whl (14.5 MB)
   ---------------------------------------- 0.0/14.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.5 MB 660.6 kB/s eta 0:00:22
   ---------------------------------------- 0.1/14.5 MB 1.8 MB/s eta 0:00:09
   - -------------------------------------- 0.5/14.5 MB 5.2 MB/s eta 0:00:03
   -- ------------------------------------- 0.9/14.5 MB 7.1 MB/s eta 0:00:02
   --- ------------------------------------ 1.3/14.5 MB 7.6 MB/s eta 0:00:02
   ---- ----------------------------------- 1.5/14.5 MB 8.2 MB/s eta 0:00:02
   ----- ---------------------------------- 2.1/14.5 MB 9.0 MB/s eta 0:00:02
   ------ --------------------------------- 2.5/14.5 MB 9.3 MB/s eta 0:00:02
   ------- -------------------------------- 2.9/14.5 MB 9.2 MB/s eta 0:00:02
   -------- ------------------------------- 3.3/14.5 MB 9.4 MB/s eta 0:00:02

In [25]:
from langchain.docstore.document import Document
from langchain_community.document_loaders import ApifyDatasetLoader
from dotenv import load_dotenv
from langchain.indexes import VectorstoreIndexCreator

In [27]:

# Read a local .env file into the process environment before the loader is built.
# NOTE(review): presumably this is where the Apify API token comes from — confirm.
load_dotenv()


def _item_to_document(dataset_item):
    """Wrap one dataset item in a Document: "text" is the body, "url" the source."""
    return Document(
        page_content=dataset_item["text"],
        metadata={"source": dataset_item["url"]},
    )


loader = ApifyDatasetLoader(
    dataset_id="DJeyL8l2emCsSTtI1",
    dataset_mapping_function=_item_to_document,
)


In [28]:
# Build the index in two steps: create the index creator with its defaults,
# then hand it the loader.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# `index` may not exist in the saved kernel state — confirm before relying on it.
index_creator = VectorstoreIndexCreator()
index = index_creator.from_loaders([loader])

KeyboardInterrupt: 

In [None]:
# Ask the index one question and print whatever it answers.
query = "What is the seventh item in the OWASP Top 10?"
result = index.query(question=query)
print(result)

In [30]:
# Run the loader and keep what it returns in `docs`; the splitting/embedding
# cell later in the notebook reads this variable.
docs = loader.load()

In [32]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [33]:
# Step 1: split the loaded documents into chunks (splitter uses its defaults).
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)

# Step 2: embed the chunks and collect them in a FAISS vector store.
embeddings = OpenAIEmbeddings()
vector = FAISS.from_documents(documents=documents, embedding=embeddings)

In [34]:
# test retrieval
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain


# Chat model created with its default settings.
llm = ChatOpenAI()

# Prompt with two placeholders: {context} and {input}. The wording tells the
# model to answer only from the supplied context.
prompt = ChatPromptTemplate.from_template(
    "Answer the following question based only on the provided context:\n"
    "\n"
    "<context>\n"
    "{context}\n"
    "</context>\n"
    "\n"
    "Question: {input}"
)

# Chain that combines the model and the prompt above.
document_chain = create_stuff_documents_chain(llm, prompt)

In [39]:

# Expose the FAISS store as a retriever and put it in front of the
# document-combining chain.
retriever = vector.as_retriever()
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [37]:
# Send one question through the retrieval chain; the bare `response` on the
# last line makes the notebook display the full result.
question = "What is the seventh item in the OWASP Top 10 in 2017? and what is it in 2021"
response = retrieval_chain.invoke({"input": question})
response

{'input': 'What is the seventh item in the OWASP Top 10 in 2017? and what is it in 2021',
 'context': [Document(page_content='OWASP Top Ten 2017\nIntroduction\n\tLanguages: [en] de\t\nWelcome to the OWASP Top 10 - 2017!\nThis major update adds several new issues, including two issues selected by the community - A8:2017-Insecure Deserialization and A10:2017-Insufficient Logging & Monitoring. Two key differentiators from previous OWASP Top 10 releases are the substantial community feedback and extensive data assembled from dozens of organizations, possibly the largest amount of data ever assembled in the preparation of an application security standard. This provides us with confidence that the new OWASP Top 10 addresses the most impactful application security risks currently facing organizations.\nThe OWASP Top 10-2017 is based primarily on 40+ data submissions from firms that specialize in application security and an industry survey that was completed by over 500 individuals. This data 

In [41]:
# Write the FAISS store to disk. The path is relative, so the target depends
# on the directory the notebook is run from.
vector.save_local(folder_path="../vector_store/owasp_faiss")