# Chat With Your Data

## Persist Data to Vector Stores

## Install libraries

In [None]:
pip install openai

In [None]:
pip install python-dotenv

In [None]:
pip install langchain

In [None]:
pip install langchain-openai

In [None]:
pip install pypdf

In [None]:
pip install faiss-cpu

In [None]:
pip install langchainhub

In [None]:
pip install langchain-community

## Load OpenAI API Key to use OpenAI's embedding model

In [1]:
import os

from dotenv import find_dotenv, load_dotenv

# Locate the local .env file and load it; the result is bound to `_`
# so the cell does not display load_dotenv's return value.
_ = load_dotenv(find_dotenv())

In [2]:
# Read the API key from the process environment; a missing variable raises KeyError here.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

## Load documents

In [3]:
# Import the loader from langchain_community (the package this notebook already
# uses for its FAISS vector store) rather than the deprecated
# `langchain.document_loaders` path.
from langchain_community.document_loaders import PyPDFLoader

# Read the PDF from the working directory into a list of documents.
loader = PyPDFLoader('big-book-of-data-engineering.pdf')
pages = loader.load()

## Chunk documents

In [4]:
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

# Split the already-loaded pages into chunks: target size 1000, no overlap.
# NOTE(review): CharacterTextSplitter presumably splits only on its separator,
# so individual chunks may exceed chunk_size — confirm against the output.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
documents = text_splitter.split_documents(pages)

## Generate embeddings and store them in a vector database
### FAISS vector database

In [5]:
from langchain_community.vectorstores import FAISS

# Embedding client configured for OpenAI's "text-embedding-3-small" model.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=OPENAI_API_KEY,
)

# Embed the chunks and build the FAISS vector store from them.
vectordb = FAISS.from_documents(documents, embeddings)

In [6]:
# Number of vectors currently stored in the FAISS index; shown as the cell's output.
vectordb.index.ntotal

2


In [8]:
# Inspect the chunk at index 1 to check its metadata and text content.
documents[1]

Document(metadata={'source': 'big-book-of-data-engineering.pdf', 'page': 1}, page_content='Challenges of data engineering in the AI era\nAs previously mentioned, data engineering is key to ensuring reliable data for \nAI initiatives. Data engineers who build and maintain ETL pipelines and the \ndata infrastructure that underpins analytics and AI workloads face specific \nchallenges in this fast-moving landscape. \n ■ Handling real-time data: From mobile applications to sensor data on \nfactory floors, more and more data is created and streamed in real \ntime and requires low-latency processing so it can be used in real-time \ndecision-making.\n ■ Scaling data pipelines reliably:  With data coming in large quantities \nand often in real time, scaling the compute infrastructure that runs \ndata pipelines is challenging, especially when trying to keep costs low \nand performance high. Running data pipelines reliably, monitoring data \npipelines and troubleshooting when failures occur are 

## Persist Data in your Vector Store

In [9]:
# Write the vector store to the local "faiss2b_index" path so it can be reloaded later.
vectordb.save_local("faiss2b_index")

## Load Vector Store

In [10]:
# Reload the vector store saved under "faiss2b_index", passing the same embeddings object.
# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing the saved
# files (presumably pickle-based — confirm); only load index directories you created yourself.
new_db = FAISS.load_local("faiss2b_index", embeddings, allow_dangerous_deserialization=True)