# Ingest data

## Solution: Implement a Vector Store and Load Data

## Install libraries

In [1]:
# Install the packages listed in requirements.txt (-q keeps pip's output quiet).
%pip install -q -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import shutil

from dotenv import load_dotenv, find_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

## Load PDF documents

In [3]:
# Load every PDF found in folder_path and flatten the loaders' documents into one list.
folder_path = '../../docs'

# os.listdir returns all directory entries in arbitrary order, so keep only
# *.pdf files (skipping stray files such as .DS_Store) and sort them so the
# resulting document order is the same on every run and every machine.
pdf_file_names = sorted(
    file_name
    for file_name in os.listdir(folder_path)
    if file_name.lower().endswith('.pdf')
)

pages = [
    page
    for file_name in pdf_file_names
    for page in PyPDFLoader(os.path.join(folder_path, file_name)).load()
]

In [4]:
# Sanity check: number of documents returned by the PDF loaders.
len(pages)

13

## Split PDF documents into chunks

In [5]:
# Chunking parameters, named so they can be tuned in one place.
CHUNK_SEPARATOR = "\n"  # text is split on this separator before chunks are assembled
CHUNK_SIZE = 1000       # target upper bound on chunk length, measured by length_function
CHUNK_OVERLAP = 150     # amount of text shared between consecutive chunks

# Splitter used to break the loaded documents into chunks; length is
# measured in characters because length_function is the built-in len.
text_splitter = CharacterTextSplitter(
    separator=CHUNK_SEPARATOR,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len
)

In [6]:
# Split the loaded pages into chunks using the splitter configured above.
docs = text_splitter.split_documents(pages)

In [7]:
# Sanity check: number of chunks produced by the splitter.
len(docs)

25

## Convert chunks to embeddings and store in FAISS vector database

In [8]:
# Locate a .env file and load its variables into the process environment
# (presumably this is where OPENAI_API_KEY, read in the next cell, comes from).
# The result is bound to `_` so the cell produces no output.
_ = load_dotenv(find_dotenv()) # read local .env file

In [9]:
# Read the OpenAI API key from the environment; raises KeyError if it is not set.
OPENAI_API_KEY=os.environ['OPENAI_API_KEY']

In [10]:

# Embedding client used to vectorise the chunks; authenticated with the key
# read from the environment above.
embeddings_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=OPENAI_API_KEY,
)

In [11]:
# Embed every chunk with embeddings_model and build a FAISS vector store from the results.
vectordb = FAISS.from_documents(docs, embeddings_model)
# Report how many vectors the index holds; expected to equal len(docs).
print(vectordb.index.ntotal)

25


## Persist the vector database to disk

In [13]:
# Directory the FAISS index is written to; named once so the cleanup and
# the save can never point at different paths.
INDEX_DIR = "summarizer_index"

# Remove any index left over from a previous run so stale files cannot
# survive. shutil.rmtree replaces the `!rm -rf` shell-out so the cell also
# works where no POSIX shell is available; ignore_errors=True keeps the
# "no error if the directory is missing" behaviour of `rm -rf`.
shutil.rmtree(INDEX_DIR, ignore_errors=True)

# Write the vector store to INDEX_DIR.
vectordb.save_local(INDEX_DIR)