# Data Pipeline
The goal of this notebook is to help you take a large PDF and prepare it to be queried by a large language model. This practice is called retrieval-augmented generation (RAG).

This notebook serves the following primary functions:

* Extract data from PDF documents using Unstructured.io 
* Use data to create Llama Index nodes
* Create vector embeddings for data and store in Pinecone Vector Database (you could sub in your own database here)
* Create Index for data using Llama Index
* Save index to local storage to use with your query engine (query engine provided in other files in this repo)

In [None]:
# After installing the packages from the requirements file, import them here.
# IMPORTANT - unstructured with detectron (used for PDF parsing) does not run easily on Windows; Google Colab is recommended if you are a Windows user.
import getpass
import os

import openai
import pinecone
from llama_index import Document
from llama_index import GPTVectorStoreIndex, StorageContext, ServiceContext
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.node_parser import SimpleNodeParser
from llama_index.vector_stores import PineconeVectorStore
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf

In [None]:
# Read credentials from the environment when they are already set; otherwise prompt
# for them with getpass so real keys are never typed into (or saved with) the notebook.
# A free Pinecone pod (first is free) can be created here https://www.pinecone.io/
for secret_name in ('OPENAI_API_KEY', 'PINECONE_API_KEY'):
    if not os.environ.get(secret_name):
        os.environ[secret_name] = getpass.getpass(f'{secret_name}: ')

# Keep a PINECONE_ENVIRONMENT that is already exported; otherwise use the starter default.
os.environ.setdefault('PINECONE_ENVIRONMENT', 'gcp-starter') #update with the environment for your Pinecone Vector DB

Extract data from the PDF with Unstructured.io

In [None]:
# Partition the PDF into Unstructured elements; page breaks are requested so that
# PageBreak markers appear in the returned list.
# more information on this here https://unstructured-io.github.io/unstructured/core/partition.html
elements = partition_pdf(
    filename="INSERT FILE NAME",
    include_page_breaks=True,
)

In [None]:
# Sanity check: display the text of the first extracted element
elements[0].text

In [None]:
# Add Pages
# Walk the elements in order, counting PageBreak markers, and tag every other
# element with the page it sits on (pages are numbered starting at 1).
current_page = 1
for item in elements:
    if item.category != "PageBreak":
        # Stored as a plain attribute on the element object itself
        item.page_number = current_page
        continue
    # A page break means everything after it belongs to the next page
    current_page += 1

In [None]:
# Drop the PageBreak markers: once every element carries its page number
# they serve no further purpose.
elements = list(filter(lambda item: item.category != "PageBreak", elements))

In [None]:
# Chunk Text: group the extracted elements into chunks, using titles as section boundaries
# more information on this here https://unstructured-io.github.io/unstructured/core/chunking.html
chunks = chunk_by_title(elements=elements)

Create Documents

In [None]:
# Build one llama_index Document per chunk.
docs_raw = []
for i, chunk in enumerate(chunks):
    # Assuming each chunk has a 'text' attribute
    # and using the filename and loop index as the doc id
    filename = chunk.metadata.filename
    doc_id = f"{filename}_{i}"

    # Chunking can return newly built elements that do not carry the ad-hoc
    # `page_number` attribute set on the original elements, which would raise
    # AttributeError here. Prefer that attribute when present, otherwise fall
    # back to the page number recorded in the chunk's metadata.
    page = getattr(chunk, 'page_number', None)
    if page is None:
        page = getattr(chunk.metadata, 'page_number', None)

    # extra_info can be populated with any additional information you want from the chunk.
    # The 'page' key is left out when no page is known instead of storing None
    # (NOTE(review): None is presumably not a valid vector-store metadata value - confirm).
    extra_info = {}
    if page is not None:
        extra_info['page'] = page
    extra_info['filename'] = filename  # example, add more as needed

    docs_raw.append(Document(
        text=chunk.text,
        doc_id=doc_id,
        extra_info=extra_info
    ))

In [None]:
# Inspect the first Document to confirm its text, id and metadata look right
docs_raw[0]

In [None]:
# Number of Documents created (one per chunk)
len(docs_raw)

Create Nodes

In [None]:
# Turn the Documents into llama_index nodes with a parser built from its default arguments
# more info on nodes https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/root.html
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents=docs_raw)

Create and/or Initialize Pinecone Vector Database

In [None]:
# Connect the Pinecone client using the key and environment stored in the
# process environment (a missing variable raises KeyError).
pinecone_settings = {
    'api_key': os.environ['PINECONE_API_KEY'],
    'environment': os.environ['PINECONE_ENVIRONMENT'],
}
pinecone.init(**pinecone_settings)

In [None]:
# Name of the Pinecone index that the following cells create (if missing) and connect to.
# Replace the placeholder with your own index name before running.
index_name = 'INSERT INDEX NAME'

In [None]:
# Optional: uncomment the line below to delete the existing index (this cannot be undone)
#pinecone.delete_index(index_name)

In [None]:
# Create the index only when it is not already present in this Pinecone project
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    # The dimension must match the embedding model's vector size
    # (text-embedding-ada-002 vectors are 1536-dimensional).
    pinecone.create_index(index_name, dimension=1536, metric='cosine')

In [None]:
# Handle to the Pinecone index named by index_name
pinecone_index = pinecone.Index(index_name)
namespace = '' # default namespace
# Pass the namespace through so that changing the value above actually takes effect.
# An empty string is mapped to None, so the default setting behaves exactly as when
# no namespace argument was given.
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    namespace=namespace or None,
)

Create Index

In [None]:
# Embedding model used for indexing (and later for querying); requests are sent in batches of 100
embed_model = OpenAIEmbedding(
    model='text-embedding-ada-002',
    embed_batch_size=100,
)
# Service context wires the embedding model into the index/query process
service_context = ServiceContext.from_defaults(embed_model=embed_model)

# Storage context points llama_index at the Pinecone-backed vector store
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index from the raw Documents
# more info on indexes here https://docs.llamaindex.ai/en/stable/module_guides/indexing/index_guide.html
index = GPTVectorStoreIndex.from_documents(
    documents=docs_raw,
    storage_context=storage_context,
    service_context=service_context,
)

Save Index Locally

In [None]:
# Write the index's storage context to local disk so it can be loaded later.
# No directory is passed, so the library's default persist location is used
# (presumably ./storage - confirm for your llama_index version).
index.storage_context.persist()