# Indexation workflow

The goal of this notebook is to walk through a traditional indexation workflow, which consists of: 

1. Extracting text from a document
2. Cleaning the text
3. Chunking the text into smaller pieces
4. Embedding the text
5. Indexing the text


# Extracing Text

First, we need to extract the text from the document. Here, we will utilize amazon Textract to extract the text from the document.

In [3]:
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from textractor.data.text_linearization_config import TextLinearizationConfig

input_document = "../data/DCEE_Actions_Master_List_090920_final.pdf"

extractor = Textractor(profile_name="default")
config = TextLinearizationConfig(
    hide_figure_layout=True,
    title_prefix="# ",
    section_header_prefix="## "
)

documents = extractor.start_document_analysis(
	file_source=input_document,
	s3_upload_path="s3://greencompute",
	features=[TextractFeatures.LAYOUT],
	save_image=True,
)

print(documents.get_text(config=config))

S. DEPARTMENT OF ENERGY 

ffice of NERGY EFFICIENCY & ENEWABLE ENERGY 

# Data Center Master List of Energy Efficiency Measures 

Version 2.0 

September 2020 



Federal Energy Management Program


(This page intentionally left blank)


DOCUMENT TITLE 

## Disclaimer 

This document was prepared as an account of work sponsored by the United States Government. While this document is believed to contain correct information, neither the United States Government nor any agency thereof, nor The Regents of the University of California, nor any of their employees, makes any warranty, express or implied, or assumes any legal responsibility for the accuracy, completeness, or usefulness of any information, apparatus, product, or process disclosed, or represents that its use would not infringe privately owned rights. Reference herein to any specific commercial product, process, or service by its trade name, trademark, manufacturer, or otherwise, does not necessarily constitute or imply its endor

In [4]:
# write the text to a file
with open("output.txt", "w") as f:
	f.write(documents.get_text(config=config))

## Chunking Text

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter

In [8]:
# load huggingface tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en")

In [10]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]

# MD splits
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)
md_header_splits = markdown_splitter.split_text(documents.document.get_text(config=config))

# Char-level splits
from langchain_text_splitters import RecursiveCharacterTextSplitter

chunk_size = 100
chunk_overlap = 30
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
).from_huggingface_tokenizer(tokenizer=tokenizer)

# Split
splits = text_splitter.split_documents(md_header_splits)
print(f"Created {len(splits)} splits")

Created 245 splits


## Embedding Text

In [11]:
from langchain_community.embeddings import HuggingFaceEmbeddings
embeddings_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

  embeddings_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
docs = [split.page_content for split in splits]
embeddings = embeddings_model.embed_documents(docs)

In [14]:
len(embeddings)

245

## Save the embeddings

In [36]:
from sqlalchemy.orm import Session, DeclarativeBase, mapped_column
from sqlalchemy import Column, BigInteger, Text, Integer, DateTime
from pgvector.sqlalchemy import Vector
import datetime

from greencompute_backend.db.engine import engine

In [78]:
EMBEDDINGS_DIM = 384

class Base(DeclarativeBase):
    pass

class Document(Base):
    __tablename__ = "documents-new"

    id = Column(BigInteger, primary_key=True)
    embeddings = Column(Vector(EMBEDDINGS_DIM), nullable=True, name="embeddings@384")
    new_embeddings = Column(Vector(EMBEDDINGS_DIM), nullable=True, name="new_embeddings@786")
    title = Column(Text)
    url = Column(Text)
    content = Column(Text)
    tokens = Column(Integer)
    date_indexed = mapped_column(DateTime)


In [79]:
Base.metadata.create_all(engine)

In [42]:
with Session(bind=engine) as session:
	for embedding, document in zip(embeddings, splits):
		doc = Document(
			new_embeddings=embedding,
			title=input_document.split("/")[-1],
			content=document.page_content,
			url="https://greencompute.org",
			tokens=len(document.page_content.split()),
			date_indexed=datetime.datetime.now()
		)
		session.add(doc)
	session.commit()

In [46]:
from sqlalchemy import select

In [44]:
# Vectorize the query
query = "How can I increase my energy efficiency?"
query_embedding = embeddings_model.embed_query(query)
len(query_embedding)

384

In [67]:
result = session.scalars(select(Document).order_by(Document.embeddings.l2_distance(query_embedding).desc()).limit(5))

In [68]:
# Convert results to list of dictionaries
results = [r.__dict__ for r in result.fetchall()]

In [69]:
# Exclude the embeddings key from the results
for r in results:
	r.pop("embeddings")

In [66]:
results

[{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x3b59058b0>,
  'content': '## EEM 1-4.1: Upgrade All Cooling Supply Fan, Pump, and Cooling Tower Fan Motors to Higher Efficiency',
  'date_indexed': datetime.datetime(2024, 10, 11, 18, 54, 11, 920304),
  'url': 'https://greencompute.org',
  'id': 39,
  'title': 'DCEE_Actions_Master_List_090920_final.pdf',
  'tokens': 17},
 {'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x3b5905850>,
  'content': 'S. DEPARTMENT OF ENERGY  \nffice of NERGY EFFICIENCY & ENEWABLE ENERGY',
  'date_indexed': datetime.datetime(2024, 10, 11, 18, 54, 11, 917672),
  'url': 'https://greencompute.org',
  'id': 1,
  'title': 'DCEE_Actions_Master_List_090920_final.pdf',
  'tokens': 11},
 {'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x3b5905b50>,
  'content': '## Category 1: Data Center Energy Efficiency Management  \nData center energy efficiency management has several components. Start the management process when first c

In [72]:
# create a function for retrieval
def search(query: str, top_k: int = 10, keys: list = ["title", "content"]):
	query_embedding = embeddings_model.embed_query(query)
	result = session.scalars(select(Document).order_by(Document.embeddings.l2_distance(query_embedding).desc()).limit(top_k))
	results = [r.__dict__ for r in result.fetchall()]
	results = [{k: r[k] for k in keys} for r in results]
	return results

In [73]:
search("How can I increase my energy efficiency?")

[{'title': 'DCEE_Actions_Master_List_090920_final.pdf',
  'content': '## Disclaimer  \nThis document was prepared as an account of work sponsored by the United States Government. While this document is believed to contain correct information, neither the United States Government nor any agency thereof, nor The Regents of the University of California, nor any of their employees, makes any warranty, express or implied, or assumes any legal responsibility for the accuracy, completeness, or usefulness of any information, apparatus, product, or process disclosed, or represents that its use would not infringe privately owned rights. Reference herein to any specific commercial product, process, or service by its trade name, trademark, manufacturer, or otherwise, does not necessarily constitute or imply its endorsement, recommendation, or favoring by the United States Government or any agency thereof, or The Regents of the University of California. The views and opinions of authors expressed h