# Load, split, and embed Java best-practices documents

In [2]:
import pandas as pd
import numpy as np

## Best Practices Data Load

### From pdf

In [4]:
from langchain_community.document_loaders import PyPDFLoader

# Build a PDF loader for the code-conventions file, then load it into `oracle_bp`.
loader_pdf_bp = PyPDFLoader(
    file_path="best_practices/codeconventions-150003.pdf",
)
oracle_bp = loader_pdf_bp.load()

### From html

Source: <https://www.baeldung.com/java-clean-code> (saved locally as `best_practices/Baeldung.html`)

In [5]:
from langchain_community.document_loaders import UnstructuredHTMLLoader
# Load the locally saved Baeldung page into `baeldung_bp`.
loader_html_bp = UnstructuredHTMLLoader(
    file_path="best_practices/Baeldung.html",
)
baeldung_bp = loader_html_bp.load()

Source: <https://www.geeksforgeeks.org/java-best-practices/> (saved locally as `best_practices/GeeksforGeeks.html`)

In [6]:
# Load the locally saved GeeksforGeeks page into `geeks_bp`.
# Note: `loader_html_bp` is rebound here; the previous loader object is no longer referenced by that name.
loader_html_bp = UnstructuredHTMLLoader(
    file_path="best_practices/GeeksforGeeks.html",
)
geeks_bp = loader_html_bp.load()

Source: <https://blog.jetbrains.com/idea/2024/02/java-best-practices/> (saved locally as `best_practices/IntelliJ-IDEA.html`)

In [7]:
# Load the locally saved IntelliJ IDEA blog page into `intelliJ_bp`.
# Note: `loader_html_bp` is rebound here; the previous loader object is no longer referenced by that name.
loader_html_bp = UnstructuredHTMLLoader(
    file_path="best_practices/IntelliJ-IDEA.html",
)
intelliJ_bp = loader_html_bp.load()

## Document Split

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [11]:
# Chunking parameters handed to the splitter below.
chunk_size, chunk_overlap = 800, 80

# One splitter instance; despite the "pdf" in its name, the later cells reuse it
# for the HTML sources as well.
bp_pdf_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Split the PDF-derived documents and display how many chunks were produced.
texts_oracle_bp = bp_pdf_splitter.split_documents(documents=oracle_bp)
len(texts_oracle_bp)

58

In [12]:
# Split the Baeldung documents with the shared splitter and display the chunk count.
texts_baeldung_bp = bp_pdf_splitter.split_documents(
    documents=baeldung_bp,
)
len(texts_baeldung_bp)

37

In [14]:
# Split the GeeksforGeeks documents with the shared splitter and display the chunk count.
texts_geeks_bp = bp_pdf_splitter.split_documents(
    documents=geeks_bp,
)
len(texts_geeks_bp)

49

In [15]:
# Split the IntelliJ IDEA documents with the shared splitter and display the chunk count.
texts_intelliJ_bp = bp_pdf_splitter.split_documents(
    documents=intelliJ_bp,
)
len(texts_intelliJ_bp)

29

### Joining the split texts

In [16]:
# Flatten the four per-source chunk lists into one list, keeping the source order
# (PDF, Baeldung, GeeksforGeeks, IntelliJ), and display the total chunk count.
texts = [
    *texts_oracle_bp,
    *texts_baeldung_bp,
    *texts_geeks_bp,
    *texts_intelliJ_bp,
]
len(texts)

173

In [18]:
# Nested variant: a one-element list whose only item is the concatenation of all
# four chunk lists.
# NOTE(review): not referenced anywhere in the visible part of this notebook — confirm it is still needed.
texts_nested = [
    texts_oracle_bp + texts_baeldung_bp + texts_geeks_bp + texts_intelliJ_bp
]

## Ollama Embedding and Chroma Vectorstore

In [17]:
from langchain_community.embeddings import OllamaEmbeddings
from tqdm import tqdm
# Embedding client backed by the "gemma:7b-instruct" model served through Ollama.
# NOTE(review): num_gpu / num_thread look machine-specific — confirm they suit the
# host that runs this notebook.
embeddings = OllamaEmbeddings(
    model="gemma:7b-instruct",
    num_gpu=2,
    num_thread=24,
    show_progress=True,
)

In [24]:
from langchain_community.vectorstores import Chroma
from pathlib import Path

# On-disk location of the persisted Chroma collection.
BP_CHROMA_DIR = "bp_chroma_db"

# Chroma.from_documents always embeds and inserts the chunks it is given. Against an
# already-populated persist directory, a re-run of this cell would therefore repeat
# the full embedding pass and append the same chunks again, so the retriever would
# start returning duplicates. Reopen the existing store instead and only build it
# when nothing has been persisted yet.
# To force a rebuild (e.g. after changing chunk_size or the embedding model),
# delete the BP_CHROMA_DIR directory and re-run this cell.
_bp_chroma_path = Path(BP_CHROMA_DIR)
if _bp_chroma_path.is_dir() and any(_bp_chroma_path.iterdir()):
    db = Chroma(persist_directory=BP_CHROMA_DIR, embedding_function=embeddings)
else:
    db = Chroma.from_documents(documents=texts, embedding=embeddings,
                               persist_directory=BP_CHROMA_DIR)

OllamaEmbeddings: 100%|███████████████████████| 173/173 [01:41<00:00,  1.70it/s]


In [25]:
# Expose the vector store through a retriever object, using as_retriever()'s default settings.
retriever = db.as_retriever()