![Header](https://raw.githubusercontent.com/eliyantojoko/pyjo_july_25/main/Header%20Notebook.png)


**Import Library**

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.pgvector import PGVector
from langchain.embeddings import HuggingFaceEmbeddings

# What is Retrieval Augmented Generation?

Retrieval-Augmented Generation (RAG) is a technique that combines retrieval-based and generation-based approaches in natural language processing (NLP), particularly in question answering or chatbot systems. It allows a language model to generate more accurate and up-to-date responses by first retrieving relevant information from an external knowledge source.

![RAG Diagram](https://raw.githubusercontent.com/eliyantojoko/pyjo_july_25/main/RAG%20Diagram.png)

## Store document in PGVector

In [45]:
# --- 1. Load PDF data ---
loader = PyPDFLoader("JOKO ELIYANTO - CV.pdf")
pages = loader.load()

In [46]:
# --- 2. Split them into a chunk ---
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.split_documents(pages)

In [47]:
docs[:2]

[Document(metadata={'producer': 'Skia/PDF m140 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'JOKO ELIYANTO', 'source': 'JOKO ELIYANTO - CV.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='JOKO  ELIYANTO  \n \nData\n \nScientist\n \nhttps://github.com/jokoeliyanto \n \nIdentity  \nAddress  :  RT  04,  Seropan  3,  Muntuk,  Dlingo,  Bantul,  D.I.  Yogyakarta,  55783  \nMobile\n \n \n:\n \n+62\n \n821\n \n8311\n \n2655\n Email   :  jokoeliyanto@gmail.com Work  Email  :  joko@temandata.com GitHub  : https://github.com/jokoeliyanto  LinkedIn  : https://www.linkedin.com/in/joko-eliyanto-23a1b6143 \n \nProfessional  Summary'),
 Document(metadata={'producer': 'Skia/PDF m140 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'JOKO ELIYANTO', 'source': 'JOKO ELIYANTO - CV.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='Professional  Summary  \nResults-driven  Data  Scientist  with  3+  years  of  experience 

In [48]:
# --- 3. Load Embedding model ---
from transformers import AutoTokenizer, AutoModel
import torch
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.pgvector import PGVector
from langchain_core.embeddings import Embeddings
from typing import List

# --- Custom wrapper for LangChain Embeddings ---
class MultilingualEmbedding(Embeddings):
    def __init__(self, model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", device=None):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self._embed(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return self._embed(text)

    def _embed(self, text: str) -> List[float]:
        text = f"passage: {text}"  # For documents; use "query:" for queries
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0]  # CLS token
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        return embeddings[0].cpu().tolist()

In [49]:
embedding = MultilingualEmbedding()

In [53]:
# --- 4. Simpan ke Vector DB (PostgreSQL + pgvector) ---
vectorstore = PGVector.from_documents(
    documents=docs,
    embedding=embedding,
    collection_name="je_cv",
    connection_string="postgresql+psycopg2://postgres:1234567890@172.24.194.113:5432/postgres",
    use_jsonb=True,
)


## Retrieval Test

In [74]:
query = "alamat email Joko Eliyanto?"
n_results = 3

res_semantic = vectorstore.similarity_search_with_relevance_scores(
        query=query,
        k=n_results
    )

In [75]:
res_semantic

[(Document(metadata={'producer': 'Skia/PDF m140 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'JOKO ELIYANTO', 'source': 'JOKO ELIYANTO - CV.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='JOKO  ELIYANTO  \n \nData\n \nScientist\n \nhttps://github.com/jokoeliyanto \n \nIdentity  \nAddress  :  RT  04,  Seropan  3,  Muntuk,  Dlingo,  Bantul,  D.I.  Yogyakarta,  55783  \nMobile\n \n \n:\n \n+62\n \n821\n \n8311\n \n2655\n Email   :  jokoeliyanto@gmail.com Work  Email  :  joko@temandata.com GitHub  : https://github.com/jokoeliyanto  LinkedIn  : https://www.linkedin.com/in/joko-eliyanto-23a1b6143 \n \nProfessional  Summary'),
  0.4588559343476486),
  0.3324468832542249),
 (Document(metadata={'producer': 'Skia/PDF m140 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'JOKO ELIYANTO', 'source': 'JOKO ELIYANTO - CV.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='into\n \nactionable\n \ninsights\n \nthat\

## Perform RAG

In [69]:
from langchain.chat_models import ChatOpenAI  

llm = ChatOpenAI(
    openai_api_base="http://localhost:1234/v1",
    openai_api_key="lm-studio",  
    model="nama-model-dari-lm-studio"
)

In [70]:
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",  
    search_kwargs={
        "k": 3,                                
        "score_threshold": 0.3             
    }
)

In [71]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

In [73]:
query = "alamat email Joko Eliyanto?"
response = qa_chain.invoke({"query": query})

print(response["result"])

jokoeliyanto@gmail.com


### References 

https://medium.com/data-science/retrieval-augmented-generation-rag-from-theory-to-langchain-implementation-4e9bd5f6a4f2

https://python.langchain.com/docs/concepts/rag/

https://python.langchain.com/docs/concepts/retrievers/