In [1]:
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Optional one-time environment setup. The command is kept commented out so a
# normal "Restart & Run All" does not reinstall packages; uncomment it to
# install the packages listed below.
# NOTE(review): versions are unpinned (-U pulls the latest releases); consider
# pinning them and using `%pip` so the install targets the running kernel.
# !pip install -qU \
#     langchain \
#     langchain-community \
#     langchain-huggingface \
#     faiss-cpu \
#     pypdf \
#     sentence-transformers \
#     huggingface_hub \
#     langchain-google-genai \
#     tf-keras \
#     hf_xet \
#     torch

In [3]:
import os
import torch
from pathlib import Path
from typing import List

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.messages import SystemMessage, HumanMessage

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Hugging Face API token needed by
# the endpoint configured in the next cell — confirm which variable is required.
from dotenv import load_dotenv
load_dotenv()




True

In [4]:
# Configure the Qwen text-generation endpoint and wrap it in a chat interface
# so it can be driven by chat-style prompt templates.
print("Loading Models...")

# Sampling / generation settings forwarded to the endpoint.
generation_settings = dict(
    repo_id="Qwen/Qwen2.5-7B-Instruct",
    task="text-generation",
    temperature=0.65,
    max_new_tokens=1024,
    top_p=0.92,
    repetition_penalty=1.05,
)
chat_llm = HuggingFaceEndpoint(**generation_settings)

# `model` is the chat handle that the RAG chain pipes its prompt into.
model = ChatHuggingFace(llm=chat_llm)

Loading Models...


In [5]:
# Load every PDF found under PDF_FOLDER (one document per page) and split the
# pages into overlapping character chunks ready for embedding.
print("Loading PDFs...")

PDF_FOLDER = "./RAG_Documents"
CHUNK_SIZE = 850      # maximum characters per chunk (length_function=len)
CHUNK_OVERLAP = 120   # characters shared between neighbouring chunks

loader = PyPDFDirectoryLoader(PDF_FOLDER)
docs = loader.load()

print(f"→ Loaded {len(docs)} pdf pages")

# Fail fast with a clear message: a missing or empty folder yields zero pages,
# and building a vector index from nothing fails later with an obscure error.
if not docs:
    raise FileNotFoundError(
        f"No PDF pages were loaded from {PDF_FOLDER!r}; "
        "check that the folder exists and contains readable PDF files."
    )

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    # Try paragraph, line, sentence, then word boundaries before a hard cut.
    separators=["\n\n", "\n", ". ", " ", ""],
    # Record each chunk's character offset within its source page as metadata.
    add_start_index=True
)

chunks = text_splitter.split_documents(docs)
print(f"→ Created {len(chunks)} chunks")

# Pages with no extractable text (e.g. scanned images) produce no chunks; stop
# here rather than letting the indexing step fail on an empty list.
if not chunks:
    raise ValueError(
        f"The PDFs in {PDF_FOLDER!r} produced no text chunks; "
        "they may contain no extractable text."
    )

Loading PDFs...
→ Loaded 5 pdf pages
→ Created 10 chunks


In [6]:
# Embed the chunks with a MiniLM sentence-transformer, index them in FAISS,
# and expose the index as a similarity retriever.
print("Creating embeddings... (this may take a few minutes)")

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    embedding_device = 'cuda'
else:
    embedding_device = 'cpu'

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': embedding_device},
    encode_kwargs={'normalize_embeddings': True},
)

vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings)

# Return the 10 nearest chunks for each query.
retriever_search_kwargs = {"k": 10}
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_search_kwargs,
)

print("→ FAISS index created!")

Creating embeddings... (this may take a few minutes)
→ FAISS index created!


In [7]:
# Inputs for the RAG prompt: the topic to analyse, the system instructions,
# and the user-facing question.
topic = "Conscientiousness"

system_message = """You are talented graphalogy expert who has PHD in it. You are best in reading and analysing the handwritting. You are a helpful assistant. Answer ONLY from the provided transcript context. If the context is insufficient, just say you don't know."""

question = "Explain the detailed analysis on shared topic"

# Retrieve the chunks most similar to a topic-specific query.
# NOTE: this rebinds `docs`, which previously held the loaded PDF pages.
retrieval_query = f"Explain Summary, Writing-style descriptions, Graphology-style Overall impression on given topic : {topic}"
docs = retriever.invoke(retrieval_query)

# Concatenate the retrieved chunks into one labelled context string.
labelled_chunks = [
    f"[Document {position}]\n{retrieved.page_content}\n"
    for position, retrieved in enumerate(docs, start=1)
]
context = "\n\n".join(labelled_chunks)

In [8]:
# Build the RAG prompt: the system slot carries the instructions, the human
# slot carries the retrieved context, the question, and the topic.
human_template = """Context information:\n\n{context}\n\nQuestion:\n\n{question}\n\nTopic:{topic}\n\nAnswer:"""
rag_prompt = ChatPromptTemplate.from_messages([
    ("system", "{system_message}"),
    ("human", human_template),
])

# Prompt -> chat model -> plain string.
simple_rag_chain = rag_prompt | model | StrOutputParser()

# The chain takes one dictionary containing all four template variables.
chain_inputs = {
    "system_message": system_message,
    "context": context,
    "question": question,
    "topic": topic,
}
answer = simple_rag_chain.invoke(chain_inputs)

# Print the answer framed by horizontal rules.
divider = "-" * 60
print("Answer:")
print(divider)
print(answer)
print(divider)

Answer:
------------------------------------------------------------
The detailed analysis for Conscientiousness is as follows:

1. **Writing Style Descriptions**: The writing is structured, clear, and detail-oriented. Sentences are well-organized, grammatically careful, and goal-focused. This indicates a strong emphasis on planning, responsibility, accuracy, and logical progression of ideas.

2. **Gaphology-Style Overall Impression**: The writing is controlled, disciplined, and task-focused. This suggests a person who is highly organized, dependable, and goal-oriented.

3. **Slant**: The slant is vertical or slightly right. This indicates emotional control, objectivity, and self-regulation, traits that are consistent with high conscientiousness.

4. **Pressure**: The pressure is firm and consistent, which indicates determination, reliability, and a strong sense of duty. This reflects a person who is committed and persistent in their tasks.

5. **Spacing**: The spacing is tight but reg