In [9]:
import os
from tqdm import tqdm
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

Connect to Ollama and Embed

In [2]:
!ollama list
!curl -X POST http://localhost:11434/api/embed -H "Content-Type: application/json" -d "{\"model\": \"deepseek-r1:1.5b\", \"prompt\": \"Why is the sky blue?\"}"

NAME                        ID              SIZE      MODIFIED   
mxbai-embed-large:latest    468836162de7    669 MB    8 days ago    
deepseek-r1:7b              0a8c26691023    4.7 GB    8 days ago    
deepseek-r1:1.5b            a42b25d8c10a    1.1 GB    8 days ago    
llama2:latest               78e26419b446    3.8 GB    8 days ago    
nomic-embed-text:latest     0a109f422b47    274 MB    8 days ago    


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100    63    0     0  100    63      0     51  0:00:01  0:00:01 --:--:--    51
100    63    0     0  100    63      0     28  0:00:02  0:00:02 --:--:--    28
100   107  100    44  100    63     14     21  0:00:03  0:00:02  0:00:01    36
100   107  100    44  100    63     14     21  0:00:03  0:00:02  0:00:01    36


{"model":"deepseek-r1:1.5b","embeddings":[]}


In [10]:
# Embedding client: the mxbai-embed-large model served by the Ollama instance
# listening on 127.0.0.1:11434.
EMBEDDING_MODEL = OllamaEmbeddings(base_url="http://127.0.0.1:11434", model="mxbai-embed-large")

# Vector database: an in-memory store that is handed EMBEDDING_MODEL as its
# embedding function.
DOCUMENT_VECTOR_DB = InMemoryVectorStore(embedding=EMBEDDING_MODEL)

# Splits the documents into overlapping chunks
def chunk_documents(raw_documents, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping chunks ready for embedding.

    Args:
        raw_documents: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` handed to the splitter. Defaults to 1000,
            the value this function previously hard-coded.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            200, the value this function previously hard-coded.

    Returns:
        Whatever ``split_documents`` returns for ``raw_documents``.
    """
    text_processor = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Keeps a "start_index" entry in each chunk's metadata.
        add_start_index=True,
    )
    return text_processor.split_documents(raw_documents)

# Load one PDF, chunk it, and add the chunks to the vector database
def embed(filed_path):
    """Index a single PDF file into ``DOCUMENT_VECTOR_DB``.

    Args:
        filed_path: Path of the PDF handed to ``PDFPlumberLoader``.

    Returns:
        int: Number of chunks produced for this file and passed to
        ``DOCUMENT_VECTOR_DB.add_documents``.
    """
    loader = PDFPlumberLoader(filed_path)
    chunks = chunk_documents(loader.load())
    DOCUMENT_VECTOR_DB.add_documents(chunks)
    return len(chunks)
    
def simple(folder_path):
    """Embed every PDF file located directly inside ``folder_path``.

    Entries are matched on a case-insensitive ``.pdf`` extension (so ``.PDF``
    files are included), directories are skipped, and files are processed in
    sorted name order so repeated runs index documents in the same order.
    Sub-folders are not searched.

    Args:
        folder_path: Directory containing the PDF files to index.

    Returns:
        None. Each file is passed to ``embed``; progress is shown with tqdm.
    """
    # os.listdir returns entries in arbitrary order, so sort for reproducibility.
    candidates = (
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.lower().endswith(".pdf")
    )
    # Drop anything that merely looks like a PDF by name (e.g. a directory).
    pdf_files = [path for path in candidates if os.path.isfile(path)]

    # Process each PDF file
    for file_path in tqdm(pdf_files):
        embed(file_path)

def retrieve(query,search_type="similarity", k=5):
    """Search ``DOCUMENT_VECTOR_DB`` for documents related to ``query``.

    Args:
        query: Query text forwarded to the vector store's ``search`` method.
        search_type: Search strategy name forwarded to ``search``; this
            notebook uses "similarity" and "mmr".
        k: Forwarded to ``search`` as ``k``.

    Returns:
        The result of ``DOCUMENT_VECTOR_DB.search`` for these arguments.
    """
    return DOCUMENT_VECTOR_DB.search(query, k=k, search_type=search_type)

In [11]:
%time
folder_path = r"C:/Users/gabreu/Desktop/MyRAG/data"
simple(folder_path)

CPU times: total: 0 ns
Wall time: 0 ns


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [05:29<00:00, 54.88s/it]


In [13]:
%%time
# Query
retrieve("Outline any discussion on Vistra from different managers")

CPU times: total: 15.6 ms
Wall time: 187 ms


[Document(id='e692f0f7-a180-43a5-b9f1-ac85684310bd', metadata={'source': 'C:/Users/gabreu/Desktop/MyRAG/data\\third-point-q1-2024-investor-letter_tpil.pdf', 'file_path': 'C:/Users/gabreu/Desktop/MyRAG/data\\third-point-q1-2024-investor-letter_tpil.pdf', 'page': 2, 'total_pages': 12, 'Author': 'Monique Devos', 'Creator': 'Microsoft® Word for Microsoft 365', 'CreationDate': "D:20240430115608-04'00'", 'ModDate': "D:20240430115608-04'00'", 'Producer': 'Microsoft® Word for Microsoft 365', 'start_index': 795}, page_content='company TXU in 2014, have become commonplace in the sector over the last decade.\nIn response to this challenging environment, we believe Vistra’s capital allocation strategy\nhas been brilliant. The company shut down unprofitable coal plants to improve its carbon\nfootprint and mitigate oversupply. Given the market was valuing its remaining gas assets at\npennies on the dollar relative to the cost of new builds, management patiently invested in\nmaintaining the existing 

Experiment: retrieve the correct chunk

In [11]:
# Experiment: take the top chunk returned for a near-verbatim query as the
# target, then check whether shorter paraphrased queries also retrieve it.
chunk_id = retrieve(
    "William Song, left the firm earlier in February"
)[0].id

search_types = ["similarity", "mmr"]

queries = [
    "Who left?",
    "Who left the firm in February?",
    "Who left the firm?",
    "Who left third point?",
    "Who left third point in February?",
]

# For each search method, print its name followed by one True/False per prompt:
# whether the target chunk is among the 2 documents retrieved.
for search_type in search_types:
    print(search_type)
    for query in queries:
        hit_ids = [doc.id for doc in retrieve(query, search_type, 2)]
        print(chunk_id in hit_ids)

similarity
True
True
True
True
True
mmr
False
False
False
False
False


In [14]:
# Try max marginal relevance (MMR), requesting 3 documents per query.
# Uses retrieve(..., "mmr", ...) because no `retrieve_max` helper is defined in
# this notebook, so calling it raises NameError on a fresh kernel.
# Prints one True/False per query: whether the target chunk was retrieved.
for q in queries:
    ids = [doc.id for doc in retrieve(q, "mmr", 3)]
    print(chunk_id in ids)

False
False
False
False
False
