In [1]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

ModuleNotFoundError: No module named 'langchain_core'

Connect to Ollama and Embed

In [2]:
!curl -X POST http://localhost:11434/api/embed -H "Content-Type: application/json" -d "{\"model\": \"deepseek-r1:1.5b\", \"prompt\": \"Why is the sky blue?\"}"

{"model":"deepseek-r1:1.5b","embeddings":[]}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100    63    0     0  100    63      0     51  0:00:01  0:00:01 --:--:--    51
100    63    0     0  100    63      0     28  0:00:02  0:00:02 --:--:--    28
100   107  100    44  100    63     16     23  0:00:02  0:00:02 --:--:--    40
100   107  100    44  100    63     16     23  0:00:02  0:00:02 --:--:--    40


In [3]:
# List the models known to the local Ollama install, to pick an embedding model by name.
!ollama list

NAME                        ID              SIZE      MODIFIED     
mxbai-embed-large:latest    468836162de7    669 MB    16 hours ago    
deepseek-r1:1.5b            a42b25d8c10a    1.1 GB    17 hours ago    
deepseek-r1:7b              0a8c26691023    4.7 GB    17 hours ago    
llama2:latest               78e26419b446    3.8 GB    17 hours ago    
nomic-embed-text:latest     0a109f422b47    274 MB    17 hours ago    


In [4]:
# Embedding client: the "mxbai-embed-large" model served by Ollama on the loopback address.
EMBEDDING_MODEL = OllamaEmbeddings(base_url="http://127.0.0.1:11434", model="mxbai-embed-large")

# In-memory vector store that embeds everything it indexes with EMBEDDING_MODEL.
DOCUMENT_VECTOR_DB = InMemoryVectorStore(embedding=EMBEDDING_MODEL)

# Splits the documents into overlapping chunks
def chunk_documents(raw_documents, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping chunks.

    Args:
        raw_documents: Documents to split (e.g. the list returned by a
            loader's ``load()``).
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 1000, the value this notebook originally hard-coded.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            200, the value this notebook originally hard-coded.

    Returns:
        The result of ``split_documents`` on ``raw_documents``.
    """
    text_processor = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Ask the splitter to record where each chunk starts; this shows up as
        # the 'start_index' entry in the chunk metadata.
        add_start_index=True,
    )
    return text_processor.split_documents(raw_documents)

# Load, chunk and index a single PDF
def embed(filed_path):
    """Load a PDF, split it into chunks and add the chunks to the vector store.

    Args:
        filed_path: Path of the PDF, passed straight to ``PDFPlumberLoader``.

    Returns:
        Number of chunks that were added to ``DOCUMENT_VECTOR_DB``.
    """
    loaded_docs = PDFPlumberLoader(filed_path).load()
    chunks = chunk_documents(loaded_docs)

    # Indexing happens on the module-level store, so every call adds to it.
    DOCUMENT_VECTOR_DB.add_documents(chunks)
    return len(chunks)

In [5]:
def retrieve(query, search_type="similarity", k=1):
    """Query the vector store with a selectable search method.

    Args:
        query: Text to search for.
        search_type: Search method name forwarded to the store's ``search``
            (this notebook uses "similarity" and "mmr").
        k: Number of results requested.

    Returns:
        Whatever ``DOCUMENT_VECTOR_DB.search`` returns for these arguments.
    """
    return DOCUMENT_VECTOR_DB.search(query, search_type=search_type, k=k)

def retrieve_max(query, k=1):
    """Query the vector store with max-marginal-relevance search.

    Args:
        query: Text to search for.
        k: Number of results requested.

    Returns:
        Whatever ``DOCUMENT_VECTOR_DB.max_marginal_relevance_search`` returns.
    """
    return DOCUMENT_VECTOR_DB.max_marginal_relevance_search(query, k=k)

In [6]:
%%time

# Embed the docs
filed_path = r"C:/Users/gabreu/Desktop/MyRAG/data/tpil-q4-2023-investor-letter.pdf"
embed(filed_path)

CPU times: total: 734 ms
Wall time: 18.5 s


10

Experiment: retrieve the correct chunk

In [7]:
# Chunk we want to retrieve
retrieve("William Song, left the firm earlier in February")

[Document(id='a75e58c0-e2bf-4e43-8ac9-992d68fa1838', metadata={'source': 'C:/Users/gabreu/Desktop/MyRAG/data/tpil-q4-2023-investor-letter.pdf', 'file_path': 'C:/Users/gabreu/Desktop/MyRAG/data/tpil-q4-2023-investor-letter.pdf', 'page': 1, 'total_pages': 3, 'Author': 'Ryan Holland', 'Comments': '', 'Company': '', 'CreationDate': "D:20240215083529-05'00'", 'Creator': 'Acrobat PDFMaker 23 for Word', 'Keywords': '', 'ModDate': "D:20240215083531-05'00'", 'Producer': 'Adobe PDF Library 23.8.53', 'SourceModified': 'D:20240215064828', 'Subject': '', 'Title': '', 'start_index': 1680}, page_content='expects to see a strong bid for these assets as more investors seek them out.\nBusiness Update - Senior Appointments at the Investment Manager\n• Third Point’s Chief Compliance Officer, William Song, left the firm earlier in February. Over the\npast fifteen years, Will created and oversaw a robust compliance program, drawing on his\nprior experience at the Securities & Exchange Commission. Will staye

In [11]:
# Experiment
# Id of the chunk that every query below is expected to bring back.
target_hits = retrieve("William Song, left the firm earlier in February")
chunk_id = target_hits[0].id

search_types = ["similarity", "mmr"]

queries = [
    "Who left?",
    "Who left the firm in February?",
    "Who left the firm?",
    "Who left third point?",
    "Who left third point in February?",
]

# Try every prompt with every search method: print the method name, then one
# True/False per query telling whether the target chunk is among the top 2 hits.
for search_type in search_types:
    print(search_type)
    for query in queries:
        hits = retrieve(query, search_type, 2)
        print(any(doc.id == chunk_id for doc in hits))

similarity
True
True
True
True
True
mmr
False
False
False
False
False


In [14]:
# Try max marginal relevance directly, asking for the top 3 hits per query.
# NOTE(review): the recorded run printed False for every query here - TODO confirm
# that the documents returned by MMR carry the same ids as those from `search`.
for query in queries:
    hits = retrieve_max(query, 3)
    print(any(doc.id == chunk_id for doc in hits))

False
False
False
False
False
