In [2]:
from dotenv import load_dotenv

load_dotenv()

import os
import pandas as pd

In [3]:
from langchain_ollama import ChatOllama

# Load Ollama Model


In [4]:
llm_model_name = "llama3.2:1b-instruct-fp16"

llm = ChatOllama(model=llm_model_name, temperature=0)
llm_json_mode = ChatOllama(model=llm_model_name, temperature=0, format="json")

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from src.utils.chroma import get_chroma_client

from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

In [6]:
embedding_model_name = "nomic-embed-text:latest"
embedding_model = OllamaEmbeddings(model=embedding_model_name)

In [7]:
vector_collection_name = "local_rag"


chroma_client = get_chroma_client(
    host=os.environ.get("CHROMA_HOST"),
    port=os.environ.get("CHROMA_PORT"),
    auth_token=os.environ.get("CHROMA_SERVER_AUTHN_CREDENTIALS"),
)

# clear collection
# try:
#     chroma_client.delete_collection(vector_collection_name)
#     print(f"Deleted collection {vector_collection_name}")
# except:
#     print(f"Collection {vector_collection_name} does not exist")
#     pass


vectorstore = Chroma(
    client=chroma_client, collection_name=vector_collection_name, embedding_function=embedding_model
)

In [8]:
retriever = vectorstore.as_retriever(k=10)

# Indexing


In [9]:
from src.utils.file_utils import list_all_files
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import markdown as markdown_textsplitter

from langchain_core.documents import Document

RAG_DATA_DIR = "rag-data"

In [10]:
splitter = markdown_textsplitter.MarkdownTextSplitter()

In [11]:
md_files = list_all_files(RAG_DATA_DIR, '.md')

df = pd.DataFrame(md_files, columns=['abs_path'])
df['path'] = df['abs_path']

In [12]:
# for _, row in df.iterrows():
#     abs_path = row['abs_path']
#     ref_path = row['path']

#     with open(abs_path, 'r') as f:
#         content = f.read()
#         chunks = splitter.split_text(content)

#         for chunk_idx, chunk in enumerate(chunks):
#             metadata = {'file': ref_path, 'chunk_id': chunk_idx + 1}
#             doc = chunk
#             chunk_id = f'{ref_path}_{chunk_idx + 1}'

#             doc_obj = Document(
#                 page_content=chunk,
#                 metadata={'file': ref_path, 'chunk_id': chunk_idx + 1},
#                 id=chunk_id,
#             )

#             vectorstore.add_documents(documents=[doc_obj], ids=[chunk_id])

In [13]:
vectorstore.similarity_search("functional programming", k=10)

[Document(metadata={'chunk_id': 1, 'file': 'rag-data/work-garden-2024/Notes/Logseq-work/pages/Immutable.md'}, page_content='- [[Functional Programming]]'),
 Document(metadata={'chunk_id': 1, 'file': 'rag-data/work-garden-2024/Notes/Logseq-work/pages/Functional Programming Languages encourage using the functional style.md'}, page_content='- [[Functional Programming]]'),
 Document(metadata={'chunk_id': 1, 'file': 'rag-data/work-garden-2024/Daily Notes/2023-07-13.md'}, page_content='- [[Functional Programming]]\n-'),
 Document(metadata={'chunk_id': 1, 'file': 'rag-data/work-garden-2024/Daily Notes/2023-09-16.md'}, page_content='- [[But what is Functional Programming]]'),
 Document(metadata={'chunk_id': 1, 'file': 'rag-data/work-garden-2024/Daily Notes/2024-04-13.md'}, page_content='[[Why Functional Programming Matters]]'),
 Document(metadata={'chunk_id': 1, 'file': 'rag-data/work-garden-2024/Daily Notes/2024-04-05.md'}, page_content='[[Dear functional Bros]]\n[[Functional programming is a

In [None]:
res = retriever.invoke("How to choose colour for data visualisation")

In [21]:
res_dict = []
for r in res:
    res_dict.append(
        {
            'path': r.metadata.get('file'),
            'chunk_id': r.metadata.get('chunk_id'),
            'content': r.page_content,
        }
    )
df = pd.DataFrame(res_dict)

In [20]:
df

Unnamed: 0,file,chunk_id,content
0,rag-data/work-garden-2024/Notes/Logseq-work/pa...,1,- #[[§ 🧐 How to]]\n- Inspired by [[John Burn-M...
1,rag-data/work-garden-2024/Daily Notes/2023-04-...,1,- If I have to recommend a book for data visua...
2,rag-data/work-garden-2024/Daily Notes/2023-11-...,1,- LATER Data visualisation suggestion by [[¶ D...
3,rag-data/work-garden-2024/Notes/Logseq-work/pa...,1,"-\n > most importantly, we can use variations..."
