# Load RAG Knowledge Base

In [2]:
import os
import shutil

from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader,TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_openai import ChatOpenAI
from openai import OpenAI

from dotenv import load_dotenv

from IPython.display import Markdown

# Current user's home directory; the knowledge-base and Chroma paths are built from it.
home_dir = os.path.expanduser("~")

In [3]:
# Selects which OpenAI-compatible endpoint serves the embedding model and the
# chat model used for sorting. Supported values: 'local' and 'home-svr'.
svr_use = 'local'

if svr_use == "local":
    embeddings_model = 'text-embedding-nomic-embed-text-v1.5'
    embeddings_base_url = "http://127.0.0.1:1234/v1"
    # NOTE(review): placeholder string, presumably because the endpoint does
    # not check credentials - confirm.
    embeddings_api_key = 'None'

    sorting_model = 'qwen3-14b-mlx'
    sorting_base_url = "http://127.0.0.1:1234/v1"
    sorting_api_key = 'None'

elif svr_use == 'home-svr':
    embeddings_model = 'text-embedding-nomic-embed-text-v1.5'
    embeddings_base_url = "http://10.10.10.2:1234/v1"
    embeddings_api_key = 'None'

    sorting_model = 'qwen3-14b'
    sorting_base_url = "http://10.10.10.2:1234/v1"
    sorting_api_key = 'None'

else:
    # Fail here rather than with a NameError in a later cell, far away from
    # the typo that caused it.
    raise ValueError(
        f"Unknown svr_use {svr_use!r}; expected 'local' or 'home-svr'."
    )


# Directory of the combined Chroma store; per-project stores sit next to it.
chroma_path = f"{home_dir}/Google Drive/My Drive/Projects/Data/chroma"
# Root of the knowledge base. The trailing slash is kept because other code
# may build paths from this value by plain string concatenation.
data_path = f'{home_dir}/Google Drive/My Drive/Projects/Data/Knowledge-base/'

# Knowledge-base sub-folders that each get their own dedicated Chroma store.
list_of_project = ['Recipes', 'China']

In [4]:
# Name of the knowledge-base sub-folder holding files that still need a category.
unsorted = 'Unsorted'

In [5]:
def load_documents(data_path):
    """Load every file under ``data_path`` that matches the glob ``**/*.txt``.

    Each match is read through ``TextLoader`` with UTF-8 decoding.

    Args:
        data_path: Directory handed to ``DirectoryLoader`` as the search root.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    return DirectoryLoader(
        data_path,
        glob="**/*.txt",
        loader_cls=TextLoader,
        loader_kwargs={'encoding': 'utf-8'},
    ).load()

def split_text(documents: list[Document],
               chunk_size: int = 1000,
               chunk_overlap: int = 50):
    """Split loaded documents into overlapping chunks for embedding.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 50, the value previously hard-coded here.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # NOTE(review): presumably records each chunk's offset within its
        # source document in the chunk metadata - confirm against the
        # installed LangChain version.
        add_start_index=True,
    )
    return text_splitter.split_documents(documents)
    
def save_to_chroma(chunks: list[Document], chroma_path):
    """Replace the Chroma store at ``chroma_path`` with one built from ``chunks``.

    The embedding endpoint is taken from the module-level settings
    ``embeddings_model``, ``embeddings_base_url`` and ``embeddings_api_key``.

    Args:
        chunks: Document chunks to embed and store.
        chroma_path: Directory of the persistent Chroma store. If it already
            exists it is deleted and rebuilt from scratch.

    Returns:
        None. Progress is reported with ``print``.
    """
    if not chunks:
        # Bail out before the destructive rmtree below: with nothing to embed
        # there is no replacement to build, so keep whatever store exists.
        print(f"No chunks to save; leaving {chroma_path} untouched.")
        return

    embeddings = OpenAIEmbeddings(model=embeddings_model,
                                  base_url=embeddings_base_url,
                                  api_key=embeddings_api_key,
                                  # NOTE(review): presumably disabled because
                                  # the endpoint is not OpenAI's own service
                                  # - confirm.
                                  check_embedding_ctx_length=False)

    # Clear out the database first.
    if os.path.exists(chroma_path):
        shutil.rmtree(chroma_path)
        print(f'{chroma_path} exists and replacing it with new data')

    # Create a new DB from the documents. NOTE(review): no explicit persist()
    # call is made; presumably passing persist_directory is enough for the
    # installed Chroma version - confirm.
    Chroma.from_documents(documents=chunks,
                          embedding=embeddings,
                          persist_directory=chroma_path)

    print(f"Saved {len(chunks)} chunks to {chroma_path}.")

def generate_data_store(data_path, chroma_path):
    """Run the whole ingestion pipeline for one folder.

    Loads the documents under ``data_path``, splits them into chunks and
    writes those chunks to the Chroma store at ``chroma_path``.
    """
    save_to_chroma(split_text(load_documents(data_path)), chroma_path)


def _extract_category(reply):
    """Reduce a raw chat reply to the bare category name it contains."""
    # Replies may start with a <think>...</think> preamble; only the text
    # after the closing tag is the answer.
    answer = (reply or '').split('/think>')[-1].replace('\n', '')
    # The prompt asks for markdown, so tolerate emphasis, heading and code
    # markers (and padding) wrapped around the name.
    return answer.strip(' \t*#`')


def txt_file_sorter(data_path,unsorted,
                    sorting_model, sorting_base_url,sorting_api_key):
    """Move each file in the unsorted folder into a category folder chosen by an LLM.

    The categories are the sub-directories of ``data_path`` (excluding the
    unsorted folder itself). For every file in the unsorted folder the model
    is shown the file's text and asked to answer with one category; the file
    is then moved into that category's folder. A file whose answer does not
    match any category is left where it is.

    Args:
        data_path: Root directory of the knowledge base.
        unsorted: Name of the sub-folder of ``data_path`` holding files to sort.
        sorting_model: Model name passed to the chat-completions endpoint.
        sorting_base_url: Base URL of the OpenAI-compatible endpoint.
        sorting_api_key: API key passed to the OpenAI client.

    Returns:
        None. Progress is reported with ``print``.
    """
    ignored = ('.DS_Store', '.ipynb_checkpoints')
    unsorted_dir = os.path.join(data_path, unsorted)

    # Only real directories can receive files, so stray files in the root are
    # never offered to the model as categories.
    sort_list = sorted(
        name for name in os.listdir(data_path)
        if name not in ignored
        and name != unsorted
        and os.path.isdir(os.path.join(data_path, name))
    )
    # Only regular files can be read and moved.
    dir_list = [
        name for name in os.listdir(unsorted_dir)
        if name not in ignored
        and os.path.isfile(os.path.join(unsorted_dir, name))
    ]

    if not sort_list:
        print(f'No category folders found in {data_path}; nothing sorted.')
        return

    # Case-insensitive fallback for answers such as "recipes".
    by_folded_name = {name.casefold(): name for name in sort_list}

    openai_sorter = OpenAI(base_url=sorting_base_url,
                           api_key=sorting_api_key)

    sorter_system_prompt = """You are a helpful Assitance that reads a text 
                            and sort to a category 
                            
                            Respond in markdown. Respond only the category
                           and do not add any comments."""

    for f in dir_list:
        file_path = os.path.join(unsorted_dir, f)

        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            file_content = file.read()

        sorter_prompt = f"""Sort the text below into this selected category:
                           {','.join(sort_list)}
                            
                            Respond in markdown. Respond only the category
                           and do not add any comments.
        
                           text:
                            {file_content}
                           """

        # Two separate message dicts: merging them into one dict literal
        # repeats the "role"/"content" keys and silently drops the system
        # prompt.
        messages = [
            {"role": "system", "content": sorter_system_prompt},
            {"role": "user", "content": sorter_prompt},
        ]
        print(f'Sorting: {f}')

        response = openai_sorter.chat.completions.create(model=sorting_model,
                                                         messages=messages,
                                                         temperature=0.0)
        answer = _extract_category(response.choices[0].message.content)

        # Never build a path from an unvalidated reply: accept only a known
        # category folder.
        if answer in sort_list:
            new_cat = answer
        else:
            new_cat = by_folded_name.get(answer.casefold())

        if new_cat is None:
            print(f'Skipped: model answered {answer!r}, which is not a known category')
            print('\n\n')
            continue

        print(f'Sorted: {new_cat}')
        print('\n\n')
        os.rename(file_path, os.path.join(data_path, new_cat, f))

In [6]:
# Sort the pending files first, then rebuild the combined store from the
# whole knowledge-base root.
txt_file_sorter(
    data_path=data_path,
    unsorted=unsorted,
    sorting_model=sorting_model,
    sorting_base_url=sorting_base_url,
    sorting_api_key=sorting_api_key,
)
generate_data_store(data_path=data_path, chroma_path=chroma_path)

Sorting: New York Style Pizza At Home  _qwen3-14b-mlx.txt
Sorted: Recipes



Sorting: Chicken Nanban Recipe Report  _qwen3-14b-mlx.txt
Sorted: Recipes



Sorting: MY NEW FAVORITE CHICKEN & RICE RECIPE (SO EASY  SO DELICIOUS!)  SAM THE COOKING GUY  _qwen3-14b-mlx.txt
Sorted: Recipes



Sorting: Perfect Curry Noodles At Home (Khao Soi)  _qwen3-14b-mlx.txt
Sorted: Recipes



Sorting: Easy Mongolian Beef Recipe Report  _qwen3-14b-mlx.txt
Sorted: Recipes



Sorting: Creamy Tantan Spaghetti Recipe Report  _qwen3-14b-mlx.txt
Sorted: Recipes



Sorting: Detail Report Korean-Style Meal Prep Recipes from YouTube Video  _qwen3-14b-mlx.txt
Sorted: Recipes



Sorting: Detail Report Making the Best Jollibee Spaghetti at Home  Sam the Cooking Guy  _qwen3-14b-mlx.txt
Sorted: Recipes



Sorting: Japanese Wagyu Sandwich Recipe Report  _qwen3-14b-mlx.txt
Sorted: Recipes



Sorting: The Best OKONOMIYAKI You’ve Never Tried!  Hiroshima Okonomiyaki  _qwen3-14b-mlx.txt
Sorted: Recipes



Sorting: ONE PAN CHIC

In [7]:
# Build one dedicated Chroma store per project folder. Both paths are derived
# from the configured chroma_path / data_path instead of repeating the
# absolute locations, so a change to the configuration is picked up here too.
for project in list_of_project:
    chroma_path_i = f"{chroma_path}_{project}"
    data_path_i = os.path.join(data_path, project)
    generate_data_store(data_path_i, chroma_path_i)

/Users/daveng/Google Drive/My Drive/Projects/Data/chroma_Recipes exists and replacing it with new data
Saved 301 chunks to /Users/daveng/Google Drive/My Drive/Projects/Data/chroma_Recipes.
Saved 36 chunks to /Users/daveng/Google Drive/My Drive/Projects/Data/chroma_China.
