In [1]:
! pip install -U langchain-nomic langchain_community tiktoken chromadb langchainhub langchain langgraph tavily-python gpt4all firecrawl-py

Collecting langchain-nomic
  Downloading langchain_nomic-0.1.1-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.1-py3-none-any.whl.metadata (8.9 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting chromadb
  Downloading chromadb-0.5.0-py3-none-any.whl.metadata (7.3 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.17-py3-none-any.whl.metadata (621 bytes)
Collecting langchain
  Downloading langchain-0.2.1-py3-none-any.whl.metadata (13 kB)
Collecting langgraph
  Downloading langgraph-0.0.57-py3-none-any.whl.metadata (23 kB)
Collecting tavily-python
  Downloading tavily_python-0.3.3-py3-none-any.whl.metadata (4.4 kB)
Collecting gpt4all
  Downloading gpt4all-2.6.0-py3-none-macosx_10_15_universal2.whl.metadata (4.1 kB)
Collecting firecrawl-py
  Downloading firecrawl_py-0.0.11-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain-core<0.3,>=0.1.46 (from langchain-

In [29]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# API keys are read from the environment; define LANG_KEY and JINA_KEY in your own .env file.
langchain_api_key = os.getenv('LANG_KEY')
jina_key = os.getenv('JINA_KEY')

# Turn on LangChain tracing only when a key is available. os.environ only accepts
# strings, so assigning a missing key (None) would raise a TypeError, and enabling
# tracing without a key would leave the notebook half-configured.
if langchain_api_key:
    os.environ['LANGCHAIN_TRACING_V2'] = 'true'
    os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
    os.environ['LANGCHAIN_API_KEY'] = langchain_api_key
else:
    print('LANG_KEY is not set; LangChain tracing is left disabled.')

In [2]:
# Name of the local model to run; llama3 is used here, but any other model can be substituted.
local_llm = "llama3"

In [30]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.docstore.document import Document
import requests

# A handful of Wikipedia articles to index (restricted to 3 so as not to exhaust API token limits).
# Kept as a list rather than a set so that the fetch order is the same on every run.
urls = [
    'https://en.wikipedia.org/wiki/Knowledge_graph',
    'https://en.wikipedia.org/wiki/Semantic_technology',
    'https://en.wikipedia.org/wiki/Semantic_integration',
    # 'https://en.wikipedia.org/wiki/Logical_graph',
    # 'https://en.wikipedia.org/wiki/Knowledge_graph_embedding',
    # 'https://en.wikipedia.org/wiki/Graph_database',
    # 'https://en.wikipedia.org/wiki/Formal_semantics_(natural_language)',
    # 'https://en.wikipedia.org/wiki/Artificial_general_intelligence',
    # 'https://en.wikipedia.org/wiki/Recursive_self-improvement',
    # 'https://en.wikipedia.org/wiki/Automated_planning_and_scheduling',
    # 'https://en.wikipedia.org/wiki/Machine_learning',
    # 'https://en.wikipedia.org/wiki/Natural_language_processing'
]

# Ask for a JSON response and authenticate with the Jina key.
# NOTE(review): the Jina Reader documentation shows the Authorization value as
# 'Bearer <key>' -- confirm that JINA_KEY already includes that prefix.
headers = {
   'Accept': 'application/json',
   'Authorization': jina_key
}

# Each page is requested as base_url followed by the target URL.
base_url = 'https://r.jina.ai/'

docs = []
for url in urls:
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    response = requests.get(base_url + url, headers=headers, timeout=60)
    # Fail here with a clear HTTP error instead of a KeyError on 'data' further down.
    response.raise_for_status()
    docs.append(response.json())

docs_list = []

# Each response carries its payload under 'data': the page text is in 'content',
# and every other field of 'data' is kept as that document's metadata.
for doc in docs:
    data = doc['data']
    metadata = {k: v for k, v in data.items() if k != 'content'}
    docs_list.append({"content": data['content'], "metadata": metadata})


In [26]:
# Split the documents into smaller chunks. Smaller chunk sizes are usually better (but not
# too small); results will vary with the prompt and your local data. Small chunks take
# longer to index but can help alleviate lost-in-the-middle issues.

# from_tiktoken_encoder is called on the class directly; there is no need to build a
# throwaway splitter instance first.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=256, chunk_overlap=0
)
doc_splits = text_splitter.create_documents(
    texts=[doc['content'] for doc in docs_list],
    metadatas=[doc['metadata'] for doc in docs_list],
)

# Keep only metadata values whose type is exactly one of these primitives; values such as
# lists or dicts are dropped. A chunk with no metadata gets an empty dict.
primitive_types = (str, int, float, bool)

filtered_docs = [
    Document(
        page_content=split.page_content,
        metadata={
            k: v
            for k, v in (split.metadata or {}).items()
            if type(v) in primitive_types
        },
    )
    for split in doc_splits
]


[Document(page_content='[![Image 1: Page semi-protected](https://upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png)](https://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi "This article is semi-protected.")\n\nFrom Wikipedia, the free encyclopedia\n\n**Semantic integration** is the process of interrelating information from diverse sources, for example calendars and to do lists, email archives, presence information (physical, psychological, and social), documents of all sorts, contacts (including [social graphs](https://en.wikipedia.org/wiki/Social_graph "Social graph")), search results, and advertising and marketing relevance derived from them. In this regard, [semantics](https://en.wikipedia.org/wiki/Semantics "Semantics") focuses on the organization of and action upon [information](https://en.wikipedia.org/wiki/Information "Information") by acting as an intermediary between heterogeneous data sources, which may confl

In [None]:
# Index the filtered chunks in a Chroma collection named "rag-chroma",
# embedding them with GPT4AllEmbeddings.
vectorstore = Chroma.from_documents(
    filtered_docs,
    embedding=GPT4AllEmbeddings(),
    collection_name="rag-chroma",
)