In [None]:
!pip install langchain_community chromadb tiktoken unstructured umap hdbscan langchain_openai  umap-learn

In [None]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough

import numpy as np
import umap
import tiktoken
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from typing import List, Dict, Any



In [None]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

import os

# Read the API key from the environment instead of keeping it in the notebook; the
# fallback is the placeholder string this cell originally contained.
_openai_api_key = os.environ.get("OPENAI_API_KEY", "openai_api_key")

# Shared OpenAI clients: an embedding model and a temperature-0 GPT-4 chat model
# whose completions are capped at 512 tokens.
embedding_model = OpenAIEmbeddings(openai_api_key=_openai_api_key)
model = ChatOpenAI(max_tokens=512, openai_api_key=_openai_api_key, temperature=0, model="gpt-4")

In [None]:
# Configuration
import os

# Read the API key from the environment instead of keeping it in the notebook; the
# fallback is the placeholder string this cell originally contained.
_openai_api_key = os.environ.get("OPENAI_API_KEY", "openai_api_key")

CONFIG = {
    # Passed to RecursiveCharacterTextSplitter when splitting the loaded documents.
    "chunk_size": 1000,
    "chunk_overlap": 200,
    # Used to embed nodes while building the tree and to index them in Chroma.
    "embedding_model": OpenAIEmbeddings(openai_api_key=_openai_api_key),
    # Used both for cluster summaries and for the final answer.
    "llm": ChatOpenAI(openai_api_key=_openai_api_key, model="gpt-4", temperature=0),
    # Upper bound on the number of summarization passes in build_raptor_tree.
    "max_levels": 3,
    # Forwarded to HDBSCAN as cluster_selection_epsilon.
    "cluster_threshold": 0.3
}



In [None]:
def hierarchical_raptor(data_dir: str = 'data', glob: str = "**/*.txt") -> RunnablePassthrough:
    """Build the complete pipeline: load, split, summarize into a tree, index, and wire the chain.

    Args:
        data_dir: Directory handed to DirectoryLoader. Defaults to 'data'.
        glob: File pattern handed to DirectoryLoader. Defaults to "**/*.txt".

    Returns:
        The runnable produced by build_retrieval_chain; invoke it with a question string.

    Raises:
        ValueError: If loading and splitting produce no chunks, so there is nothing to index.
    """
    # Load and split documents
    loader = DirectoryLoader(data_dir, glob=glob)
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CONFIG['chunk_size'],
        chunk_overlap=CONFIG['chunk_overlap'],
    )
    chunks = text_splitter.split_documents(docs)

    # Fail early with a clear message instead of erroring deep inside the
    # embedding / clustering / indexing steps.
    if not chunks:
        raise ValueError(
            f"No text chunks were produced from {data_dir!r} (glob {glob!r}); nothing to index."
        )

    # Build RAPTOR tree
    tree = build_raptor_tree(chunks, CONFIG['max_levels'])

    # Create hierarchical vector store
    vector_store = create_hierarchical_vectorstore(tree)

    # Build retrieval chain
    return build_retrieval_chain(vector_store)



In [None]:
def build_raptor_tree(chunks: List[Any], max_levels: int) -> Dict:
    """Build a bottom-up tree of cluster summaries over the given chunks.

    Each pass embeds the current nodes with CONFIG['embedding_model'], reduces
    and clusters the embeddings, and replaces every cluster by one summary
    node. Pass results are stored in order under tree["levels"].

    Args:
        chunks: Objects exposing `page_content` and `metadata` attributes.
        max_levels: Maximum number of summarization passes.

    Returns:
        {"levels": [...]}, where each level is a list of
        {"text": summary, "children": [member texts], "level": pass index}.
        Empty `chunks` yields {"levels": []}.
    """
    tree = {"levels": []}
    current_level = [{"text": c.page_content, "metadata": c.metadata} for c in chunks]

    for level in range(max_levels):
        if not current_level:
            # Nothing to embed or summarize.
            break

        if len(current_level) == 1:
            # A single node cannot be meaningfully clustered; treat it as one
            # cluster so it is summarized directly rather than being sent
            # through the reduction / clustering helpers.
            clusters = np.zeros(1, dtype=int)
        else:
            # Cluster documents
            embeddings = CONFIG['embedding_model'].embed_documents([d["text"] for d in current_level])
            reduced_embeds = reduce_dimensions(embeddings)
            clusters = np.asarray(cluster_documents(reduced_embeds))

        # Summarize clusters
        # NOTE(review): every distinct label becomes a cluster, including any
        # noise label the clusterer emits, so all noise points end up in one
        # shared summary. Confirm this is intended.
        summarized_clusters = []
        for cluster_id in np.unique(clusters):
            cluster_texts = [current_level[i]["text"] for i in np.where(clusters == cluster_id)[0]]
            summary = summarize_cluster(cluster_texts)
            summarized_clusters.append({
                "text": summary,
                "children": cluster_texts,
                "level": level
            })

        tree["levels"].append(summarized_clusters)
        current_level = summarized_clusters

        # Stop if only one cluster remains
        if len(summarized_clusters) == 1:
            break

    return tree



In [None]:
def reduce_dimensions(embeddings: List[List[float]]) -> np.ndarray:
    """Project embeddings with UMAP, or return them unchanged when fewer than two are given.

    Args:
        embeddings: One embedding vector per document.

    Returns:
        The UMAP projection of the embeddings, or `np.array(embeddings)` when
        fewer than two vectors are supplied.
    """
    sample_count = len(embeddings)

    if sample_count >= 2:
        # Neighbourhood size: at most 15, at least 2, otherwise one less than
        # the number of samples.
        neighbor_count = min(15, max(2, sample_count - 1))
        reducer = umap.UMAP(
            n_neighbors=neighbor_count,
            min_dist=0.1,
            metric='cosine'
        )
        return reducer.fit_transform(embeddings)

    return np.array(embeddings)



In [None]:
def cluster_documents(embeddings: np.ndarray) -> np.ndarray:
    """Assign a cluster label to every embedding using HDBSCAN.

    Args:
        embeddings: One row per document (typically the output of reduce_dimensions).

    Returns:
        An integer label per row. With fewer than two rows, every row gets
        label 0 and HDBSCAN is not run.
    """
    n_points = len(embeddings)

    # Density clustering is undefined for zero or one point; report a single
    # cluster so callers can still summarize what they have.
    if n_points < 2:
        return np.zeros(n_points, dtype=int)

    # Scale the minimum cluster size down for small inputs, but never below 2:
    # HDBSCAN does not accept a min_cluster_size of 0 or 1, which the previous
    # min(5, n // 2) produced for fewer than four points.
    min_cluster_size = max(2, min(5, n_points // 2))
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,
        gen_min_span_tree=True,
        cluster_selection_epsilon=CONFIG['cluster_threshold']
    )
    return clusterer.fit_predict(embeddings)



In [None]:
def summarize_cluster(texts: List[str], max_context_tokens: int = 6000) -> str:
    """Summarize a group of texts with CONFIG['llm'] while keeping each request within a token budget.

    If the texts fit in one request they are summarized together. Otherwise
    they are split into consecutive batches that each fit the budget, every
    batch is summarized, and the partial summaries are summarized again until
    a single request suffices.

    Args:
        texts: The documents belonging to one cluster.
        max_context_tokens: Approximate upper bound on the tokens placed into
            `{context}` per request. Counted with the cl100k_base encoding, so
            it is only an estimate for models using another tokenizer.

    Returns:
        The summary text produced by the LLM.
    """
    prompt_template = """Synthesize a comprehensive summary from these documents:
    {context}

    Include all key entities, relationships, and facts. Maintain technical specificity."""

    prompt = ChatPromptTemplate.from_template(prompt_template)
    chain = prompt | CONFIG['llm'] | StrOutputParser()

    encoder = tiktoken.get_encoding("cl100k_base")
    separator = "\n\n"

    def count_tokens(text: str) -> int:
        # disallowed_special=() makes special-token look-alikes in the text
        # encode as ordinary text instead of raising.
        return len(encoder.encode(text, disallowed_special=()))

    def clip(text: str) -> str:
        # Last-resort truncation so a request never exceeds the budget.
        tokens = encoder.encode(text, disallowed_special=())
        if len(tokens) <= max_context_tokens:
            return text
        return encoder.decode(tokens[:max_context_tokens])

    pending = list(texts)
    while True:
        # Greedily pack consecutive texts into batches that fit the budget.
        batches = []
        batch, batch_tokens = [], 0
        for text in pending:
            text_tokens = count_tokens(text)
            if batch and batch_tokens + text_tokens > max_context_tokens:
                batches.append(batch)
                batch, batch_tokens = [], 0
            batch.append(text)
            batch_tokens += text_tokens
        if batch:
            batches.append(batch)

        # Either everything fits in one request, or no two texts can share a
        # request (so another round could not reduce the count): finish with
        # one call, clipped to the budget.
        if len(batches) <= 1 or len(batches) >= len(pending):
            return chain.invoke({"context": clip(separator.join(pending))})

        # Strictly fewer items than before, so the loop terminates.
        pending = [chain.invoke({"context": clip(separator.join(group))}) for group in batches]



In [None]:
def create_hierarchical_vectorstore(tree: Dict, collection_name: str = "") -> Chroma:
    """Index every summary node of the tree in a Chroma vector store.

    Args:
        tree: The structure returned by build_raptor_tree ({"levels": [[node, ...], ...]}).
        collection_name: Chroma collection to write to. When empty, a fresh
            unique name is generated for this build.

    Returns:
        A Chroma store holding each node's text with {"level": ...} metadata.

    Raises:
        ValueError: If the tree contains no nodes.
    """
    import uuid

    # Flatten once so texts and metadatas are guaranteed to stay aligned.
    nodes = [node for level in tree["levels"] for node in level]
    if not nodes:
        raise ValueError("The RAPTOR tree contains no nodes to index.")

    # NOTE(review): without an explicit name Chroma.from_texts writes to its
    # default collection, so repeated builds in one kernel can accumulate
    # nodes from earlier runs. A unique name keeps each build isolated;
    # confirm against the installed Chroma version.
    target_collection = collection_name or f"raptor_{uuid.uuid4().hex}"

    return Chroma.from_texts(
        texts=[node["text"] for node in nodes],
        embedding=CONFIG['embedding_model'],
        metadatas=[{"level": node["level"]} for node in nodes],
        collection_name=target_collection,
    )



In [None]:
def build_retrieval_chain(vector_store: Chroma) -> RunnablePassthrough:
    """Assemble the question-answering chain on top of the given vector store.

    The chain retrieves up to 15 nodes whose `level` metadata is at most
    CONFIG['max_levels'] - 1, formats them with format_docs, fills the answer
    prompt, calls CONFIG['llm'], and parses the result to a string.

    Args:
        vector_store: The Chroma store to retrieve context from.

    Returns:
        A runnable that takes the question as its input.
    """
    answer_prompt = ChatPromptTemplate.from_template(
        """Answer using information from this context:
    {context}

    Question: {question}
    Provide a detailed, structured response."""
    )

    highest_level = CONFIG['max_levels'] - 1
    search_options = {
        "k": 15,
        "filter": {"level": {"$lte": highest_level}},
    }
    retriever = vector_store.as_retriever(search_kwargs=search_options)

    # The question is passed through untouched while the retriever output is
    # rendered into the context string.
    chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

    return chain_inputs | answer_prompt | CONFIG['llm'] | StrOutputParser()



In [None]:
def format_docs(docs):
    """Render retrieved documents as one string, each section labelled with its tree level.

    Each document contributes "Level <metadata['level']> Context:" followed by
    its page content; sections are separated by a blank line.
    """
    sections = [
        f"Level {doc.metadata['level']} Context:\n{doc.page_content}"
        for doc in docs
    ]
    return "\n\n".join(sections)

# Usage: build the full pipeline (load, split, build the tree, index), then ask
# one question and print the generated answer.
rag_chain = hierarchical_raptor()
print(rag_chain.invoke("What is the best layer 3 design?"))



BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 24699 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

OpenAI response:
The best Layer 3 design involves several key practices and considerations to optimize network performance and ensure reliable data transmission.

Firstly, implementing redundant architectures is crucial to prevent single points of failure. This enhances network resiliency and ensures that data transmission can continue even if one part of the network fails.

Secondly, careful selection of routing protocols is essential for efficiency. Routing protocols, such as Interior Gateway Protocols (IGPs) and Exterior Gateway Protocols (EGPs), determine the most efficient paths for data packets to travel from their source to their destination. IGPs operate within a single autonomous system, managing routing between internal routers, while EGPs operate between different autonomous systems and are crucial for managing the routing of data across the global internet.

Thirdly, adherence to structured hierarchical network models simplifies management and troubleshooting tasks. The hierarchical network model organizes the network into distinct layers: core, distribution, and access. This structure enhances fault isolation and backbone connectivity, and simplifies operational tasks.

Fourthly, addressing security considerations is paramount. It is vital to implement security practices that address potential vulnerabilities in routing protocols to safeguard data integrity and confidentiality. Mechanisms for key rollover and management can enhance the security of data in transit.

Fifthly, effective Layer 3 addressing schemes are foundational to the performance and manageability of a network. Properly structured IP addressing facilitates efficient routing and communication among various network segments. Layer 3 addressing schemes can support both static and dynamic routing.

Lastly, continuous monitoring and maintenance are necessary to ensure the network operates optimally and to address any issues promptly.

Emerging trends such as the adoption of Software-Defined Networking (SDN) and Network Function Virtualization (NFV), enhanced security measures, increased network visibility, and the adoption of Layer 3 switching are also shaping the future of Layer 3 design. These trends highlight the importance of adopting best practices to navigate the complexities of modern network environments.

SHOW TIME -- Let's see who is better

In [None]:
!pip install transformers huggingface_hub langchain

In [None]:
!pip install umap-learn



In [None]:
from umap import UMAP
import numpy as np
from umap import UMAP
import hdbscan
import tiktoken
import warnings

In [None]:
!pip install --upgrade bitsandbytes transformers peft accelerate datasets trl

In [None]:
!pip install -q huggingface-hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch, os
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)

In [None]:
from langchain.llms import HuggingFacePipeline
from langchain_core.runnables import RunnablePassthrough


In [None]:
from langchain_openai import OpenAIEmbeddings

import os

# Embeddings still come from OpenAI in the Llama run. The key is read from the
# environment; the fallback is the placeholder string this cell originally contained.
embedding_model = OpenAIEmbeddings(
    openai_api_key=os.environ.get("OPENAI_API_KEY", "openai_api_key")
)
# model = model_id

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
import os

# Load the Llama 3 checkpoint and its tokenizer from the Hugging Face Hub.
model_id = "eduard76/Llama3-8b-good"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model_hf = AutoModelForCausalLM.from_pretrained(model_id)

# Text-generation pipeline on device 0, capped at 512 newly generated tokens.
hf_pipeline = pipeline(
    "text-generation", model=model_hf, tokenizer=tokenizer, device=0, max_new_tokens=512
)

# LangChain wrapper around the pipeline.
hf_llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Pipeline configuration for the Llama run.
CONFIG = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    # Key read from the environment; the fallback is the placeholder string
    # this cell originally contained.
    "embedding_model": OpenAIEmbeddings(
        openai_api_key=os.environ.get("OPENAI_API_KEY", "openai_api_key")
    ),
    # Use the LLM wrapper directly. RunnablePassthrough(hf_llm) would hand the
    # chain's input on unchanged instead of returning the model's generation.
    "llm": hf_llm,
    "max_levels": 3,
    "cluster_threshold": 0.3
}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0
  hf_llm = HuggingFacePipeline(pipeline=hf_pipeline)


In [None]:
from langchain_core.runnables import RunnablePassthrough
# Use the LLM wrapper itself as the chain step. RunnablePassthrough(hf_llm)
# would hand the chain's input on unchanged instead of returning the model's
# generation.
CONFIG['llm'] = hf_llm

In [None]:
# Configuration
import os

CONFIG = {
    # Passed to RecursiveCharacterTextSplitter when splitting the loaded documents.
    "chunk_size": 1000,
    "chunk_overlap": 200,
    # Key read from the environment; the fallback is the placeholder string
    # this cell originally contained.
    "embedding_model": OpenAIEmbeddings(
        openai_api_key=os.environ.get("OPENAI_API_KEY", "openai_api_key")
    ),
    # Use the LLM wrapper directly. RunnablePassthrough(hf_llm) would hand the
    # chain's input on unchanged instead of returning the model's generation.
    "llm": hf_llm,
    # Upper bound on the number of summarization passes in build_raptor_tree.
    "max_levels": 3,
    # Forwarded to HDBSCAN as cluster_selection_epsilon.
    "cluster_threshold": 0.3
}

In [None]:
from langchain_core.runnables import RunnablePassthrough
# Configure the Hugging Face pipeline wrapper as the LLM. It is not wrapped in
# RunnablePassthrough afterwards: that wrapper hands its input on unchanged, so
# the chain would never receive the model's generation.
CONFIG['llm'] = HuggingFacePipeline(pipeline=hf_pipeline)

In [None]:
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

In [None]:
# Sanity check: show which class is currently configured as the LLM step.
print(f"CONFIG['llm'] type: {type(CONFIG['llm'])}")


CONFIG['llm'] type: <class 'langchain_core.runnables.passthrough.RunnablePassthrough'>


In [None]:
from langchain_core.runnables import RunnableLambda

def _invoke_hf_llm(x):
    """Call hf_llm with the text form of `x`: `x.to_string()` if available, else `str(x)`."""
    if hasattr(x, 'to_string'):
        return hf_llm.invoke(x.to_string())
    return hf_llm.invoke(str(x))


# Adapter that lets the text-generation LLM sit directly after a prompt in a chain.
CONFIG['llm'] = RunnableLambda(_invoke_hf_llm)


In [None]:
import umap
import umap.umap_ as umap  #
import numpy as np

In [None]:
!pip install hdbscan



In [None]:
from langchain.prompts import ChatPromptTemplate  #


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter  #
from langchain.document_loaders import DirectoryLoader  #
from langchain_core.runnables import RunnablePassthrough  #

def hierarchical_raptor() -> RunnablePassthrough:
    """Load the text files under 'data', build the summary tree, index it, and return the QA chain.

    Returns:
        The runnable produced by build_retrieval_chain.
    """
    # Read every .txt file below the data directory.
    source_documents = DirectoryLoader('data', glob="**/*.txt").load()

    # Split into overlapping chunks using the configured sizes.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CONFIG['chunk_size'],
        chunk_overlap=CONFIG['chunk_overlap'],
    )
    document_chunks = splitter.split_documents(source_documents)

    # Summarize bottom-up, index the summaries, and wire up retrieval.
    raptor_tree = build_raptor_tree(document_chunks, CONFIG['max_levels'])
    store = create_hierarchical_vectorstore(raptor_tree)
    return build_retrieval_chain(store)




In [None]:
from typing import List, Dict, Any  #
import numpy as np  #

def build_raptor_tree(chunks: List[Any], max_levels: int) -> Dict:
    """Repeatedly cluster and summarize nodes, collecting one list of summaries per pass.

    Args:
        chunks: Objects exposing `page_content` and `metadata` attributes.
        max_levels: Maximum number of cluster-and-summarize passes.

    Returns:
        {"levels": [...]}, where each level is a list of
        {"text": summary, "children": [member texts], "level": pass index}.
    """
    levels = []
    nodes = [{"text": chunk.page_content, "metadata": chunk.metadata} for chunk in chunks]

    for depth in range(max_levels):
        # Embed, reduce, and cluster the texts of the current nodes.
        node_texts = [node["text"] for node in nodes]
        vectors = CONFIG['embedding_model'].embed_documents(node_texts)
        labels = cluster_documents(reduce_dimensions(vectors))

        # One summary node per distinct label.
        summaries = []
        for label in np.unique(labels):
            member_positions = np.where(labels == label)[0]
            member_texts = [nodes[position]["text"] for position in member_positions]
            summaries.append({
                "text": summarize_cluster(member_texts),
                "children": member_texts,
                "level": depth
            })

        levels.append(summaries)
        nodes = summaries

        # A single summary is the root; no further passes are needed.
        if len(summaries) == 1:
            break

    return {"levels": levels}



In [None]:
def reduce_dimensions(embeddings: List[List[float]]) -> np.ndarray:
    """Project embeddings with UMAP, or return them unchanged when fewer than two are given.

    Args:
        embeddings: One embedding vector per document.

    Returns:
        The UMAP projection of the embeddings, or `np.array(embeddings)` when
        fewer than two vectors are supplied.
    """
    n_samples = len(embeddings)
    if n_samples < 2:
        return np.array(embeddings)

    # Cap the neighbourhood at 15 and keep it below the sample count where
    # possible, but never under 2: with exactly two samples the previous
    # min(15, n - 1) produced n_neighbors=1, which UMAP rejects.
    n_neighbors = min(15, max(2, n_samples - 1))

    return umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=0.1,
        metric='cosine'
    ).fit_transform(embeddings)



In [None]:
from hdbscan import HDBSCAN  #
import numpy as np

In [None]:
def cluster_documents(embeddings: np.ndarray) -> np.ndarray:
    """Assign a cluster label to every embedding using HDBSCAN.

    Args:
        embeddings: One row per document (typically the output of reduce_dimensions).

    Returns:
        An integer label per row. With fewer than two rows, every row gets
        label 0 and HDBSCAN is not run.
    """
    n_points = len(embeddings)

    # Density clustering is undefined for zero or one point; report a single
    # cluster so callers can still summarize what they have.
    if n_points < 2:
        return np.zeros(n_points, dtype=int)

    clusterer = HDBSCAN(
        min_cluster_size=5,
        gen_min_span_tree=True,
        cluster_selection_epsilon=CONFIG['cluster_threshold']
    )
    return clusterer.fit_predict(embeddings)



In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import StrOutputParser

def summarize_cluster(texts: List[str]) -> str:
    """Ask CONFIG['llm'] for one summary covering all of the given texts.

    Args:
        texts: The documents belonging to one cluster; joined with blank lines
            to form the prompt context.

    Returns:
        The LLM output parsed to a string.

    Raises:
        TypeError: If CONFIG['llm'] is a plain string rather than a runnable.
    """
    # Guard against a misconfigured LLM entry before building the chain.
    llm = CONFIG['llm']
    if isinstance(llm, str):
        raise TypeError(f"CONFIG['llm'] is a string. Expected a Runnable. Value: {llm}")

    def call_llm(x):
        # Hand the LLM plain text: the prompt value's string form when it has
        # one, otherwise str(). CONFIG['llm'] is looked up at call time.
        if hasattr(x, 'to_string'):
            return CONFIG['llm'].invoke(x.to_string())
        return CONFIG['llm'].invoke(str(x))

    prompt_template = """Synthesize a comprehensive summary from these documents:
    {context}

    Include all key entities, relationships, and facts. Maintain technical specificity."""
    summary_prompt = ChatPromptTemplate.from_template(prompt_template)

    chain = summary_prompt | call_llm | StrOutputParser()

    joined_texts = "\n\n".join(texts)
    return chain.invoke({"context": joined_texts})



In [None]:
from langchain.vectorstores import Chroma  #


In [None]:
from typing import List, Dict, Any  #
from langchain.vectorstores import Chroma  #

def create_hierarchical_vectorstore(tree: Dict) -> Chroma:
    """Index every node of the tree in a Chroma store, tagging each with its level.

    Args:
        tree: The structure returned by build_raptor_tree ({"levels": [[node, ...], ...]}).

    Returns:
        A Chroma store built from the node texts with {"level": ...} metadata,
        embedded with CONFIG['embedding_model'].
    """
    # Walk the levels in order so texts and metadata stay aligned.
    flat_nodes = [node for level_nodes in tree["levels"] for node in level_nodes]

    node_texts = [node["text"] for node in flat_nodes]
    node_metadata = [{"level": node["level"]} for node in flat_nodes]

    return Chroma.from_texts(
        texts=node_texts,
        embedding=CONFIG['embedding_model'],
        metadatas=node_metadata
    )


In [None]:
def build_retrieval_chain(vector_store: Chroma) -> RunnablePassthrough:
    """Assemble the question-answering chain on top of the given vector store.

    The chain retrieves up to 15 nodes whose `level` metadata is at most
    CONFIG['max_levels'] - 1, formats them with format_docs, fills the answer
    prompt, sends its text form to CONFIG['llm'], and parses the result to a string.

    Args:
        vector_store: The Chroma store to retrieve context from.

    Returns:
        A runnable that takes the question as its input.
    """
    def call_llm(x):
        # Hand the LLM plain text: the prompt value's string form when it has
        # one, otherwise str(). CONFIG['llm'] is looked up at call time.
        if hasattr(x, 'to_string'):
            return CONFIG['llm'].invoke(x.to_string())
        return CONFIG['llm'].invoke(str(x))

    level_filter = {"level": {"$lte": CONFIG['max_levels']-1}}
    retriever = vector_store.as_retriever(
        search_kwargs={"k": 15, "filter": level_filter}
    )

    prompt_template = """Answer using information from this context:
    {context}

    Question: {question}
    Provide a detailed, structured response."""
    answer_prompt = ChatPromptTemplate.from_template(prompt_template)

    # The question is passed through untouched while the retriever output is
    # rendered into the context string.
    chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

    return chain_inputs | answer_prompt | call_llm | StrOutputParser()



In [None]:
def format_docs(docs):
    """Join retrieved documents into one context string with a level header per document.

    Each document becomes "Level <metadata['level']> Context:" followed by its
    page content on the next line; documents are separated by a blank line.
    """
    rendered = []
    for doc in docs:
        header = f"Level {doc.metadata['level']} Context:"
        rendered.append(header + "\n" + f"{doc.page_content}")
    return "\n\n".join(rendered)

# Usage: rebuild the pipeline with the currently configured LLM, then ask the
# same question and print the generated answer.
rag_chain = hierarchical_raptor()
print(rag_chain.invoke("What is the best layer 3 design?"))



Human: Answer using information from this context:
    Level 0 Context:
Layer 3 design practices are crucial for optimizing network performance and ensuring reliable data transmission across complex infrastructures. Operating within the Network Layer of the OSI model, Layer 3 is responsible for routing data packets using logical addressing schemes, such as IP addresses. Effective Layer 3 design is particularly important in large and dynamic environments where scalability, redundancy, and security are paramount.

Key aspects of Layer 3 design include the implementation of redundant architectures to prevent single points of failure, careful selection of routing protocols for efficiency, and adherence to structured hierarchical network models. These practices enhance network resiliency and simplify management and troubleshooting tasks. However, challenges such as configuration complexity, security vulnerabilities, and the need for continuous monitoring remain significant concerns for netw

Response from Llama: Provide a detailed, structured response. Explain the advantages of the recommended layer 3 design. Discuss the importance of redundancy and high availability in layer 3 designs. Highlight the benefits of using layer 3 switches and routing protocols. Emphasize the role of hierarchical network models in efficient data transmission. Describe the importance of security considerations in layer 3 designs. Mention the relevance of continuous monitoring and maintenance in layer 3 designs. Provide a conclusion summarizing the key points. Use appropriate language and tone. Avoid personal opinions or bias. Ensure accuracy and correctness. Verify the information with reliable sources. Provide sufficient context and background information. Align with the target audience's knowledge and experience. Avoid unnecessary details. Ensure readability and clarity. Check for grammatical errors and inconsistencies. Ensure that the information is up-to-date and accurate