In [3]:
FILE_PATH="/Users/gmanvel/repos/rag-fast-flow/data/fast_flow.pdf"
JSON_OUTPUT_PATH="/Users/gmanvel/repos/rag-fast-flow/data/fast_flow_extracted.json"
%pip install PyMuPDF

Note: you may need to restart the kernel to use updated packages.


In [4]:
import fitz
import re

HEADER_SIZE = 34
SECTION_SIZE = 18
CONTENT_SIZE = 13
TOL = 0

def dominant_size(block):
    sizes = []
    for line in block.get("lines", []):
        for span in line.get("spans", []):
            sizes.append(span.get("size"))
    if not sizes:
        return None
    rounded = [round(s, 1) for s in sizes if s is not None]
    if not rounded:
        return None
    freq = {}
    for s in rounded:
        freq[s] = freq.get(s, 0) + 1
    return max(freq.items(), key=lambda kv: kv[1])[0]

def block_text(block):
    parts = []
    for line in block.get("lines", []):
        for span in line.get("spans", []):
            parts.append(span.get("text", ""))
    text = "".join(parts)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\s*\n\s*", "\n", text)
    return text.strip()

def classify(size):
    if size is None:
        return None
    if abs(size - HEADER_SIZE) <= TOL:
        return "header"
    if abs(size - SECTION_SIZE) <= TOL:
        return "section"
    if abs(size - CONTENT_SIZE) <= TOL:
        return "content"
    return None

def sanitize_str(s):
    if s is None:
        return s
    return s.encode("utf-8", "replace").decode("utf-8")

def sanitize(obj):
    if isinstance(obj, dict):
        return {sanitize(k): sanitize(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [sanitize(x) for x in obj]
    if isinstance(obj, str):
        return sanitize_str(obj)
    return obj

In [5]:
doc = fitz.open(FILE_PATH)

point = []
current_header = None
current_section = None

for page in doc[2:64]:
    data = page.get_text("dict")
    blocks = data.get("blocks", [])
    blocks_sorted = sorted(blocks, key=lambda b: (b.get("bbox", [0,0,0,0])[1], b.get("bbox", [0,0,0,0])[0]))
    for b in blocks_sorted:
        if b.get("type") != 0:
            continue
        size = dominant_size(b)
        kind = classify(size)
        if kind is None:
            continue
        text = block_text(b)
        if not text:
            continue
        if kind == "header":
            current_header = {"header": text, "sections": []}
            point.append(current_header)
            current_section = None
        elif kind == "section":
            if current_header is None:
                current_header = {"header": "", "sections": []}
                point.append(current_header)
            current_section = {"tile": text, "content": ""}
            current_header["sections"].append(current_section)
        elif kind == "content":
            if current_section is None:
                if current_header is None:
                    current_header = {"header": "", "sections": []}
                    point.append(current_header)
                current_section = {"tile": "", "content": ""}
                current_header["sections"].append(current_section)
            if current_section["content"]:
                current_section["content"] += text #"\n" + text
            else:
                current_section["content"] = text

for h in point:
    h["header"] = h["header"].strip()
    cleaned_sections = []
    for s in h["sections"]:
        s["tile"] = s.get("tile", "").strip()
        s["content"] = s.get("content", "").strip()
        if s["tile"] or s["content"]:
            cleaned_sections.append(s)
    h["sections"] = cleaned_sections

point = sanitize(point)

#result

# import json
# with open(JSON_OUTPUT_PATH, "w", encoding="utf-8") as f:
#     json.dump(result, f, ensure_ascii=False, indent=2)

In [6]:
%pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


In [7]:
import tiktoken

enc = tiktoken.encoding_for_model("gpt-4-turbo")

def count_tokens(text: str) -> int:
    if not text:
        return 0
    return len(enc.encode(text))

for header in point:    
    for section in header["sections"]:
        section["token_count"] = count_tokens(section.get("content", ""))

point

[{'header': 'Introduction',
  'sections': [{'tile': '',
    'content': 'Matthew Skelton, Co-author of Team TopologiesIn the world of modern software development, speed is a major differentiator. The arrival of cloud computing has transformed the way in which software is developed and has substantially reduced delivery times. Any organization that cannot deliver (and sense the market) fast enough will struggle to compete, and therefore achieving a fast flow of change is essential. Business agility and faster software delivery requires organizations to not only consider the technical aspects of software development but also the social structures and team interactions. Effective flow of value requires an understanding of boundaries between domains, something that Domain Driven Design (DDD) has been helping to achieve for many years. However, it is also important to understand the dependencies and interactions between the teams that own those domains. This is something that can be achieved

In [8]:
%pip install langchain
%pip install langchain_experimental
%pip install langchain_openai

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
import json

with open(JSON_OUTPUT_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract all content from the sections
texts = []
for header in data:
    for section in header["sections"]:
        if section.get("content"):
            texts.append(section["content"])

print(f"Extracted {len(texts)} text sections.")

text_splitter = SemanticChunker(OpenAIEmbeddings())
documents = text_splitter.create_documents(texts=texts, metadatas=[{} for _ in texts])
print(f"Created {len(documents)} documents.")

Extracted 42 text sections.


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
%pip install langchain-community

In [None]:
import json
import numpy as np
from langchain_community.embeddings import OllamaEmbeddings

# Initialize OllamaEmbeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Load the JSON data
with open(JSON_OUTPUT_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Process sections and create embeddings
sections_with_embeddings = []

for header in data:
    for section in header["sections"]:
        if section.get("tile"):  # Only process sections with titles
            # Create embedding for the content
            embedding_vector = embeddings.embed_query(section["content"])
            
            # Create a section object with title, text, and embedding
            section_obj = {
                "title": section["tile"],
                "text": section["content"],
                "embedding": embedding_vector
            }
            
            sections_with_embeddings.append(section_obj)

print(f"Created embeddings for {len(sections_with_embeddings)} sections")

Created embeddings for 54 sections


In [None]:
len(sections_with_embeddings[0]["embedding"])

768

In [None]:
# Save to JSON file
output_file = "/Users/gmanvel/repos/rag-fast-flow/data/sections_with_embeddings.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(sections_with_embeddings, f, ensure_ascii=False, indent=2)

print(f"Saved embeddings to {output_file}")

Saved embeddings to /Users/gmanvel/repos/rag-fast-flow/data/sections_with_embeddings.json


In [None]:
%pip install qdrant-client

In [None]:
from qdrant_client import QdrantClient

# Connect to local Qdrant instance
client = QdrantClient(host="localhost", port=6333)

# Verify connection
print(f"Connected to Qdrant at localhost:6333")
print(f"Collections: {client.get_collections()}")

Connected to Qdrant at localhost:6333
Collections: collections=[]


In [None]:
from qdrant_client.models import Distance, VectorParams

collection_name = "fast_flow_sections"

# Recreate collection (delete if exists)
try:
    client.delete_collection(collection_name=collection_name)
    print(f"Deleted existing collection '{collection_name}'")
except Exception as e:
    print(f"No existing collection to delete: {e}")

# Create new collection
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

print(f"Created collection '{collection_name}' with vector size 768 and cosine distance")

Deleted existing collection 'fast_flow_sections'
Created collection 'fast_flow_sections' with vector size 768 and cosine distance


In [None]:
from qdrant_client.models import PointStruct
import json

# Load embeddings from JSON
embeddings_file = "/Users/gmanvel/repos/rag-fast-flow/data/sections_with_embeddings.json"
with open(embeddings_file, 'r', encoding='utf-8') as f:
    sections_data = json.load(f)

# Prepare points for insertion
points = []
for idx, section in enumerate(sections_data):
    point = PointStruct(
        id=idx,
        vector=section["embedding"],
        payload={
            "title": section["title"],
            "text": section["text"]
        }
    )
    points.append(point)

# Insert points into Qdrant
client.upsert(
    collection_name=collection_name,
    points=points
)

print(f"Successfully inserted {len(points)} points into '{collection_name}' collection")

Successfully inserted 54 points into 'fast_flow_sections' collection


In [None]:
# Verify insertion by getting collection info
collection_info = client.get_collection(collection_name=collection_name)
print(f"Collection '{collection_name}' info:")
print(f"  - Total points: {collection_info.points_count}")
print(f"  - Vector size: {collection_info.config.params.vectors.size}")
print(f"  - Distance metric: {collection_info.config.params.vectors.distance}")

# Test a simple search with the first embedding
if len(sections_data) > 0:
    print("\nTesting search with first section's embedding...")
    search_results = client.search(
        collection_name=collection_name,
        query_vector=sections_data[0]["embedding"],
        limit=3
    )
    
    print(f"\nTop 3 similar sections:")
    for i, point in enumerate(search_results, 1):
        print(f"\n{i}. Score: {point.score:.4f}")
        print(f"   Title: {point.payload['title']}")
        print(f"   Text preview: {point.payload['text'][:100]}...")

In [None]:
query_vector = embeddings.embed_query("What is the Wardley doctrine?")
waldey_doctrine = client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=3,
    with_payload=True
)

print(f"\nTop 3 similar sections:")
for i, point in enumerate(waldey_doctrine.points, 1):
    print(f"\n{i}. Score: {point.score:.4f}")
    print(f"   Title: {point.payload['title']}")
    print(f"   Text preview: {point.payload['text']}...")

In [None]:
%pip install llama_index
%pip install llama_index-embeddings-ollama

In [22]:
from llama_index.core import Document
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.ollama import OllamaEmbedding

In [24]:
from qdrant_client.models import Distance, VectorParams
from qdrant_client.models import PointStruct

collection_name = "fast_flow"

# Create new collection
# client.create_collection(
#     collection_name=collection_name,
#     vectors_config=VectorParams(size=768, distance=Distance.COSINE),
# )
ollama_embeddings = OllamaEmbedding(model_name="nomic-embed-text")
splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=70,
    embed_model=ollama_embeddings
)

points: list[PointStruct] = []
for index, section in enumerate(sections_data):
    #print(f"Processing section title: {section['title']}")
    #print(f"Content:{section['text']}")
    nodes = splitter.get_nodes_from_documents(documents=[Document(text=section["text"])])
    chunks = [(node.embedding, node.get_content()) for node in nodes]

    #print(f"Number of chunks created: {len(chunks)}")
    for inner_index, (_, content) in enumerate(chunks):
        if not content.strip() or content.strip() == "Summary":
            continue
        emb = ollama_embeddings.get_text_embedding(content)
        point = PointStruct(
            id=index*10 + inner_index,
            vector=emb,
            payload={
                "title": section["title"],
                "text": content
            }
        )
        points.append(point)
        #print(f"Embedding (first 5 values): {emb[:5]}")
        #print(f"Content: {content}\n")

client.upsert(
    collection_name=collection_name,
    points=points
)

NameError: name 'sections_data' is not defined

In [None]:
from qdrant_client.models import QueryResponse

query_vector = ollama_embeddings.get_text_embedding("What is the Wardley doctrine?")
waldey_doctrine: QueryResponse = client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=3
)
print(f"\nTop 3 similar sections:")
for i, point in enumerate(sorted(waldey_doctrine.points, key=lambda p: p.score, reverse=True), 1):
    print(f"\n{i}. Score: {point.score:.4f}")
    print(f"   Title: {point.payload['title']}")
    print(f"   Text preview: {point.payload['text']}...")

In [None]:
#waldey_doctrine.points[1].payload['text']
%pip install -U llama-index-core==0.11.15 llama-index-llms-ollama==0.2.1

In [11]:
import importlib.metadata

for pkg in [
    "llama-index-core",
    "llama-index-llms-ollama",
    "llama-index-embeddings-ollama",
    "llama-index",
]:
    try:
        print(pkg, importlib.metadata.version(pkg))
    except importlib.metadata.PackageNotFoundError:
        print(pkg, "NOT INSTALLED")


llama-index-core 0.14.5
llama-index-llms-ollama 0.8.0
llama-index-embeddings-ollama 0.8.3
llama-index 0.14.5


In [12]:
from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage, MessageRole


llm = Ollama(model="mistral", temperature=0)

messages = [
    ChatMessage(
        role=MessageRole.SYSTEM,
        content="You are a consultant specializing in fast flow methodologies using the Wardley mappings, DDD & Team Topologies. You are hired to consult and explain concepts around fast flow concepts."
    ),
    ChatMessage(
        role=MessageRole.USER,
        content="What is the Wardley doctrine?"
    )
]

response = llm.chat(messages)
print(f"Response from LLM: {response.message.content}")

Response from LLM:  The Wardley Doctrine is a framework developed by Simon Wardley for understanding, mapping, and navigating the evolution of different types of services or products within an organization's value chain. It provides a strategic lens to help organizations make informed decisions about where to focus their efforts for innovation, optimization, and investment.

The Wardley Doctrine is based on four key principles:

1. Mapping: Visualizing the components of a system and their evolutionary stage (genesis, custom built, productized, or infrastructure) using Wardley Maps. This helps organizations understand the competitive landscape, identify opportunities for innovation, and make informed decisions about where to invest.
2. Positioning: Understanding the position of each component within the value chain and making strategic decisions about whether to build, buy, or avoid it. This helps organizations optimize their resources and focus on areas that provide the most value.
3. 

In [20]:
from llama_index.embeddings.ollama import OllamaEmbedding
from qdrant_client import QdrantClient
from qdrant_client.models import QueryResponse

collection_name = "fast_flow"

# Connect to local Qdrant instance
client = QdrantClient(host="localhost", port=6333)
ollama_embeddings = OllamaEmbedding(model_name="nomic-embed-text")

user_query = "What is the Doctrine in Wardley Maps?"
query_vector = ollama_embeddings.get_text_embedding(user_query)
wardley_doctrine: QueryResponse = client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=1
)

messages = [
    ChatMessage(
        role=MessageRole.SYSTEM,
        content= (
            f"You are a consultant specializing in fast flow methodologies using the Wardley mappings, DDD & Team Topologies.\n"
            f"You are hired to consult and explain concepts around fast flow concepts."
            f"Context from your notes:{wardley_doctrine.points[0].payload['text']}"
        )

    ),
    ChatMessage(
        role=MessageRole.USER,
        content=user_query
    )
]

response = llm.chat(messages)
print(f"Response from LLM: {response.message.content}")

Response from LLM:  In Wardley Maps, the Doctrine refers to the set of assumptions or beliefs that underpin a particular domain or landscape. These doctrines are often deeply ingrained and can be difficult to change, as they influence how people think about problems and solutions within the domain.

The Doctrine is represented on the Wardley Map as a horizontal line at the top, above the Landscape, Value Chain, and Episodes layers. It's important to understand the doctrines in a given landscape because they can significantly impact the strategies for evolution and transformation. By identifying and challenging limiting doctrines, it may be possible to find new ways to improve the landscape and achieve faster flow.

For example, in a traditional IT landscape, a common doctrine might be that all applications must be built and maintained internally. Challenging this doctrine could lead to exploring options for outsourcing or using cloud-based solutions, which could help streamline process