In [25]:
import os
import copy
from langchain.document_loaders import WebBaseLoader
import bs4

In [30]:
# load
web_paths = ("https://adtechbook.clearcode.cc/introduction/",)

# bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))
loader = WebBaseLoader(
    web_paths=web_paths,
    # bs_kwargs={"parse_only": bs4_strainer},
)
_docs = loader.load()

In [31]:
import re
def clean_text(text:str):
    cleaned = re.sub("\n+", "\n", text)
    cleaned = re.sub("\t+", "\t", cleaned)
    cleaned = cleaned.strip()
    return cleaned

In [34]:
docs = copy.deepcopy(_docs)
# print(docs)
for doc in docs:
    doc.page_content = clean_text(doc.page_content)
print(docs)

[Document(metadata={'source': 'https://adtechbook.clearcode.cc/introduction/', 'title': '01. Introduction | The AdTech Book By Clearcode', 'description': 'The AdTech Book has been contributed to and written by various team members of Clearcode — a full-service software development house that specializes in building AdTech and MarTech platforms.', 'language': 'en-US'}, page_content='01. Introduction | The AdTech Book By Clearcode\n \nAsk us a question\n01.\n\tIntroduction\n\t\n02.\n\tAdvertising Basics\n\t\n03.\n\tThe History of Digital Advertising Technology\n\t\n04.\n\tThe Main Technology Platforms and Intermediaries in the Digital Advertising Ecosystem\n\t\n05.\n\tThe Main Digital Advertising Mediums and Channels\n\t\n06.\n\tAd Serving\n\t\n07.\n\tAd Targeting and Budget Control\n\t\n08.\n\tTracking and Reporting Impressions, Clicks, and Conversions in AdTech Platforms\n\t\n09.\n\tMedia-Buying Methods: Programmatic, Real-Time Bidding (RTB), Header Bidding, and PMP\n\t\n10.\n\tUser Id

In [27]:
# splits
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

len(all_splits)

14

In [28]:
all_splits

[Document(metadata={'source': 'https://adtechbook.clearcode.cc/introduction/', 'title': '01. Introduction | The AdTech Book By Clearcode', 'description': 'The AdTech Book has been contributed to and written by various team members of Clearcode — a full-service software development house that specializes in building AdTech and MarTech platforms.', 'language': 'en-US', 'start_index': 0}, page_content='01. Introduction | The AdTech Book By Clearcode\n \nAsk us a question\n01.\n\tIntroduction\n\t\n02.\n\tAdvertising Basics\n\t\n03.\n\tThe History of Digital Advertising Technology\n\t\n04.\n\tThe Main Technology Platforms and Intermediaries in the Digital Advertising Ecosystem\n\t\n05.\n\tThe Main Digital Advertising Mediums and Channels\n\t\n06.\n\tAd Serving\n\t\n07.\n\tAd Targeting and Budget Control\n\t\n08.\n\tTracking and Reporting Impressions, Clicks, and Conversions in AdTech Platforms\n\t\n09.\n\tMedia-Buying Methods: Programmatic, Real-Time Bidding (RTB), Header Bidding, and PMP\n

In [40]:
# store
from langchain_chroma import Chroma
from langchain_community.embeddings.huggingface import HuggingFaceBgeEmbeddings

embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5")
persist_directory = ""

vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="./.chroma")

In [53]:
# retrieve
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
retrieved_docs = retriever.invoke("Who is this Book for?")
retrieved_docs

[Document(metadata={'description': 'The AdTech Book has been contributed to and written by various team members of Clearcode — a full-service software development house that specializes in building AdTech and MarTech platforms.', 'language': 'en-US', 'source': 'https://adtechbook.clearcode.cc/introduction/', 'start_index': 3448, 'title': '01. Introduction | The AdTech Book By Clearcode'}, page_content='And thus the idea of The AdTech Book was created.\nWho Is The AdTech Book Written For?\nThe AdTech Book is ideal for anyone wanting to learn about the history of online advertising and understand how the different elements of the digital advertising technology ecosystem work, what their roles are, and the relationships between different parties in the industry.\nEven though the book contains a lot of highly technical and detailed explanations, we’ve tried to write the AdTech Book in the most straightforward way possible so that anyone can read and understand the contents of the book.\nMo

In [54]:
print(retrieved_docs[0].page_content)

And thus the idea of The AdTech Book was created.
Who Is The AdTech Book Written For?
The AdTech Book is ideal for anyone wanting to learn about the history of online advertising and understand how the different elements of the digital advertising technology ecosystem work, what their roles are, and the relationships between different parties in the industry.
Even though the book contains a lot of highly technical and detailed explanations, we’ve tried to write the AdTech Book in the most straightforward way possible so that anyone can read and understand the contents of the book.
More specifically, the book will greatly benefit:
C-level executives and founders at advertising and marketing companies.
Advertisers and marketers who work in-house or at advertising agencies. 
Programmers and technical teams that build advertising and marketing technology.
Publishers and content creators who monetize their content with advertising.


In [63]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "human",
            (
                "You are an assistant for question-answering tasks. "
                "Use the following pieces of retrieved context to answer the question. "
                "If you don't know the answer, just say that you don't know. "
                "Use three sentences maximum and keep the answer concise. \n"
                "Question: {question} \n"
                "Context: {context} \n"
                "Answer:"
            ),
        ),
    ]
)

In [66]:
example_messages = prompt.invoke({
    "context": "filler context",
    "question": "filler question"
}).to_messages()
print(example_messages[0].content)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. 
Question: filler question 
Context: filler context 
Answer:


In [86]:
from langchain_community.chat_models.ollama import ChatOllama
from langchain_core.language_models.chat_models import BaseChatModel

llm = ChatOllama(model_name="phi3")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def invoke_chat(user_input: str, k=3, llm: BaseChatModel = None):
    context = retriever.invoke(user_input, k=k)
    context = format_docs(context)
    message = prompt.invoke({
        "context": context,
        "question": user_input
    })
    if llm:
        message = llm.invoke(message)
    return message

In [87]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt
)

In [88]:
rag_chain.invoke("who is this book for?")

ChatPromptValue(messages=[HumanMessage(content="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. \nQuestion: who is this book for? \nContext: And thus the idea of The AdTech Book was created.\nWho Is The AdTech Book Written For?\nThe AdTech Book is ideal for anyone wanting to learn about the history of online advertising and understand how the different elements of the digital advertising technology ecosystem work, what their roles are, and the relationships between different parties in the industry.\nEven though the book contains a lot of highly technical and detailed explanations, we’ve tried to write the AdTech Book in the most straightforward way possible so that anyone can read and understand the contents of the book.\nMore specifically, the book will greatly benefit:\nC-level executives and fou

In [93]:
response = invoke_chat("who wrote this book?", k=3, llm=llm)

In [94]:
print(response.content)

The book "The AdTech Book" was written by Clearcode Services S.A. for various audiences, including C-level executives, advertisers, marketers, programmers, and content creators. The book aims to provide a comprehensive understanding of the history of online advertising, the different platforms and processes involved, and the relationships between various parties in the industry. The authors, Piotr Banaszczyk and Tomasz Chmielewski, are experienced professionals in the AdTech industry and created the book as a resource for those looking to learn about the subject matter.
