# Install Libs

In [None]:
%pip install langchain langchain-google-genai langchain_postgres wikiscraper "psycopg[binary,pool]"

# Imports

In [1]:
from dotenv import load_dotenv
import os

from langchain_core.documents import Document
from langchain_google_genai import (ChatGoogleGenerativeAI,
                                    GoogleGenerativeAIEmbeddings)

import re
import tiktoken
import wikipedia as wk
# import wikiscraper as ws

# Set Configs

In [2]:
print(load_dotenv("./../../../../../envs/invest.env"))
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

True


# Set-up Models

In [3]:
LLM = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest",
                             api_key=GOOGLE_API_KEY,
                             temperature=0,
                             )

EMBEDDING = GoogleGenerativeAIEmbeddings(model="models/embedding-001",
                                         google_api_key=GOOGLE_API_KEY)
# ENCODING = tiktoken.get_encoding("gpt2")

# Load Data

In [4]:
URL = "https://pt.wikipedia.org/wiki/Nikola_Tesla"

# Regular expression to match the language code
L_PATTERN = r"https://(.*?)\.wikipedia\.org/"
T_PATTERN = r"/wiki/([^#]+)"

l_match = re.search(L_PATTERN, URL)
t_match = re.search(T_PATTERN, URL)
if l_match:
    LANGUAGE_CODE = l_match.group(1)
    print(f"The language code is: {LANGUAGE_CODE}")
else:
    LANGUAGE_CODE = 'en'
    print("No language code found in the URL using default 'en'.")
if t_match:
    SLUG = t_match.group(1)
    TITLE = SLUG.replace("_"," ")
    print(f"The article slug is: {SLUG}")
    print(f"The article title is: {TITLE}")
else:
    print("No slug found in the URL, please provide a full URL of an article.")

wk.set_lang(LANGUAGE_CODE)
# page = ws.searchBySlug(SLUG)
# TITLE = page.getTitle()

# Specify the title of the Wikipedia page
wiki = wk.page(TITLE)

# Extract the plain text content of the page, excluding images, tables, and other data.
RAW_TEXT = wiki.content
RAW_DOC = Document(page_content=RAW_TEXT)

print(RAW_DOC.page_content[:100])

The language code is: pt
The article slug is: Nikola_Tesla
The article title is: Nikola Tesla
Nikola Tesla (em sérvio: Никола Тесла; pronunciação sérvia: [nǐkola têsla]; Smiljan, Império Austría


# Split Data

In [6]:
from langchain.text_splitter import TokenTextSplitter

text_splitter = TokenTextSplitter(chunk_size=2048,
                                  chunk_overlap=100)
DOCS = text_splitter.split_documents([RAW_DOC])

print(len(DOCS))
print(DOCS[0].page_content)

12
Nikola Tesla (em sérvio: Никола Тесла; pronunciação sérvia: [nǐkola têsla]; Smiljan, Império Austríaco, 10 de julho de 1856 — Nova Iorque, 7 de janeiro de 1943) foi um inventor, engenheiro eletrotécnico e engenheiro mecânico sérvio, mais conhecido por suas contribuições ao projeto do moderno sistema de fornecimento de eletricidade em corrente alternada (CA).
Nascido e criado no Império Austríaco, Tesla estudou engenharia e física na década de 1870 sem se formar, e ganhou experiência prática no início da década de 1880 trabalhando em telefonia e na Continental Edison, na nova indústria de energia elétrica. Em 1884, emigrou para os Estados Unidos e se naturalizou cidadão americano. Ele trabalhou por um curto período na Edison Machine Works, em Nova Iorque, antes de começar por conta própria. Com a ajuda de parceiros para financiar e comercializar suas ideias, Tesla montou laboratórios e empresas em Nova Iorque para desenvolver uma variedade de dispositivos elétricos e mecânicos. Seu m

# Create Retriever

In [29]:
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

# See docker command above to launch a postgres instance with pgvector enabled.
connection = "postgresql+psycopg://langchain:langchain@localhost:5432/wiki_summarizer"  # Uses psycopg3!
collection_name = "my_docs"


vector_store = PGVector(
    embeddings=EMBEDDING,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

vector_store.add_documents(DOCS)

['36847886-fb5f-44b3-8092-eb48c53613cd',
 'c51b451f-8239-41c0-9fe2-b19a4d29e11e',
 '9fa42f11-ad70-4686-960a-8954477d3b84',
 '235a7e31-50d1-4c48-a54a-257718969085',
 'f6e4b074-2736-400c-a8e5-8200ee9c1238',
 '3757acbe-0521-4f60-9c44-f63ad57487ea',
 '64da0035-c94d-4e8b-bc73-0af2efc9488b',
 '9edb6ad9-6058-475b-a000-a406118268d6',
 '8f9ce5a1-c1cd-47b6-ba4d-d5d9869efd12',
 '3a9ab509-3de9-4bca-83eb-8666c3290813',
 '850e7d2d-337a-45c0-9742-d30ece85a06c',
 '4d311c30-74e4-44b4-8988-6a115764a3bd']

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

map_prompt = ChatPromptTemplate.from_messages(
    [("system", "Write a concise summary of the following:\\n\\n{context}")]
)

map_chain = map_prompt | LLM | StrOutputParser()

In [41]:
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate, PromptTemplate

map_template = (
"""
The following is a set of documents:

{docs}

Based on this list of docs, please identify the main themes 

Helpful Answer:
"""
)

# Also available via the hub: `hub.pull("rlm/reduce-prompt")`
reduce_template = (
"""
The following is a set of summaries:
{docs}
Take these and distill it into a final, consolidated summary
of the main themes.
"""
)

map_prompt = ChatPromptTemplate([("human", map_template)])
reduce_prompt = ChatPromptTemplate([("human", reduce_template)])

map_chain = map_prompt | LLM | StrOutputParser()
reduce_chain = reduce_prompt | LLM | StrOutputParser()

# Chains

In [5]:
"""Reduce size of chunks"""
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate

REDUCE_TEMPLATE = (
    """
    The following is a set of summaries:

    {docs}

    Take these and distill it into a final, consolidated summary
    using approximately {token_max} words, with a margin of 100 words
    for less or more.
    """
)

# Create the prompt chain
REDUCE_PROMPT = ChatPromptTemplate([("human", REDUCE_TEMPLATE)])
reduce_chain = REDUCE_PROMPT | LLM | StrOutputParser()


# Build a Graph to Orchestrate

In [6]:
"""Build LangGraph workflow"""
import operator
from typing import Annotated, List, Literal, TypedDict

from langchain.chains.combine_documents.reduce import (
    acollapse_docs,
    split_list_of_docs,
)
from langchain_core.documents import Document
from langchain.text_splitter import TokenTextSplitter
from langgraph.constants import Send
from langgraph.graph import END, START, StateGraph

from backend.app.build.models import ENCODING
# from backend.app.build.models import LLM
# from backend.app.build.chains.map import map_chain
# from backend.app.build.chains.reduce import reduce_chain


def length_function(documents: List[Document]) -> int:
    """Get number of tokens for input contents."""
    return sum(LLM.get_num_tokens(doc.page_content) for doc in documents)


class OverallState(TypedDict):
    """
    Overall state of the main graph. It will contain the
    input document contents, corresponding summaries,
    and a final summary.
    """
    # Notice here we use the operator.add
    # This is because we want combine all the summaries we generate
    # from individual nodes back into one list - this is essentially
    # the "reduce" part
    contents: List[str]
    token_max: int
    summaries: Annotated[list, operator.add]
    collapsed_summaries: List[Document]
    final_summary: str


class SummaryState(TypedDict):
    """
    The state of the node that we will "map" all
    documents to in order to generate summaries
    """
    content: str
    token_max: int


def should_collapse(
    state: OverallState,
) -> Literal["collapse_summaries", "generate_final_summary"]:
    """
    Determine whether to collapse the summaries or generate the final summary.

    This function acts as a conditional edge in the graph, deciding the next
    step based on the number of tokens in the collapsed summaries.

    Args:
        state (OverallState): The current state of the graph, containing the
                              collapsed summaries and the maximum
                              allowed tokens.

    Returns:
        Literal["collapse_summaries", "generate_final_summary"]:
            The next step in the process, either to collapse the summaries
            or to generate the final summary.
    """
    num_tokens = length_function(state["collapsed_summaries"])
    if num_tokens > state["token_max"]:
        return "collapse_summaries"
    else:
        return "generate_final_summary"
    

def collect_summaries(state: OverallState) -> dict:
    """
    Collect summaries from the state.

    Args:
        state (OverallState): The current state containing summaries
                              and token_max.

    Returns:
        dict: A dictionary with collapsed summaries and token_max.
    """
    print("COLLECT_SUMMARIES")
    summaries = []
    for summary in state["summaries"]:
        tokens = ENCODING.encode(summary)
        print("Used:", len(tokens),
              "Expected:", state["token_max"])
        summaries.append(Document(summary))
    return {
        "collapsed_summaries": summaries,
        "token_max": state["token_max"]
    }


async def collapse_summaries(state: OverallState) -> dict:
    """
    Collapse summaries into smaller chunks if they exceed the token limit.

    Args:
        state (OverallState): The current state containing collapsed summaries
                              and token_max.

    Returns:
        dict: A dictionary with updated collapsed summaries and token_max.
    """
    doc_lists = split_list_of_docs(
        state["collapsed_summaries"],
        length_function,
        state["token_max"]*(3)
    )
    print("COLLAPSE_SUMMARIES")
    for doc in doc_lists:
        tokens = ENCODING.encode(doc[0].page_content)
        print("Used:", len(tokens),
              "Expected:", state["token_max"])
    results = []
    for doc_list in doc_lists:
        req = {"docs": doc_list[0],
               "token_max": state["token_max"]*(2)}
        results.append(
            Document(await reduce_chain.ainvoke(req))
            )

    return {
        "collapsed_summaries": results,
        "token_max": state["token_max"]
    }


async def generate_summary(state: SummaryState) -> dict:
    """
    Generate a summary for a given document.

    Args:
        state (SummaryState): The current state containing content
                              and token_max.

    Returns:
        dict: A dictionary with generated summaries and token_max.
    """
    print("GENERATE_SUMMARY")
    text_splitter = TokenTextSplitter(chunk_size=state["token_max"]*4,
                                      chunk_overlap=100)
    raw_doc = Document(state["content"])
    docs = text_splitter.split_documents([raw_doc])
    response = []
    for doc in docs:
        req = {"docs": doc,
               "token_max": state["token_max"]*2}
        text = await reduce_chain.ainvoke(req)
        response.append(text)
        tokens = ENCODING.encode(text)
        print("Used:", len(tokens),
              "Expected:", state["token_max"])
    return {
        "summaries": response,
        "token_max": state["token_max"]
    }


async def generate_final_summary(state: OverallState) -> dict:
    """
    Generate the final summary from collapsed summaries.

    Args:
        state (OverallState): The current state containing collapsed summaries
                              and token_max.

    Returns:
        dict: A dictionary with the final summary and token_max.
    """
    print("GENERATE_FINAL_SUMMARY")
    for doc in state["collapsed_summaries"]:
        tokens = ENCODING.encode(doc.page_content)
        print("Used:", len(tokens),
              "Expected:", state["token_max"])
    req = {"docs": state["collapsed_summaries"],
           "token_max": state["token_max"]}
    response = await reduce_chain.ainvoke(req)
    return {
        "final_summary": response,
        "token_max": state["token_max"]
    }


def map_summaries(state: OverallState) -> list:
    """
    Define the logic to map out over the documents.

    Args:
        state (OverallState): The current state containing contents
                              and token_max.

    Returns:
        list: A list of `Send` objects with the name of a node in the graph
              and the state to send to that node.
    """
    return [
        Send(
            "generate_summary",
            {
                "content": content,
                "token_max": state["token_max"]
            }
        ) for content in state["contents"]
    ]


# Construct the graph
# Nodes:
graph = StateGraph(OverallState)
graph.add_node("generate_summary", generate_summary)  # same as before
graph.add_node("collect_summaries", collect_summaries)
graph.add_node("collapse_summaries", collapse_summaries)
graph.add_node("generate_final_summary", generate_final_summary)

# Edges:
graph.add_conditional_edges(START, map_summaries, ["generate_summary"])
graph.add_edge("generate_summary", "collect_summaries")
graph.add_conditional_edges("collect_summaries", should_collapse)
graph.add_conditional_edges("collapse_summaries", should_collapse)
graph.add_edge("generate_final_summary", END)

workflow = graph.compile()



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langgraph.pregel import Channel, Pregel


# View & Validate Orchestration Architecture

In [None]:
from IPython.display import Image

Image(workflow.get_graph().draw_mermaid_png())

# Run the App

In [9]:
req = {"contents": [doc.page_content for doc in [RAW_DOC]],
       "token_max": 2000}

ouput = await workflow.ainvoke(
    input=req,
    config={"recursion_limit": 10},
)
print(list(ouput))

GENERATE_SUMMARY
Used: 566 Expected: 2000
Used: 892 Expected: 2000
Used: 1173 Expected: 2000
COLLECT_SUMMARIES
Used: 566 Expected: 2000
Used: 892 Expected: 2000
Used: 1173 Expected: 2000
COLLAPSE_SUMMARIES
Used: 566 Expected: 2000
GENERATE_FINAL_SUMMARY
Used: 398 Expected: 2000
['contents', 'token_max', 'summaries', 'collapsed_summaries', 'final_summary']


# Validate result

In [10]:
from IPython.display import Markdown, display

display(Markdown(
ouput["final_summary"]
))

Nikola Tesla, a Serbian-American inventor and engineer, revolutionized the world with his groundbreaking contributions to the development of the alternating current (AC) electrical system. Born in 1856, Tesla's passion for science and his prodigious intellect led him to the United States in 1884, where he worked with Thomas Edison before establishing his own laboratories and companies. His work on the AC induction motor and related patents, licensed by Westinghouse Electric, earned him significant wealth and became the cornerstone of the polyphase system.

Driven by a relentless pursuit of innovation, Tesla conducted numerous experiments with mechanical oscillators, electric discharge tubes, and radiography. He also built a remote-controlled boat, showcasing his inventive spirit. His fame grew, and he became known for his public speaking abilities, captivating audiences with his visionary ideas.

Tesla's vision extended beyond the realm of AC power. He pursued wireless lighting and global wireless power distribution through his high-voltage, high-frequency experiments. He envisioned a world powered by wireless energy, a concept that was ahead of its time. His unfinished Wardenclyffe Tower project, an intercontinental wireless transmitter, aimed to realize this vision but was hampered by financial constraints.

Despite financial struggles and the failure of some ambitious projects, Tesla continued to experiment with various inventions throughout his life. He died in 1943, leaving behind a legacy of innovation and eccentricity. His work fell into relative obscurity until 1960, when the SI unit of magnetic flux density was named the tesla in his honor.

Tesla's life was a testament to his relentless pursuit of innovation and his visionary ideas that often outpaced the technological capabilities of his time. His contributions to the AC system revolutionized the way we generate and distribute electricity, while his experiments with wireless power transmission and communication laid the groundwork for future technologies. His legacy continues to inspire and fascinate, reminding us of the power of imagination and the enduring impact of a brilliant and eccentric inventor. 
