In [1]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
%load_ext dotenv
%dotenv

In [3]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, SimpleWebPageReader, SummaryIndex
from llama_index.indices.composability import ComposableGraph

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


In [None]:
documentsDir = SimpleDirectoryReader(input_dir="data-full", recursive=True).load_data()
print(f"Loaded {len(documentsDir)} docs")
indexDir = VectorStoreIndex.from_documents(documentsDir)
indexDir.storage_context.persist(persist_dir="storage-dir")

In [5]:
documentsWeb = SimpleWebPageReader(html_to_text=True).load_data(
    [
      "https://www.sika.com/en/about-us/sustainability/sika-sustainability-strategy.html",
      "https://www.sika.com/en/innovation/research-development/technologies.html",
      "https://www.sika.com/en/about-us/organization/target-markets.html",
      "https://www.sika.com/en/investors/shares/key-data-on-sika-shares.html",
      "https://www.sika.com/en/solutions-for-projects/airports.html",
      "https://www.sika.com/en/knowledge-hub.list.html/knowledge-hub/can-concrete-be-recycled.html"
    ]
)
print(f"Loaded {len(documentsWeb)} docs")
indexWeb = VectorStoreIndex.from_documents(documentsWeb)
indexWeb.storage_context.persist(persist_dir="storage-web")

Loaded 2 docs


In [8]:
from llama_index import StorageContext, load_index_from_storage

# GPT doesn't find the corresponding evidence in the leaf node, but still gives the correct answer
# set Logging to DEBUG for more detailed outputs
storage_context_dir = StorageContext.from_defaults(persist_dir="./storage-dir")
indexDir = load_index_from_storage(storage_context_dir)

storage_context_web = StorageContext.from_defaults(persist_dir="./storage-web")
indexWeb = load_index_from_storage(storage_context_web)

graph = ComposableGraph.from_indices(SummaryIndex, [indexDir, indexWeb], index_summaries=["summary1", "summary2"])

query_engine = graph.as_query_engine()
query_engine.query(
    "Tell me more about Sika sustainability"
)

INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.
INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.


Response(response="Sika sustainability encompasses three dimensions: environmental, social, and economic. In terms of environmental sustainability, Sika focuses on addressing climate change, reducing emissions, conserving energy, managing water resources, minimizing waste, promoting circular economy, and protecting biodiversity and nature. In the social dimension, Sika is committed to maintaining high working standards and respecting human rights. Sika's sustainability strategy aligns with the United Nations Sustainable Development Goals (SDGs). Additionally, Sika engages in sustainability reporting, partnerships, and collaborations, and follows ESG (Environmental, Social, and Governance) policies and guidelines.", source_nodes=[NodeWithScore(node=TextNode(id_='550d388c-7d5d-42bc-85aa-fc97db1235c3', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='8bea0484d126a3058e260756254cc35bd204e1238563d4ebc2661862e6ec5170', text=

In [None]:
query_engine = index.as_query_engine()
query_engine.query(
    "What documents exists about SikaWall"
)