In [53]:
from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
from IPython.display import Markdown, display
import os
from llama_index.core.query_engine import CitationQueryEngine
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.readers.web import TrafilaturaWebReader
import pandas as pd
from pydantic.v1.error_wrappers import ValidationError
from joblib import Parallel, delayed
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

In [39]:
import requests

def check_url(url):
    try:
        response = requests.get(url)
        if response.ok:   # alternatively you can use response.status_code == 200
            return True
        else:
            return False
    except:
        return False

In [38]:
requests.get('https://docs.twelvelabs.io/v1.2/docs/introduction')

<Response [200]>

In [4]:
df = pd.read_table('links.txt', header=None)
df.columns = ['category','url','picture','date']
df.head()

Unnamed: 0,category,url,picture,date
0,Home Page,https://twelvelabs.io/,no,Internal
1,Main menu,https://twelvelabs.io/technology,no,Internal
2,Main menu,https://twelvelabs.io/product,no,Internal
3,Guides v1.2,https://docs.twelvelabs.io/v1.2/docs/introduction,yes,Internal
4,Guides v1.2,https://docs.twelvelabs.io/v1.2/docs/introduction,yes,Internal


In [40]:
df['url_ok'] = Parallel(n_jobs=32)(delayed(check_url)(url) for url in df['url'])
df['url_ok'].sum()

In [43]:
documents = TrafilaturaWebReader(html_to_text=True).load_data(df[df['url_ok']]['url'].tolist())

In [44]:
index = SummaryIndex.from_documents(documents)

In [45]:
query_engine = CitationQueryEngine.from_args(
    index,
    similarity_top_k=3,
    # here we can control how granular citation sources are, the default is 512
citation_chunk_size=512,
)

In [51]:
retriever = index.as_retriever()
nodes = retriever.retrieve("How does twelve labs technology work?")
len(nodes)

307

In [46]:
response = query_engine.query("How does twelve labs technology work?")
display(Markdown(f"<b>{response}</b>"))


KeyboardInterrupt



In [None]:
response = query_engine.query("What does twelve labs do?")
display(Markdown(f"<b>{response}</b>"))


In [None]:
response = query_engine.query("what is pagination?")
display(Markdown(f"<b>{response}</b>"))


In [None]:
response = query_engine.query("what is the engine object?")
display(Markdown(f"<b>{response}</b>"))


In [24]:
display(Markdown(f"<b>{response}</b>"))


<b>Unfortunately, none of the provided sources contain information about the engine object.</b>

In [49]:
response.source_nodes[11]

NodeWithScore(node=TextNode(id_='5d5f791b-441b-400b-aa79-846030e9d5e3', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='https://docs.twelvelabs.io/docs/playground-classify', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='b0fef5a343244815e6c990b5c2289e74e8c7b60babb04f4c88e0b0d4361f4dce'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='67d559b4-2ae5-4acd-bfd5-d180b94ceb86', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='d4d77c6924efb51f1fc9c171ff86017b50e02531a6983b78e5236d7af6f3b4b7'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='17cb36aa-a009-44e0-90ea-57e978a1ca87', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='554483c8aad89fcff5cdf6b494df43971a0f153c06bfcbee1c1cb5fbf69ca2ff')}, text='Source 12:\nClassify\nThe platform uses a hierarchical structure to classify your videos. Groups of classes form the top level of the structure