# Using LlamaIndex to ingest web domain data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
!{sys.executable} -m pip install -r../requirements.txt

Collecting pypdf (from -r ../requirements.txt (line 15))
  Downloading pypdf-3.9.1-py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.3/249.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pypdf
Successfully installed pypdf-3.9.1


## Setup

First we'll setup our libraries and environment variables

In [3]:
import openai
import os
import requests
import numpy as np
import pandas as pd
from typing import Iterator
import tiktoken
import textract
from numpy import array, average

from database import get_redis_connection

# Set our default models and chunking size
from config import COMPLETIONS_MODEL, EMBEDDINGS_MODEL, CHAT_MODEL, TEXT_EMBEDDING_CHUNK_SIZE, VECTOR_FIELD_NAME

# Ignore unclosed SSL socket warnings - optional in case you get these errors
import warnings

warnings.filterwarnings(action="ignore", message="unclosed", category=ImportWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)  #%% md
# Using LlamaIndex to ingest web domain data

In [4]:
pd.set_option('display.max_colwidth', 0)

## Storage

We're going to use Redis as our database for both document contents and the vector embeddings. You will need the full Redis Stack to enable use of Redisearch, which is the module that allows semantic search - more detail is in the [docs for Redis Stack](https://redis.io/docs/stack/get-started/install/docker/).

To set this up locally, you will need to install Docker and then run the following command: ```docker run -d --name redis-stack -p 6379:6379 -p 8001:8001 redis/redis-stack:latest```.

The code used here draws heavily on [this repo](https://github.com/RedisAI/vecsim-demo).

After setting up the Docker instance of Redis Stack, you can follow the below instructions to initiate a Redis connection and create a Hierarchical Navigable Small World (HNSW) index for semantic search.

In [5]:
# Setup Redis and running?
from database import get_redis_connection

redis_client = get_redis_connection()

redis_client.ping()

ConnectionError: Error 61 connecting to localhost:6379. Connection refused.

In [6]:
# Optional step to drop the index if it already exists
from config import INDEX_NAME

redis_client.ft(INDEX_NAME).dropindex()

b'OK'

## Web Text File Ingestion
Using beautiful_soup_web connector provided by llamahub: https://llamahub.ai/l/web-beautiful_soup_web


In [7]:
from llama_index import download_loader

BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")

loader = BeautifulSoupWebReader()
documents = loader.load_data(urls=['https://focusedlabs.io', 'https://focusedlabs.io/about', 'https://focusedlabs.io/contact', 'https://focusedlabs.io/case-studies',         "https://focusedlabs.io/case-studies/agile-workflow-enabled-btr-automation",
        "https://focusedlabs.io/case-studies/hertz-technology-new-markets",
        "https://focusedlabs.io/case-studies/aperture-agile-transformation",
        "https://focusedlabs.io/case-studies/automated-core-business-functionality" ])
documents

[Document(text='\n\nA digital transformation partner focused on software delivery\n\n\n\n      var show = localStorage.getItem(\'show\');\n      if(show === \'true\'){\n        document.documentElement.classList.add(\'dark\');\n      } \n    \n\nhsjQuery = window[\'jQuery\'];\n\n\n\n\n\na.cta_button{-moz-box-sizing:content-box !important;-webkit-box-sizing:content-box !important;box-sizing:content-box !important;vertical-align:middle}.hs-breadcrumb-menu{list-style-type:none;margin:0px 0px 0px 0px;padding:0px 0px 0px 0px}.hs-breadcrumb-menu-item{float:left;padding:10px 0px 10px 10px}.hs-breadcrumb-menu-divider:before{content:\'›\';padding-left:10px}.hs-featured-image-link{border:0}.hs-featured-image{float:right;margin:0 0 20px 20px;max-width:50%}@media (max-width: 568px){.hs-featured-image{float:none;margin:0;width:100%;max-width:100%}}.hs-screen-reader-text{clip:rect(1px, 1px, 1px, 1px);height:1px;overflow:hidden;position:absolute !important;width:1px}\n\n\n\n\n\n\n\n  \n  .cards_galle

In [8]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [10]:
from transformers import normalize_text

for document in documents:
    document.text = normalize_text(document.text)

In [11]:
from llama_index import GPTVectorStoreIndex
from llama_index.vector_stores import RedisVectorStore
from config import OPENAI_API_KEY, INDEX_NAME, PREFIX
from llama_index.storage.storage_context import StorageContext
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

vector_store = RedisVectorStore(
    index_name=INDEX_NAME,
    index_prefix=PREFIX,
    redis_url="redis://localhost:6379",
    overwrite=True,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = GPTVectorStoreIndex.from_documents(documents, storage_context=storage_context)
# index.index_id = INDEX_NAME

vector_store.persist(persist_path="")
# index = GPTVectorStoreIndex.from_documents(documents)

INFO:llama_index.vector_stores.redis:Creating index fl-index
Creating index fl-index
INFO:llama_index.vector_stores.redis:Added 50 documents to index fl-index
Added 50 documents to index fl-index
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 44898 tokens
> [build_index_from_nodes] Total embedding token usage: 44898 tokens
INFO:llama_index.vector_stores.redis:Saving index to disk in background
Saving index to disk in background


In [12]:
# Check that our docs have been inserted
redis_client.ft(INDEX_NAME).info()['num_docs']

'109'

In [20]:
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What are some of the solutions that Focused Labs has created?")
response
response.response

INFO:llama_index.vector_stores.redis:Using filters: *
Using filters: *
INFO:llama_index.vector_stores.redis:Querying index fl-index
Querying index fl-index
INFO:llama_index.vector_stores.redis:Found 2 results for query with id ['focusedlabsdoc_95c33ff5-2eb0-4dc7-b5ca-40908d2c7e7b', 'focusedlabsdoc_75b0409b-535b-45d5-9a6c-358ff500764b']
Found 2 results for query with id ['focusedlabsdoc_95c33ff5-2eb0-4dc7-b5ca-40908d2c7e7b', 'focusedlabsdoc_75b0409b-535b-45d5-9a6c-358ff500764b']
INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 13 tokens
> [retrieve] Total embedding token usage: 13 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 2011 tokens
> [get_response] Total LLM token usage: 2011 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token 

"\nSome of the solutions that Focused Labs has created include streamlining onboarding with BTR Energy's Bridge platform, managing EV charging data, and helping Hertz leverage technology to capture new markets."