In [None]:
import logging
import sys
import openai
from dotenv import load_dotenv, find_dotenv
from llama_index.indices.knowledge_graph.base import GPTKnowledgeGraphIndex
from llama_index import (
    GPTSimpleKeywordTableIndex,
    GPTVectorStoreIndex,
    LLMPredictor,
    LangchainEmbedding,
    OpenAIEmbedding,
    PromptHelper,
    ServiceContext,
    SimpleWebPageReader,
    TrafilaturaWebReader,
    GPTKeywordTableIndex
) 
from langchain.chat_models import ChatOpenAI
from langchain.llms  import AzureOpenAI
from llama_index import StorageContext, load_index_from_storage
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader
from langchain.embeddings import OpenAIEmbeddings
import os

#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
#logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Pull settings from the '.env_x' file; override=True lets values from the file
# replace variables that are already set in the process environment.
_ = load_dotenv(find_dotenv('.env_x'), override=True)

# Placeholder for the index; stays None until one is loaded or built further down.
index = None

# Sizing values handed positionally to PromptHelper below.
max_input_size = 500    # max LLM token input size
num_output = 100        # number of output tokens
max_chunk_overlap = 20  # maximum chunk overlap

prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)


# Configure the `openai` module and build `service_context` for either Azure
# OpenAI or the public OpenAI API, selected by the OPENAI_API_TYPE variable.
#
# - A missing OPENAI_API_TYPE falls back to 'open_ai' (the openai client's own
#   default) instead of raising KeyError.
# - The Azure check is case-insensitive, so e.g. 'Azure' no longer ends up in
#   the public-endpoint branch while still being reported as the API type.
# - OPENAI_API_KEY is required in both cases (KeyError if absent).
api_type = os.environ.get('OPENAI_API_TYPE', 'open_ai')
openai.api_type = api_type
openai.api_key = os.environ['OPENAI_API_KEY']

if api_type.lower() == 'azure':
    # Azure additionally needs the resource endpoint and API version.
    openai.api_base = os.environ['OPENAI_API_BASE']
    openai.api_version = os.environ['OPENAI_API_VERSION']
    llm = AzureOpenAI(deployment_name="newtextmodel")
    llm_predictor = LLMPredictor(llm=llm)
    # embedding model deployment name has to be "text-embedding-ada-002" on Azure OpenAI
    embedding_llm = LangchainEmbedding(OpenAIEmbeddings(chunk_size=512))
    # Reuses the module-level `prompt_helper` built earlier in this cell; the
    # original rebuilt an identical one here.
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor,
        embed_model=embedding_llm,
        prompt_helper=prompt_helper,
    )
else:
    openai.api_base = "https://api.openai.com/v1"
    openai.api_version = ""
    llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"))
    embedding_llm = LangchainEmbedding(OpenAIEmbeddings(chunk_size=4096))
    # No prompt_helper is passed on this path (unchanged from the original).
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor,
        embed_model=embedding_llm,
    )
 
# Directory the index is persisted to and reloaded from.
storagePath = './webstorage_SurfaceBookandPro'
# Pages that get indexed when no persisted index can be loaded.
dataSource = [
    "https://support.microsoft.com/en-us/surface/surface-pro-8-update-history-1080bf34-7e87-408c-8619-80571283526e",
    "https://support.microsoft.com/en-us/surface/surface-book-3-update-history-935a7b6b-2f6d-dbf9-b3ba-0ea61e187b2d",
]

# Best-effort reuse of a previously persisted index. On failure `index` keeps
# its prior value and the notebook continues.
# `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
# SystemExit propagate, and the reason is printed so that a missing storage
# directory can be told apart from other load failures.
try:
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir=storagePath)
    # load index
    index = load_index_from_storage(storage_context=storage_context, service_context=service_context)
except Exception as load_error:
    print("No index found, creating a new one")
    print(f"  reason: {type(load_error).__name__}: {load_error}")

# `urls` is the name the reader call below consumes; it aliases `dataSource`.
urls = dataSource

# Only build when nothing was loaded: fetch the pages, index them, then persist
# the result under `storagePath`.
if index is None:
    # html_to_text=False is passed to the reader as in the original.
    # Alternative reader: documents = TrafilaturaWebReader().load_data(urls)
    page_reader = SimpleWebPageReader(html_to_text=False)
    documents = page_reader.load_data(urls)
    index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)
    index.storage_context.persist(persist_dir=storagePath)
 



In [None]:
# Query engine over the index, configured with similarity_top_k=5.
query_engine = index.as_query_engine(similarity_top_k=5)

# Device whose rows are requested from the indexed pages.
deviceName = "Surface System Aggregator - Firmware"
#deviceName = "Surface UEFI - Firmware"

# The prompt is assembled in three stages: table layout, row selection, output format.
tableFormatPrompt = (
    "All HTML tables <table></table> in the file have two columns. "
    "The first column contains driver name and version info. "
    "The second column contains device name info. "
)
versionIinfoPrompt = "".join([
    tableFormatPrompt,
    "It is very possible that more than one HTML table contains the same device name info. ",
    "Please search all HTML tables and return all matched rows which contains the same ",
    f'"{deviceName}"',
    " as the second column content. ",
])
#versionFormatPrompt = versionIinfoPrompt + "If the response content from multiple rows, return the content of the driver name and the version info which version info has largest version number."
versionFormatPrompt = f"{versionIinfoPrompt}Start the response with all matched rows directly, use CSV format."

result = query_engine.query(versionFormatPrompt)


In [None]:
# Show the query response; print() writes the string form of `result`.
print(result)