In [6]:

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from dotenv import load_dotenv
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI

# Load variables from a local .env file into the process environment before
# any client is constructed (presumably this is where the OpenAI API key
# lives -- confirm against your .env).
load_dotenv()

# Build both OpenAI-backed components first ...
llm = OpenAI()
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# ... then register them on llama-index's global Settings object so later
# cells do not have to pass them around explicitly.
Settings.embed_model = embed_model
Settings.llm = llm

In [13]:

# Translate html to text
import html2text
import os

def extract_text(html_file_path, encoding="utf-8"):
    """Read an HTML file from disk and convert its markup to text.

    Args:
        html_file_path: Path of the HTML file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale settings.

    Returns:
        The string returned by ``html2text.html2text`` for the file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    # The context manager closes the handle as soon as the read finishes;
    # a bare ``open(...).read()`` leaves that to the garbage collector.
    with open(html_file_path, encoding=encoding) as html_file:
        html = html_file.read()
    return html2text.html2text(html)

def get_transformed_files(data_dir='./data'):
    """Convert every ``.html`` file in ``data_dir`` to ``.txt`` and list the loadable files.

    Each ``*.html`` file is run through ``extract_text`` and the result is
    written next to it with the extension swapped to ``.txt``. Every other
    regular file is passed through unchanged.

    Args:
        data_dir: Directory to scan (non-recursively). Defaults to ``./data``.

    Returns:
        A sorted list of unique file paths: the generated ``.txt`` files plus
        all non-HTML regular files found in ``data_dir``.

    Raises:
        OSError: If ``data_dir`` cannot be listed or a ``.txt`` file cannot be
            written.
    """
    html_suffix = '.html'
    # A set, because a ``.txt`` left over from an earlier run is listed by
    # os.listdir *and* regenerated from its ``.html`` sibling.
    out_file_paths = set()

    for name in os.listdir(data_dir):
        file_path = os.path.join(data_dir, name)

        # Skip sub-directories (e.g. Jupyter's ``.ipynb_checkpoints``); they
        # are not valid entries for a list of input files.
        if not os.path.isfile(file_path):
            continue

        if file_path.endswith(html_suffix):
            # Swap only the trailing extension. ``str.replace`` would also
            # rewrite any earlier ".html" occurring inside the path.
            txt_path = file_path[:-len(html_suffix)] + '.txt'
            text = extract_text(file_path)
            # Explicit UTF-8 so non-ASCII text does not fail under a narrow
            # locale encoding.
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(text)
            out_file_paths.add(txt_path)
        else:
            out_file_paths.add(file_path)

    # Sorted for a deterministic order across runs.
    return sorted(out_file_paths)

# Convert the HTML sources to text, then load every resulting file as a document.
file_paths = get_transformed_files()
documents = SimpleDirectoryReader(input_files=file_paths).load_data()

# Build the vector index that the query and chat engines in later cells use.
# NOTE(review): presumably this embeds every document with the globally
# configured Settings.embed_model, i.e. it makes paid API calls on each run
# -- confirm, and consider persisting the index.
index = VectorStoreIndex.from_documents(documents)

In [8]:

# Query engine over `index`; the `ask` helper below sends each question through it.
query_engine = index.as_query_engine()

def ask(question):
    """Send one question to the module-level ``query_engine``.

    The engine's response is printed and then handed back to the caller so
    it can be inspected further.

    Args:
        question: The question text passed to ``query_engine.query``.

    Returns:
        Whatever ``query_engine.query`` returned for ``question``.
    """
    answer = query_engine.query(question)
    print(answer)
    return answer

# ask("What did the author do growing up?")


The author wrote short stories and tried to program on an IBM 1401.


Response(response='The author wrote short stories and tried to program on an IBM 1401.', source_nodes=[NodeWithScore(node=TextNode(id_='925e09e8-15b3-43f1-84e5-6e1d49286f0d', embedding=None, metadata={'file_path': 'data/local_example_tutorial.txt', 'file_name': 'local_example_tutorial.txt', 'file_type': 'text/plain', 'file_size': 100832, 'creation_date': '2024-04-20', 'last_modified_date': '2024-04-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7c1853f6-a307-49fa-b79f-bc7aa30095e7', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'data/local_example_tutorial.txt', 'file_name': 'local_example_tutorial.txt', 'file_type': 'text/plain', 'file_size': 100832, 'creation_date': '2024-04-20', 'la

In [16]:
# Chat engine built on the same index as the query engine.
chat_engine = index.as_chat_engine()
# Prime the conversation with the desired answer format. The instructions are
# sent as an ordinary chat turn and the reply to this priming message is
# discarded.
# NOTE(review): instructions given as a user turn are only as durable as the
# engine's chat history; a system prompt may be a better fit -- confirm which
# option the installed llama-index version supports for this chat mode.
chat_engine.chat("""
Your goal is to help me finding things on the documentation I provided. I want your answers to be in this format
(Source: <source>)
<answer>

source: in best case scenario, a URL, or the file name
answer: the answer to the question
""")
def chat(question):
    """Send one message to the module-level ``chat_engine``.

    The reply is printed and also returned so the caller can inspect it.

    Args:
        question: The message text passed to ``chat_engine.chat``.

    Returns:
        Whatever ``chat_engine.chat`` returned for ``question``.
    """
    reply = chat_engine.chat(question)
    print(reply)
    return reply

# Example lookup: ask the chat engine for a fact expected to be in the indexed files.
chat("What's the discord link?")


(Source: Documentation)
The Discord link is: [https://discord.gg/Ekmcn2f3](https://discord.gg/Ekmcn2f3)


AgentChatResponse(response='(Source: Documentation)\nThe Discord link is: [https://discord.gg/Ekmcn2f3](https://discord.gg/Ekmcn2f3)', sources=[ToolOutput(content='[https://discord.gg/Ekmcn2f3](https://discord.gg/Ekmcn2f3)', tool_name='query_engine_tool', raw_input={'input': 'Discord link'}, raw_output=Response(response='[https://discord.gg/Ekmcn2f3](https://discord.gg/Ekmcn2f3)', source_nodes=[NodeWithScore(node=TextNode(id_='58faabb4-2061-4783-9a82-40b6b8eefb73', embedding=None, metadata={'file_path': 'data/hackathon_index.txt', 'file_name': 'hackathon_index.txt', 'file_type': 'text/plain', 'file_size': 20067, 'creation_date': '2024-04-20', 'last_modified_date': '2024-04-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: Relat

In [15]:
# Probe whether the chat engine recalls earlier turns of this conversation.
# NOTE(review): the saved output for this cell names a question that is never
# sent through chat() in this notebook and shows the answer came from the
# query-engine tool -- treat it as unreliable and re-check after a clean run.
chat("what was my previous question?")

(Source: Documentation)
The previous question was "What did the author do growing up?"


AgentChatResponse(response='(Source: Documentation)\nThe previous question was "What did the author do growing up?"', sources=[ToolOutput(content='The previous question was "What did the author do growing up?"', tool_name='query_engine_tool', raw_input={'input': 'What was my previous question?'}, raw_output=Response(response='The previous question was "What did the author do growing up?"', source_nodes=[NodeWithScore(node=TextNode(id_='925e09e8-15b3-43f1-84e5-6e1d49286f0d', embedding=None, metadata={'file_path': 'data/local_example_tutorial.txt', 'file_name': 'local_example_tutorial.txt', 'file_type': 'text/plain', 'file_size': 100832, 'creation_date': '2024-04-20', 'last_modified_date': '2024-04-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelation

In [10]:
# Example how-to question put to the chat engine.
chat("How do I set the openai embed model into llama index?")

To set the OpenAI Embed model into Llama Index, you need to ensure that you have the OPENAI_API_KEY set up as an environment variable. This key can be obtained by logging into your OpenAI account and creating a new API key. Once you have the API key, you can integrate the OpenAI `text-embedding-ada-002` model with Llama Index for text retrieval and embeddings.


AgentChatResponse(response='To set the OpenAI Embed model into Llama Index, you need to ensure that you have the OPENAI_API_KEY set up as an environment variable. This key can be obtained by logging into your OpenAI account and creating a new API key. Once you have the API key, you can integrate the OpenAI `text-embedding-ada-002` model with Llama Index for text retrieval and embeddings.', sources=[ToolOutput(content='To set the OpenAI Embed model into Llama Index, you need to ensure that you have the OPENAI_API_KEY set up as an environment variable. This key can be obtained by logging into your OpenAI account and creating a new API key. Once you have the API key, you can integrate the OpenAI `text-embedding-ada-002` model with Llama Index for text retrieval and embeddings.', tool_name='query_engine_tool', raw_input={'input': 'Set the OpenAI Embed model into Llama Index'}, raw_output=Response(response='To set the OpenAI Embed model into Llama Index, you need to ensure that you have the