# Building a personal search engine with llama-index

In [1]:
# imports

In [2]:
import sys
from pathlib import Path

from loguru import logger
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, StorageContext, load_index_from_storage, download_loader

In [3]:
# log to file

In [4]:
# Attach a file sink so records at DEBUG level and above are written to tutorial.log.
# The number echoed as this cell's output is the value returned by logger.add().
logger.add("tutorial.log", level="DEBUG")

1

## 1. Ingesting data

In [5]:
def load_data(data_dir: str = "data"):
    """Read the files under *data_dir* into llama-index documents.

    Args:
        data_dir: Folder handed to ``SimpleDirectoryReader``. Defaults to
            ``"data"``, the folder this tutorial uses, so existing
            ``load_data()`` calls behave exactly as before.

    Returns:
        Whatever ``SimpleDirectoryReader(data_dir).load_data()`` returns.
    """
    return SimpleDirectoryReader(data_dir).load_data()


# Load once at cell execution so the later cells can reuse the result.
documents = load_data()

## 2. Chunking data

In [6]:
# No custom chunking is configured here: llama-index's default chunking is fine for this tutorial.

## 3. Building an index

In [7]:
def build_index(documents):
    """Create a vector store index from *documents* and save it to disk.

    Note that building the index goes through the OpenAI API by default.

    Args:
        documents: The documents to index, as passed to
            ``GPTVectorStoreIndex.from_documents``.

    Returns:
        The newly built index (already persisted via its storage context).
    """
    logger.info("Building a GPTVectorStore from documents")

    # Build the vector store index (OpenAI API by default).
    vector_index = GPTVectorStoreIndex.from_documents(documents)

    # Write the index to disk through its storage context.
    storage = vector_index.storage_context
    storage.persist()

    return vector_index

## 4. Querying the index

In [8]:
def query(index, question="What does Rumelt write about industry dynamics?"):
    """Ask *question* against *index* and return the query engine's response.

    The query goes through the OpenAI API by default. Both the question and
    the stripped response text are logged.

    Args:
        index: An index object exposing ``as_query_engine()``.
        question: The natural-language question to send to the query engine.

    Returns:
        The response object returned by ``query_engine.query(question)``,
        unmodified.
    """
    query_engine = index.as_query_engine()
    logger.info(f"Querying index with question {question}")
    response = query_engine.query(question)

    # The response text can be None; guard before stripping so that logging
    # never turns an otherwise successful query into an AttributeError.
    answer_text = (response.response or "").strip()
    logger.info(f"Received response: {answer_text}")

    return response

In [9]:
# Echo the loaded documents as the cell output to inspect what was ingested.
documents

[Document(id_='9e13c83b-1af7-466e-8c4c-6c3c3c6f05ce', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='5bad56206015af5e33bed0782e68fa9023cf08be19be8d26330dd650ebdd105f', text="\n\nOmissions\n\n- In general, our investigation has centered on commercially usable models. There are a lot of fine-tuned llamas that have been omitted, but they are generally not that different.\n- **Unaligned chat/instruct models**: not a big research focus, as they are all based on llama. However, they may be useful in case you're hitting “Sorry, as an AI language model..” a lot.\n\n", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='5863df73-c115-4821-8db0-a4e39b4ad09c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='8119925119158186114c2c741a6174f943fd0445161a5