# Part 1: Querying

PyData Amsterdam 2023

* Tutorial: Building a personal search engine with llama-index
* Speakers: Judith van Stegeren and Yorick van Pelt
* Company: [Datakami](https://www.datakami.nl)

In [1]:
# imports
import pprint # dev
import json
import os
import sys
from pathlib import Path

# loguru: logging for lazy people :)
from loguru import logger

# we're using a local embeddings model
from sentence_transformers import SentenceTransformer

# llama_index: the topic of this tutorial
# we're not importing specific methods or classes so it's clear when we actually call llama_index!
import llama_index

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from secret import openai_api_key

In [3]:
# log to stdout and local file
# Start from a clean slate: remove() without arguments drops the handlers registered so far.
logger.remove()
# Console sink: timestamp, level and message, for DEBUG and above.
logger.add(sys.stdout, format="{time} - {level} - {message}", level="DEBUG")
# File sink. As the last expression of the cell, the value returned by add()
# is what the notebook displays underneath.
logger.add("tutorial_part_1.log", level="DEBUG")

2

In [4]:
# Input schedule and on-disk location of the vector index.
# Both paths are relative, so they resolve against the current working directory.
DATA_PATH = Path("data") / "pydata" / "schedule.json"
INDEX_PATH = Path("indices") / "pydata_schedule_index"

## Setup

### Use a local embeddings model

(So no calls to OpenAI APIs for computing embeddings :) The LLM configured below is still an OpenAI model.)

In [5]:
# OpenAI chat model; the API key is the one imported from the local `secret` module.
llm = llama_index.llms.OpenAI(
    model="gpt-3.5-turbo",
    api_key=openai_api_key,
)

In [6]:
# Bundle the LLM with a locally loaded sentence-transformers embedding model.
# all-minilm-l6-v2 has a maximum size of 256 tokens, hence the chunk size of 256.
# source: https://www.sbert.net/docs/pretrained_models.html#model-overview
service_context = llama_index.ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:sentence-transformers/all-minilm-l6-v2",
    chunk_size=256,
)

In [7]:
# Make this service context llama_index's global default. Use the public
# setter rather than assigning the package attribute directly, so the cell
# does not depend on where the library stores the value internally.
llama_index.set_global_service_context(service_context)

### Load PyData schedule

**Load JSON file with the PyData Amsterdam 2023 schedule**
* source: https://amsterdam2023.pydata.org/cfp/schedule/export/schedule.json
* retrieved: 2023-08-10

In [17]:
# Read the exported schedule. The encoding is passed explicitly so decoding
# does not depend on the platform's default locale encoding.
with open(DATA_PATH, 'r', encoding='utf-8') as infile:
    schedule = json.load(infile)
logger.info(f"Loaded the PyData schedule JSON from file {DATA_PATH}")

2023-08-22T12:08:14.040315+0200 - INFO - Loaded the PyData schedule JSON from file data/pydata/schedule.json


**Extract the talks from the schedule**

In [18]:
# Flatten the nested schedule (days -> rooms -> talks) into one list of talk dicts.
talks = []
for day in schedule['schedule']['conference']['days']:
    # day['rooms'] is keyed by room; only the talk lists stored under each key are needed
    for room_talks in day['rooms'].values():
        for talk in room_talks:
            # tag every talk with its source file and a category label
            talk['filename'] = str(DATA_PATH)
            talk['category'] = "Conference talk at PyData Amsterdam 2023"
            talks.append(talk)

logger.info(f"Loaded {len(talks)} talks from the PyData schedule JSON!")

2023-08-22T12:08:15.480534+0200 - INFO - Loaded 67 talks from the PyData schedule JSON!


In [19]:
# Show one talk as a pretty-printed dict, preceded by a caption line.
print("Example of a PyData talk:", pprint.pformat(talks[12]), sep="\n")

Example of a PyData talk:
{'abstract': 'Lorem ipsum dolor',
 'answers': [],
 'attachments': [],
 'category': 'Conference talk at PyData Amsterdam 2023',
 'date': '2023-09-14T09:30:00+02:00',
 'description': 'Lorem ipsum dolor',
 'do_not_record': False,
 'duration': '00:50',
 'filename': 'data/pydata/schedule.json',
 'guid': 'e82f37c8-03f9-5cb5-92f2-8f157924b59d',
 'id': 267,
 'language': 'en',
 'links': [],
 'logo': '',
 'persons': [{'answers': [],
              'biography': 'Vicki Boykis works on end-to-end ML applications. '
                           'Her interests include the intersection of '
                           'information retrieval and large language models, '
                           'applying engineering best practices to machine '
                           'learning, and Nutella.  She works at Duo Security '
                           'and she lives in Philadelphia with her family. Her '
                           'favorite hobby was making terrible jokes on '
    

**Turn the talks data into llama_index Documents**

In [20]:
# One Document per talk. The text is title, abstract and description separated
# by blank lines; the talk dict itself is not attached as metadata.
documents = [
    llama_index.Document(
        text=f"{talk['title']}\n\n{talk['abstract']}\n\n{talk['description']}"
    )
    for talk in talks
]

In [21]:
# Show the fields of one Document as a pretty-printed dict, preceded by a caption line.
print("Example of a PyData talk Document:", pprint.pformat(dict(documents[12])), sep="\n")

Example of a PyData talk Document:
{'embedding': None,
 'end_char_idx': None,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'hash': 'baf26eaa727340ec583dbc2b3d661646147352bb711318674ab72326813efbe6',
 'id_': 'c8268bf8-c1d4-4420-b345-7094bd1eb19b',
 'metadata': {},
 'metadata_seperator': '\n',
 'metadata_template': '{key}: {value}',
 'relationships': {},
 'start_char_idx': None,
 'text': 'Keynote Vicki Boykis\n\nLorem ipsum dolor\n\nLorem ipsum dolor',
 'text_template': '{metadata_str}\n\n{content}'}


### Create vector index from PyData schedule

In [22]:
# Embed all documents into a vector index, using the service context configured above.
logger.info(f"Building a VectorStoreIndex from {len(documents)} documents")
index = llama_index.VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

# Write the index to INDEX_PATH so that it can be reloaded without rebuilding.
index.storage_context.persist(INDEX_PATH)
logger.info(f"Saved VectorStoreIndex to {INDEX_PATH}")

2023-08-22T12:08:21.411280+0200 - INFO - Building a VectorStoreIndex from 67 documents
2023-08-22T12:08:32.213701+0200 - INFO - Saved VectorStoreIndex to indices/pydata_schedule_index


## Load vector index with PyData Amsterdam 2023 schedule

In [8]:
# Load the persisted vector index from INDEX_PATH.
# No service_context is passed to load_index_from_storage.
# NOTE(review): presumably the globally registered service context is picked up — confirm.
if not INDEX_PATH.exists():
    logger.error("Index file for part 1 does not exist on disk. :(")
else:
    try:
        # rebuild storage context from disk
        storage_context = llama_index.StorageContext.from_defaults(persist_dir=INDEX_PATH)
        # load index
        index = llama_index.load_index_from_storage(storage_context)
        logger.info("Loaded index from local storage")
    except Exception:
        # Keep the notebook running, but record the full traceback instead of
        # only the exception message so that a failed load can be diagnosed.
        logger.exception("Could not load the index from local storage")

2023-08-22T12:12:27.556688+0200 - INFO - Loaded index from local storage


## Create a search engine from vector index

In [9]:
# create a search engine
retriever = index.as_retriever()

## Query the search engine

In [10]:
# query the search engine
results = retriever.retrieve("llama_index")

In [11]:
# Print each retrieved node, followed by a "---" separator line and an empty line.
for hit in results:
    print(hit.node, "---", "", sep="\n")

id_='97863ede-b6a9-4fc2-8c7d-bf09017fdf64' embedding=None metadata={} excluded_embed_metadata_keys=[] excluded_llm_metadata_keys=[] relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0a4e1eca-81a8-4212-a397-a25a3e12a12c', node_type=None, metadata={}, hash='06987f3dd83522f1b914565902298eb9eed00f77d5fa47d3d52c5f4301959f54'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='ac962050-8632-48fc-8be6-5bffc2c2eef9', node_type=None, metadata={}, hash='3afcb14eb9ca291a920e6ec82ff895103024cb798781880f8f09ee4e80c725c0')} hash='c852556384af48ec95a06b58d93753903d7a40cf65713b2ce35ef85583de0ac8' text='Building a personal search engine with llama-index\n\nWouldn’t it be great to have a Google-like search engine, but then for your own text files and completely private?In this tutorial we’ll build a small personal search engine using open source library llama-index.In this tutorial we will build a small personal search engine using open source library `llama-index`.Llama-index

In [12]:
# Run a second search and print each hit, followed by a "---" separator line and an empty line.
results = retriever.retrieve("startups")
for hit in results:
    print(hit.node, "---", "", sep="\n")

id_='e032a29f-3bdf-479e-9cf7-965f14d55f50' embedding=None metadata={} excluded_embed_metadata_keys=[] excluded_llm_metadata_keys=[] relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6d2f4965-8bb3-4e9a-9f7f-47799dedee38', node_type=None, metadata={}, hash='5f9805edfef93dacfcb40f26094eed9f7811fed2523d7810e2606e74e945155b')} hash='5f9805edfef93dacfcb40f26094eed9f7811fed2523d7810e2606e74e945155b' text='Kickstart AI sponsored drinks [time & location TBD]\n\nKickstart AI is a foundation powered by a coalition of iconic Dutch brands (Ahold Delhaize, ING, KLM and NS). Their mission is to accelerate AI adoption in the Netherlands, and improve society through the use of AI.\n\nLorem ipsum dolor' start_char_idx=None end_char_idx=None text_template='{metadata_str}\n\n{content}' metadata_template='{key}: {value}' metadata_seperator='\n'
---

id_='764c2331-2c5f-414a-8a97-fe614114bcd3' embedding=None metadata={} excluded_embed_metadata_keys=[] excluded_llm_metadata_keys=[] relat

## Querying the vector index with an external LLM

In [13]:
import openai

In [20]:
# Build a query engine on top of the index.
# In the run recorded in this notebook, querying it ends in an OpenAI
# authentication error, so it evidently needs a working OpenAI API key.
query_engine = index.as_query_engine()

In [18]:
# Ask the query engine a natural-language question.
# Authentication failures are logged instead of raised, so the notebook keeps running.
# NOTE(review): when that happens `response` is never assigned here, and any
# later cell that reads it fails.
try:
    response = query_engine.query("Which talks are probably interesting for startup founders?")
except openai.error.AuthenticationError as auth_error:
    logger.error(auth_error)

2023-08-22T12:12:55.725596+0200 - ERROR - No API key provided. You can set your API key in code using 'openai.api_key = <API-KEY>', or you can set the environment variable OPENAI_API_KEY=<API-KEY>). If your API key is stored in a file, you can point the openai module at it with 'openai.api_key_path = <PATH>'. You can generate API keys in the OpenAI web interface. See https://platform.openai.com/account/api-keys for details.


In [19]:
# Display the answer text. Only meaningful when the query cell above succeeded;
# in the recorded run it did not, hence the NameError shown below.
response.response

NameError: name 'response' is not defined