# Part 1: Querying

PyData Amsterdam 2023

* Tutorial: Building a personal search engine with llama-index
* Speakers: Judith van Stegeren and Yorick van Pelt
* Company: [Datakami](https://www.datakami.nl)

In [None]:
# imports
import pprint # dev
import json
import os
import sys
from pathlib import Path

# loguru: logging for lazy people :)
from loguru import logger

# llama_index: the topic of this tutorial
# we're not importing specific methods or classes so it's clear when we actually call llama_index!
import llama_index

In [None]:
from secret import openai_api_key

In [None]:
# Log to stdout and to a local file, both at DEBUG level.
# Handlers that were registered before are removed first.
logger.remove()
logger.add(
    sys.stdout,
    level="DEBUG",
    format="{time} - {level} - {message}",
)
logger.add("tutorial_part_1.log", level="DEBUG")

2

In [None]:
# constants: the schedule JSON that is read, and the directory the index is stored in
DATA_PATH = Path("data") / "pydata" / "schedule.json"
INDEX_PATH = Path("indices") / "pydata_schedule_index"

## Setup

### Use a local embeddings model

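(So no calls to OpenAI APIs for computing embeddings :) The OpenAI LLM configured below is only needed for the LLM-backed query engine in the last section.)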

In [None]:
# OpenAI chat model, configured with the key imported from `secret`
# todo: requires API key
llm = llama_index.llms.OpenAI(
    api_key=openai_api_key,
    model="gpt-3.5-turbo",
)

In [None]:
# all-minilm-l6-v2 has a maximum size of 256 tokens, which is why chunk_size is 256
# source: https://www.sbert.net/docs/pretrained_models.html#model-overview
service_context = llama_index.ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:sentence-transformers/all-minilm-l6-v2",
    chunk_size=256,
)

In [None]:
# Register the service context as the global default through llama_index's
# public setter rather than assigning to the module attribute directly.
# Calls that are not given an explicit service_context then use these settings.
llama_index.set_global_service_context(service_context)

### Load PyData schedule

**Load JSON file with the PyData Amsterdam 2023 schedule**
* source: https://amsterdam2023.pydata.org/cfp/schedule/export/schedule.json
* retrieved: 2023-08-10

In [None]:
# Read the schedule with an explicit encoding: the JSON export is UTF-8, while
# open() without `encoding` falls back to the platform's locale encoding.
with open(DATA_PATH, encoding="utf-8") as infile:
    schedule = json.load(infile)
# Log after the file has been closed; only the read needs the open handle.
logger.info(f"Loaded the PyData schedule JSON from file {DATA_PATH}")

2023-08-23T13:42:40.736374+0200 - INFO - Loaded the PyData schedule JSON from file data/pydata/schedule.json


**Extract the talks from the schedule**

In [None]:
# Walk days -> rooms -> talks and collect every talk, keyed by its guid.
source_file = str(DATA_PATH)
talks = {}
for day in schedule['schedule']['conference']['days']:
    for room_talks in day['rooms'].values():
        for talk in room_talks:
            # annotate each talk with the file it came from and a fixed category label
            talk.update(
                filename=source_file,
                category="Conference talk at PyData Amsterdam 2023",
            )
            talks[talk['guid']] = talk

logger.info(f"Loaded {len(talks)} talks from the PyData schedule JSON!")

2023-08-23T15:45:24.710455+0200 - INFO - Loaded 67 talks from the PyData schedule JSON!


In [None]:
# Show a single talk (position 12 in insertion order) to illustrate the data layout.
print("Example of a PyData talk:")
example_talk = list(talks.values())[12]
pprint.pprint(example_talk)

Example of a PyData talk:
{'abstract': 'Lorem ipsum dolor',
 'answers': [],
 'attachments': [],
 'category': 'Conference talk at PyData Amsterdam 2023',
 'date': '2023-09-14T09:30:00+02:00',
 'description': 'Lorem ipsum dolor',
 'do_not_record': False,
 'duration': '00:50',
 'filename': 'data/pydata/schedule.json',
 'guid': 'e82f37c8-03f9-5cb5-92f2-8f157924b59d',
 'id': 267,
 'language': 'en',
 'links': [],
 'logo': '',
 'persons': [{'answers': [],
              'biography': 'Vicki Boykis works on end-to-end ML applications. '
                           'Her interests include the intersection of '
                           'information retrieval and large language models, '
                           'applying engineering best practices to machine '
                           'learning, and Nutella.  She works at Duo Security '
                           'and she lives in Philadelphia with her family. Her '
                           'favorite hobby was making terrible jokes on '
    

**Turn the talks data into llama_index Documents**

In [None]:
# One Document per talk: title, abstract and description separated by blank
# lines as the text, and the talk's guid as the document id.
documents = [
    llama_index.Document(
        id_=talk["guid"],
        text=f"{talk['title']}\n\n{talk['abstract']}\n\n{talk['description']}",
    )
    for talk in talks.values()
]

In [None]:
# Show the Document at position 12, converted to a plain dict for pretty-printing.
print("Example of a PyData talk Document:")
example_document = documents[12]
pprint.pprint(dict(example_document))

Example of a PyData talk Document:
{'embedding': None,
 'end_char_idx': None,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'hash': 'baf26eaa727340ec583dbc2b3d661646147352bb711318674ab72326813efbe6',
 'id_': 'e82f37c8-03f9-5cb5-92f2-8f157924b59d',
 'metadata': {},
 'metadata_seperator': '\n',
 'metadata_template': '{key}: {value}',
 'relationships': {},
 'start_char_idx': None,
 'text': 'Keynote Vicki Boykis\n\nLorem ipsum dolor\n\nLorem ipsum dolor',
 'text_template': '{metadata_str}\n\n{content}'}


### Create vector index from PyData schedule

In [None]:
# Build the vector index from the talk documents, using the service context
# configured above.
logger.info(f"Building a VectorStoreIndex from {len(documents)} documents")
index = llama_index.VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

# Persist the index so it can be loaded from disk again later.
index.storage_context.persist(persist_dir=INDEX_PATH)
logger.info(f"Saved VectorStoreIndex to {INDEX_PATH}")

2023-08-23T15:45:42.942570+0200 - INFO - Building a VectorStoreIndex from 67 documents
2023-08-23T15:45:46.678834+0200 - INFO - Saved VectorStoreIndex to indices/pydata_schedule_index


## Load vector index with PyData Amsterdam 2023 schedule

In [None]:
# Load the vector index that was persisted under INDEX_PATH.
if not INDEX_PATH.exists():
    logger.error("Index file for part 1 does not exist on disk. :(")
else:
    try:
        # rebuild storage context from disk
        storage_context = llama_index.StorageContext.from_defaults(persist_dir=INDEX_PATH)
        # No service_context is passed here.
        # NOTE(review): presumably the globally registered service context is
        # picked up instead — confirm against the llama_index version in use.
        index = llama_index.load_index_from_storage(storage_context)
    except Exception:
        # Best effort: report the failure with its traceback instead of raising.
        logger.exception("Could not load index from local storage")
    else:
        logger.info("Loaded index from local storage")

2023-08-23T15:45:46.922309+0200 - INFO - Loaded index from local storage


## Create a search engine from vector index

In [None]:
# create a search engine: a retriever over the vector index, using the
# index's default retriever settings (no arguments are passed)
retriever = index.as_retriever()

## Query the search engine

In [None]:
# Search for "llama_index" and list each hit's score and talk title.
results = retriever.retrieve("llama_index")
for hit in results:
    # documents were created with the talk guid as id, so the id of a hit's
    # source node is used to look the talk up in `talks`
    matched_talk = talks[hit.node.source_node.node_id]
    print(f"- score: {round(hit.score, 2)} title: _{matched_talk['title']}_")

- score: 0.35 title: _Building a personal search engine with llama-index_
- score: 0.24 title: _Unconference #1_


### Startups

In [None]:
# Same search as before, now for the topic "startups".
results = retriever.retrieve("startups")
for hit in results:
    talk_guid = hit.node.source_node.node_id
    rounded_score = round(hit.score, 2)
    print(f"- score: {rounded_score} title: _{talks[talk_guid]['title']}_")

- score: 0.32 title: _Kickstart AI sponsored drinks [time & location TBD]_
- score: 0.26 title: _Power Users, Long Tail Users, and Everything In Between: Choosing Meaningful Metrics and KPIs for Product Strategy_


## Querying the vector index with an external LLM

In [None]:
import openai

In [None]:
# Query engine over the same index, created with default settings.
# NOTE(review): presumably this answers with the OpenAI LLM from the service
# context, so a valid API key is needed — confirm.
query_engine = index.as_query_engine()

In [None]:
# Ask a natural-language question; an authentication failure (for example a
# bad API key) is logged instead of raised.
question = "Which talks are probably interesting for startup founders?"
try:
    response = query_engine.query(question)
except openai.error.AuthenticationError as err:
    logger.error(err)

In [None]:
# display the `response` attribute of the query result as the cell output
response.response

'The talks that are probably interesting for startup founders are "Setting The Right KPIs" and "Data-Driven Decision Making". These talks discuss topics such as setting realistic and challenging KPIs and leveraging data for informed decision-making and product strategy adjustments, which are important for startup founders in shaping their product strategy and making data-driven decisions.'

In [None]:
# List the titles of the talks whose nodes came back as sources for the answer.
print("Sources:")
for source in response.source_nodes:
    source_talk = talks[source.node.source_node.node_id]
    print("-", source_talk['title'])

Sources:
- Kickstart AI sponsored drinks [time & location TBD]
- Power Users, Long Tail Users, and Everything In Between: Choosing Meaningful Metrics and KPIs for Product Strategy
