# Curate Tech Talks from Events

We're now going to use the power of an LLM with a custom prompt to build our curated corpus.

Here are some principles of how this is going to work:

1. We load our IDs of meetups stored in Parquet
2. For each ID:
    1. We run a query against our event corpus with the metadata filter "doc.id = 'ID'", which limits the data to just that event
    2. We use a custom prompt which:
        * Looks at who was speaking, and what they were speaking about
        * Identifies if this was a paid or free event
        * Outputs the result into a JSON structure which we can leverage
    3. With our JSON in hand, we can now create our curated corpus of tech talks with speaker information.

In [3]:
## Standard Imports and Logging


In [1]:
from vectara_client.core import Factory
from vectara_client.admin import CorpusBuilder
import pandas as pd
import duckdb
import pyarrow as pa
import logging

# Configure root logging for the notebook: time, padded logger name, level
# and message on every line.
log_format = '%(asctime)s:%(name)-35s %(levelname)s:%(message)s'
logging.basicConfig(level=logging.INFO, format=log_format, datefmt='%H:%M:%S %z')

# Quieten the "OAuthUtil" logger so only warnings and above are emitted.
oauth_logger = logging.getLogger("OAuthUtil")
oauth_logger.setLevel(logging.WARNING)

logger = logging.getLogger(__name__)

In [2]:
# Build the API client, then keep a handle on its corpus manager for the
# corpus lookups and creation done later in the notebook.
factory = Factory()
client = factory.build()
manager = client.corpus_manager

21:40:29 +1000:Factory                             INFO:initializing builder
21:40:29 +1000:Factory                             INFO:Factory will load configuration from home directory
21:40:29 +1000:HomeConfigLoader                    INFO:Loading configuration from users home directory [C:\Users\david]
21:40:29 +1000:HomeConfigLoader                    INFO:Loading default configuration [default]
21:40:29 +1000:HomeConfigLoader                    INFO:Parsing config
21:40:29 +1000:root                                INFO:We are processing authentication type [OAuth2]
21:40:29 +1000:OAuthUtil                           INFO:Using provided OAuth2 URL [https://vectara-prod-1623270172.auth.us-west-2.amazoncognito.com/oauth2/token]
21:40:29 +1000:OAuthUtil                           INFO:OAuth2 URL is [https://vectara-prod-1623270172.auth.us-west-2.amazoncognito.com/oauth2/token]
21:40:29 +1000:root                                INFO:initializing Client


In [35]:
# Load the raw meetup rows from Parquet into an in-process DuckDB table and
# materialise them as a list of dicts (one dict per event).
con = duckdb.connect()
con.execute("CREATE TABLE meetups_raw AS SELECT * FROM '../output/meetups_raw.parquet';")

meetups_df = con.execute("SELECT * FROM meetups_raw;").fetchdf()
events = meetups_df.to_dict(orient='records')


In [36]:
# Resolve the id of the events corpus that the curation queries run against.
events_corpus_name = "AICamp Events"
corpus_id = manager.find_corpus_by_name(events_corpus_name)

09:09:05 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-15 09:09:05.288219
09:09:05 +1000:OAuthUtil                           INFO:Expiry            2024-05-14 22:55:20
09:09:05 +1000:OAuthUtil                           INFO:Token expiry within 5 seconds, refreshing
09:09:07 +1000:OAuthUtil                           INFO:Received OAuth token, will expire [05/15/2024, 10:09:07]
09:09:07 +1000:RequestUtil                         INFO:URL for operation list-corpora is: https://api.vectara.io/v1/list-corpora
09:09:08 +1000:CorpusManager                       INFO:Checking corpus with name [AICamp Events]
09:09:08 +1000:CorpusManager                       INFO:Our corpus id is [744]


In [37]:
from vectara_client.util import render_markdown
from IPython.display import display, Markdown

def render_response(query, response):
    """Show a query response as Markdown in the notebook output.

    The response is converted with ``render_markdown`` (called with
    ``show_search_results=False``) and handed to IPython's ``display``.
    """
    display(Markdown(render_markdown(query, response, show_search_results=False)))

In [41]:
# Custom summarization prompt; the curation loop passes it to the query
# service as `promptText`.
# It is a JSON chat-message array wrapped in template directives
# (#foreach / #if / #else / #end -- this looks like Apache Velocity syntax;
# confirm against the Vectara custom-prompt documentation):
#   * the system message asks for a JSON object with a "tech-sessions" array
#     (title, description, speakers with name/bio/company) plus a paid/free flag;
#   * every search result is replayed as a user/assistant message pair;
#   * the final user message requests the JSON answer for the query.
# The doubled backslashes emit a literal \" so quotes stay escaped inside the
# JSON string values of the template.
prompt = (
    '[ {"role": "system", "content": "Your job is to summarize search results about meetup tech sessions into JSON. You should use the following'
    ' structure: the outer element should have an array called \\"tech-sessions\\". For each \\"tech-session\\" found, add a title, description and array with speakers. Each speaker should have their name, bio and company in those fields.'
    ' Also include a boolean field whether the event is paid or free. Only use information provided in this chat. Respond in the language denoted by ISO 639 code \\"$vectaraLangCode\\"."}, \n'  # ,\n'
    '#foreach ($qResult in $vectaraQueryResults) \n'
    '   #if ($foreach.first) \n'
    '   {"role": "user", "content": "Search for \\"$esc.java(${vectaraQuery})\\", and give me the first search result."}, \n'
    '   {"role": "assistant", "content": "$esc.java(${qResult.getText()})" }, \n'
    '   #else \n'
    '   {"role": "user", "content": "Give me the \\"$vectaraIdxWord[$foreach.index]\\" search result."}, \n'
    '   {"role": "assistant", "content": "$esc.java(${qResult.getText()})" }, \n'
    '   #end \n'
    ' #end \n'
    '{"role": "user", "content": "Generate a JSON response in the format from the system prompt for the query \\"$esc.java(${vectaraQuery})\\" solely based on the search results in this chat." } ]')


# `json` is needed to parse the summaries; import it here so this cell does
# not rely on a later cell having been executed first.
import json

# Holds the raw summary text of the most recently processed event so it can
# be inspected after the loop.
summary_json = None

# For each event: query the corpus restricted to that event's document, ask
# the summarizer for JSON using the custom prompt, and attach the parsed
# result to the event under "sessions".
for event in events:
    # Named `event_id` to avoid shadowing the builtin `id`.
    event_id = event['id']
    logger.info(f"Curating event with id {event_id}")
    response = client.query_service.query(
        "What were the topics discussed at the meetup",
        corpus_id,
        summary=True,
        summarizer="vectara-summary-ext-v1.3.0",
        promptText=prompt,
        # Metadata filter limiting retrieval to this single event document.
        metadata=f"doc.id = '{event_id}'",
    )

    # Guard against a response that carries no summary entries
    # (assumes `response.summary` is a possibly-empty sequence or None).
    summaries = response.summary
    summary_json = summaries[0].text if summaries else None

    if not summary_json:
        logger.info(f"No summary for id {event_id}")
        continue

    try:
        event['sessions'] = json.loads(summary_json)
    except ValueError as e:
        # The summary text is not guaranteed to be valid JSON; such events
        # are left without a "sessions" key and reported here.
        logger.warning(f"Unable to generate speaker summary for {event_id}: {e}")
    

09:13:58 +1000:__main__                            INFO:Curating event with id 296597027
09:13:58 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-15 09:13:58.203315
09:13:58 +1000:OAuthUtil                           INFO:Expiry            2024-05-15 10:09:07
09:13:58 +1000:OAuthUtil                           INFO:Already authenticated with non-expired token, expiry is [1715731747]
09:13:58 +1000:RequestUtil                         INFO:URL for operation query is: https://api.vectara.io/v1/query
09:14:08 +1000:__main__                            INFO:Curating event with id 295366167
09:14:08 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-15 09:14:08.392301
09:14:08 +1000:OAuthUtil                           INFO:Expiry            2024-05-15 10:09:07
09:14:08 +1000:OAuthUtil                           INFO:Already authenticated with non-expired token, expiry is [1715731747]
09:14:08 +1000:RequestUtil                         INFO:URL for op

In [33]:
import json

# Parse the raw summary text held in `summary_json` and log it pretty-printed
# to show the structure that came back.
summary_dict = json.loads(summary_json)
pretty_summary = json.dumps(summary_dict, indent=4)
logger.info(f"Found Structure:\n{pretty_summary}")

22:21:46 +1000:__main__                            INFO:Found Structure:
{
    "tech-sessions": [
        {
            "title": "AI-driven language processing with Vector Search",
            "description": "ChatGPT opened the world to the significant advancement in natural language processing. Asking it questions as if talking to another human, it is able to deduce semantics and context to return relevant information. What if there was a way to retrieve data from your database not with SQL or SQL-like queries but with plain English questions? With a vector database like Cassandra, you can! With vector search, data objects are transformed into embeddings \u2013 vector representations in a high-dimensional space. The embeddings capture the semantic essence of data, enabling more nuanced searches. A vector search retrieves objects which are semantically-related, with an understanding of the context and meaning.",
            "speakers": [
                {
                    "name": "E

In [61]:
# Validate the parsed summary attached to every event and collect the usable
# tech sessions. Each collected session is tagged with its event id, its
# index within the event and a "speakers_valid" flag.
valid_sessions = []
events_by_id = {}

for event in events:
    events_by_id[event["id"]] = event

    # Guard clauses: skip events without a parsed summary or with an
    # unexpected top-level shape.
    if "sessions" not in event:
        logger.info(f"event [{event['id']}] is not valid, missing \"sessions\"")
        continue

    sessions = event["sessions"]
    if "tech-sessions" not in sessions:
        logger.info(f"sessions in event [{event['id']}] is not valid, missing \"tech-sessions\"")
        continue

    logger.debug(f"Event [{event['id']}] is valid")
    tech_sessions = sessions["tech-sessions"]
    if len(tech_sessions) == 0:
        logger.info(f"tech-sessions is empty in event [{event['id']}]")
        continue

    for session_index, tech_session in enumerate(tech_sessions):
        # A session missing any of these keys is dropped entirely; only the
        # first missing key (in this order) is reported.
        missing_key = next(
            (key for key in ("title", "description", "speakers") if key not in tech_session),
            None,
        )
        if missing_key is not None:
            logger.info(f"In event [{event['id']}], session [{session_index}], {missing_key} is missing")
            continue

        # Sessions with empty or incomplete speaker data are still kept, but
        # flagged with speakers_valid = False.
        speakers = tech_session["speakers"]
        speakers_valid = False
        if len(speakers) == 0:
            logger.info(f"In event [{event['id']}], session [{session_index}], speakers is empty")
        else:
            for speaker_index, speaker in enumerate(speakers):
                missing_field = next(
                    (field for field in ("name", "bio", "company") if field not in speaker),
                    None,
                )
                if missing_field is not None:
                    logger.info(f"In event [{event['id']}], session [{session_index}], speaker [{speaker_index}] {missing_field} is missing")
                    break
            else:
                # No speaker failed the checks; speaker_index is the index of
                # the last speaker examined.
                speakers_valid = True
                logger.info(f"In event [{event['id']}], session [{session_index}], speaker [{speaker_index}], Valid Speaker info")
        tech_session["speakers_valid"] = speakers_valid

        logger.debug(f"In event [{event['id']}], found valid session [{session_index}]")
        tech_session["event_id"] = event["id"]
        tech_session["session_index"] = session_index
        valid_sessions.append(tech_session)
        

10:35:55 +1000:__main__                            INFO:In event [296597027], session [0], speaker [0], Valid Speaker info
10:35:55 +1000:__main__                            INFO:In event [295366167], session [0], speaker [7], Valid Speaker info
10:35:55 +1000:__main__                            INFO:event [295512907] is not valid, missing "sessions"
10:35:55 +1000:__main__                            INFO:In event [294543015], session [0], speakers is empty
10:35:55 +1000:__main__                            INFO:In event [294222038], session [0], speaker [2], Valid Speaker info
10:35:55 +1000:__main__                            INFO:In event [293478387], session [0], speakers is empty
10:35:55 +1000:__main__                            INFO:In event [293038498], session [0], speaker [1], Valid Speaker info
10:35:55 +1000:__main__                            INFO:In event [291719089], session [0], speaker [1], Valid Speaker info
10:35:55 +1000:__main__                            INFO:In e

In [87]:
import copy

# Build persistable copies of the curated events: the parsed "sessions"
# structure is turned back into a JSON string stored under "sessions_json".
session_events = []

for source_event in events:
    # Work on a deep copy so the in-memory events keep their parsed sessions.
    persisted = copy.deepcopy(source_event)

    if 'sessions' in persisted:
        persisted['sessions_json'] = json.dumps(persisted.pop('sessions'))

    # Events that carry no session JSON are left out.
    if 'sessions_json' not in persisted:
        continue
    session_events.append(persisted)

# Preview the first few serialised session payloads.
for event in session_events[:5]:
    logger.info(f"Session object is:\n{event['sessions_json']}")

13:33:00 +1000:__main__                            INFO:Session object is:
{"tech-sessions": [{"title": "In-person AI Meetup - GenAI, LLMs and Vector Search", "description": "The event focused on AI-driven language processing with Vector Search. Participants learned how to perform vector searches on their data out-of-the-box with Cassandra. The event involved checkin, food/drink, networking, tech talks, and an open discussion.", "speakers": [{"name": "Erick Ramirez", "bio": "Erick Ramirez is a known figure in the field of AI-driven language processing and works with DataStax.", "company": "DataStax"}], "isPaid": false, "event_id": "296597027", "speakers_valid": true, "session_index": 0}]}
13:33:00 +1000:__main__                            INFO:Session object is:
{"tech-sessions": [{"title": "Virtual Summit: LLMs and the Generative AI Revolution", "description": "A deep dive into the revolutionary new world of LLMs, agents, auto-healing code, image generators, personalized tutors and mo

In [99]:
import pandas as pd
import duckdb
import pyarrow as pa

# Column order follows the key order of the first curated event.
columns = list(session_events[0])
logger.info(columns)

# Register the DataFrame with a fresh DuckDB connection under the name
# "session_events_df" and inspect the schema DuckDB derives for it.
con = duckdb.connect()
session_events_df = pd.DataFrame(session_events, columns=columns)
con.register('session_events_df', session_events_df)
output_df = con.execute("DESCRIBE session_events_df").fetchdf()
output_df
#session_events_raw = pa.Table.from_pandas(df)


13:42:23 +1000:__main__                            INFO:['id', 'title', 'description', 'event_date', 'event_year', 'event_month', 'url', 'event_type', 'is_online', 'num_going', 'sessions_json']


Unnamed: 0,column_name,column_type,null,key,default,extra
0,id,VARCHAR,YES,,,
1,title,VARCHAR,YES,,,
2,description,VARCHAR,YES,,,
3,event_date,VARCHAR,YES,,,
4,event_year,VARCHAR,YES,,,
5,event_month,VARCHAR,YES,,,
6,url,VARCHAR,YES,,,
7,event_type,VARCHAR,YES,,,
8,is_online,BOOLEAN,YES,,,
9,num_going,BIGINT,YES,,,


In [102]:
from pathlib import Path

# Write the curated events to Parquet, creating the output directory first
# if it does not exist yet.
output_dir = "../output"
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)

copy_sql = f"""COPY
    (SELECT * FROM session_events_df)
    TO '{output_dir}/session_events.parquet'
    (FORMAT 'parquet');"""
con.execute(copy_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x2342bbc1e70>

In [63]:
# Convert every collected tech session into a document for indexing: the
# session title becomes the document title, event attributes plus speaker and
# company lists become metadata, and the description forms the single section.
vectara_documents = []

for session in valid_sessions:
    event = events_by_id[session["event_id"]]

    description = session["description"]

    # Reset per session. Previously these were only assigned when the speaker
    # data was valid, so a session without valid speakers either reused the
    # lists of the preceding session or raised NameError.
    speakers = []
    companies = []

    if session["speakers_valid"]:
        # Semantic-friendly representation of the speakers, appended to the text.
        speaker_infos = []

        for speaker_index, speaker in enumerate(session["speakers"]):
            name = speaker["name"]
            company = speaker["company"]
            bio = speaker["bio"]

            speakers.append(name)
            companies.append(company)
            speaker_infos.append(f"Speaker {speaker_index+1}: {name}@{company} - {bio}.")

        speaker_fragment = "\n".join(speaker_infos)
        # Only sessions with valid speakers get the enriched text; others keep
        # the plain session description.
        description = f"Title:{event['title']}\nWhen: {event['event_date']}\n{description}\n{speaker_fragment}"

    metadata = {
        "event_date": event["event_date"],
        "event_year": event["event_year"],
        "event_month": event["event_month"],
        "event_type": event["event_type"],
        "is_online": event["is_online"],
        "url": event["url"],
        "speaker": speakers,
        "company": companies,
    }
    metadata_json = json.dumps(metadata)

    # Document id is "<event id>-<session index>" so several sessions of one
    # event stay distinct.
    to_index = {
        "document_id": f"{event['id']}-{session['session_index']}",
        "title": session["title"],
        "metadata_json": metadata_json,
        "section": [
            {
                "text": description,
            },
        ],
    }
    vectara_documents.append(to_index)

In [64]:
# Preview the first few documents that will be sent for indexing.
for doc in vectara_documents[:5]:
    pretty_doc = json.dumps(doc, indent=4)
    logger.info(f"Tech session:\n{pretty_doc}")

10:38:16 +1000:__main__                            INFO:Tech session:
{
    "document_id": "296597027-0",
    "title": "In-person AI Meetup - GenAI, LLMs and Vector Search",
    "metadata_json": "{\"event_date\": \"2023/10/17\", \"event_year\": \"2023\", \"event_month\": \"10\", \"event_type\": \"physical\", \"is_online\": false, \"url\": \"https://www.meetup.com/sydney-ai-tech-talks/events/296597027/\", \"speaker\": [\"Erick Ramirez\"], \"company\": [\"DataStax\"]}",
    "section": [
        {
            "text": "Title:In-person AI Meetup - GenAI, LLMs and Vector Search\nWhen: 2023/10/17\nThe event focused on AI-driven language processing with Vector Search. Participants learned how to perform vector searches on their data out-of-the-box with Cassandra. The event involved checkin, food/drink, networking, tech talks, and an open discussion.\nSpeaker 1: Erick Ramirez@DataStax - Erick Ramirez is a known figure in the field of AI-driven language processing and works with DataStax.."
    

In [80]:
from vectara_client.core import Factory
from vectara_client.admin import CorpusBuilder
from vectara_client.domain import Dimension
import logging

logging.basicConfig(format='%(asctime)s:%(name)-35s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S %z')
logger = logging.getLogger(__name__)

client = Factory().build()
manager = client.corpus_manager

# Define the curated-sessions corpus and the filter attributes that match the
# metadata keys written for each session document.
corpus = (CorpusBuilder("AICamp Sessions")
         .description("This is where we put our events with their raw description")
         .add_attribute("event_date", "When the event occurred in yyyy-mm-dd format", type="text")
         .add_attribute("event_year", "Which year the event occured")
         .add_attribute("event_month", "Which month the event occurred")
         .add_attribute("event_type", "Delivery format: (online or physical)")
         .add_attribute("is_online", "Whether this was an online event (boolean)", type="boolean")
         .add_attribute("url", "A trackback to meetups.com", indexed=False)
         .add_attribute("speaker", "List of presenters", type="text_list")
         .add_attribute("company", "List of organizations of the Speakers.", type="text_list")
         .build())

# Bind the dimension with `=`. The previous `going_dim : Dimension(...)` was a
# bare variable annotation: it evaluated the expression but never assigned
# `going_dim`, so the line below raised NameError in a fresh kernel.
# NOTE(review): the recorded run of this cell failed with "Custom dimension
# name cannot be empty" -- confirm the positional argument order of
# `Dimension` and whether "" is acceptable for the last two arguments.
going_dim = Dimension("going", "How many attended", "", "")

corpus.customDimensions = [going_dim]

# delete_existing=True is passed so the corpus is recreated on each run.
corpus_id = manager.create_corpus(corpus, delete_existing=True)

11:08:08 +1000:Factory                             INFO:initializing builder
11:08:08 +1000:Factory                             INFO:Factory will load configuration from home directory
11:08:08 +1000:HomeConfigLoader                    INFO:Loading configuration from users home directory [C:\Users\david]
11:08:08 +1000:HomeConfigLoader                    INFO:Loading default configuration [default]
11:08:08 +1000:HomeConfigLoader                    INFO:Parsing config
11:08:08 +1000:root                                INFO:We are processing authentication type [OAuth2]
11:08:08 +1000:OAuthUtil                           INFO:Using provided OAuth2 URL [https://vectara-prod-1623270172.auth.us-west-2.amazoncognito.com/oauth2/token]
11:08:08 +1000:OAuthUtil                           INFO:OAuth2 URL is [https://vectara-prod-1623270172.auth.us-west-2.amazoncognito.com/oauth2/token]
11:08:08 +1000:root                                INFO:initializing Client
11:08:08 +1000:CorpusManager        

Exception: Unable to create corpus due to: Status(code=<StatusCode.INVALID_ARGUMENT: 3>, statusDetail='Corpus creation failed. Custom dimension name cannot be empty', cause=None)

In [66]:
class SubIndexer:
    """Hold a batch of documents and index them one by one into a corpus.

    Documents are queued with ``add_doc`` and sent with ``index_docs``, which
    takes no arguments and can therefore be used directly as a thread target.
    """

    def __init__(self, indexer_service, corpus_id):
        """Store the indexing service, the target corpus id and an empty batch.

        Args:
            indexer_service: Object exposing ``index_doc(corpus_id, doc)``.
            corpus_id: Identifier of the corpus the documents are indexed into.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.indexer_service = indexer_service
        self.corpus_id = corpus_id
        self.docs = []

    def add_doc(self, doc):
        """Queue ``doc`` for a later ``index_docs`` call."""
        self.docs.append(doc)

    def index_docs(self):
        """Index every queued document, logging failures and carrying on.

        Errors are deliberately not re-raised (best effort for the lab). The
        ``try`` wraps each document individually so that one failing document
        does not prevent the rest of the batch from being indexed.
        """
        for doc in self.docs:
            try:
                self.indexer_service.index_doc(self.corpus_id, doc)
            except Exception as e:
                # Ignore for lab, but make the failure visible. The original
                # called the Logger object itself, which raised TypeError.
                self.logger.error(f"Error: {e}")

# One SubIndexer per worker; documents are dealt out round-robin so each
# worker receives a roughly equal share.
thread_count = 10
sub_indexers = [SubIndexer(client.indexer_service, corpus_id) for _ in range(thread_count)]

for position, document in enumerate(vectara_documents):
    sub_indexers[position % thread_count].add_doc(document)

In [67]:
from threading import Thread

# Run every SubIndexer's batch on its own thread, then wait for all of them.
threads = [Thread(target=sub_indexer.index_docs) for sub_indexer in sub_indexers]
for worker in threads:
    worker.start()

for index, thread in enumerate(threads):
    logger.info(f"Joining thread {index}")
    thread.join()

10:43:40 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-15 10:43:40.720973
10:43:40 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-15 10:43:40.724677
10:43:40 +1000:OAuthUtil                           INFO:Expiry            2024-05-15 11:42:12
10:43:40 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-15 10:43:40.724677
10:43:40 +1000:OAuthUtil                           INFO:Expiry            2024-05-15 11:42:12
10:43:40 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-15 10:43:40.731899
10:43:40 +1000:OAuthUtil                           INFO:Expiry            2024-05-15 11:42:12
10:43:40 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-15 10:43:40.731899
10:43:40 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-15 10:43:40.731899
10:43:40 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-15 10:43:40.740811
10:43:40 +1