# Build and Load the Event Corpus
We'll now build our event corpus. This is where we'll put events with their full description: it contains the "raw" description before we run a curation step to extract the talks provided.

We'll re-use the Parquet from the previous slide.

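A key thing for this corpus is to include five filter attributes, which demonstrate the power of Semantic Search combined with Key-Value searches. We will define these five filter attributes below, along with a non-indexed `url` attribute that is stored with each document but is not used for filtering: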

* **event_date:** When the event occurred in yyyy-mm-dd format
* **event_year:** Which year the event occurred
* **event_month:** Which month the event occurred
* **event_type:** Delivery format (online or physical)
* **is_online:** Whether this was an online event (boolean)
* **url:** A trackback to meetup.com

In [21]:
from vectara_client.core import Factory
from vectara_client.admin import CorpusBuilder
import logging

# Notebook-wide logging: time, logger name (padded to 35 characters), level and message.
logging.basicConfig(format='%(asctime)s:%(name)-35s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S %z')
# Ask the "OAuthUtil" logger to emit WARNING and above only.
logging.getLogger("OAuthUtil").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# Build the API client and keep a handle on its corpus manager.
client = Factory().build()
manager = client.corpus_manager

# Corpus definition: five indexed filter attributes plus a non-indexed "url"
# that is carried along as a trackback only.
corpus = (CorpusBuilder("AICamp Events")
         .description("This is where we put our events with their raw description")
         .add_attribute("event_date", "When the event occurred in yyyy-mm-dd format", type="text")
         .add_attribute("event_year", "Which year the event occurred")
         .add_attribute("event_month", "Which month the event occurred")
         .add_attribute("event_type", "Delivery format: (online or physical)")
         .add_attribute("is_online", "Whether this was an online event (boolean)", type="boolean")
         .add_attribute("url", "A trackback to meetup.com", indexed=False)

         # We can't add these yet, we need to do this in our curated corpus.
         #.add_attribute("presenters", "List of presenters", type="text_list")
         #.add_attribute("organization", "List of organizations of the Speakers.", type="text_list")
         .build())

# NOTE(review): delete_existing=True presumably replaces a corpus of the same
# name so the cell can be re-run - confirm against the client library.
corpus_id = manager.create_corpus(corpus, delete_existing=True)

21:11:11 +1000:Factory                             INFO:initializing builder
21:11:11 +1000:Factory                             INFO:Factory will load configuration from home directory
21:11:11 +1000:HomeConfigLoader                    INFO:Loading configuration from users home directory [C:\Users\david]
21:11:11 +1000:HomeConfigLoader                    INFO:Loading default configuration [default]
21:11:11 +1000:HomeConfigLoader                    INFO:Parsing config
21:11:11 +1000:root                                INFO:We are processing authentication type [OAuth2]
21:11:11 +1000:OAuthUtil                           INFO:Using provided OAuth2 URL [https://vectara-prod-1623270172.auth.us-west-2.amazoncognito.com/oauth2/token]
21:11:11 +1000:OAuthUtil                           INFO:OAuth2 URL is [https://vectara-prod-1623270172.auth.us-west-2.amazoncognito.com/oauth2/token]
21:11:11 +1000:root                                INFO:initializing Client
21:11:11 +1000:CorpusManager        

In [23]:
import pandas as pd
import duckdb
import pyarrow as pa

# SQL used to load the raw meetup events from Parquet and to inspect the result.
create_table_sql = "CREATE TABLE meetups_raw AS SELECT * FROM '../output/meetups_raw.parquet';"
describe_sql = "DESCRIBE meetups_raw;"

con = duckdb.connect()
con.execute(create_table_sql)

# Keep the table's schema as a DataFrame so it can be displayed in the next cell.
description_df = con.execute(describe_sql).fetchdf()

In [24]:
# Display the result of "DESCRIBE meetups_raw;" (one row per table column).
description_df

Unnamed: 0,column_name,column_type,null,key,default,extra
0,id,VARCHAR,YES,,,
1,title,VARCHAR,YES,,,
2,description,VARCHAR,YES,,,
3,event_date,VARCHAR,YES,,,
4,event_year,VARCHAR,YES,,,
5,event_month,VARCHAR,YES,,,
6,url,VARCHAR,YES,,,
7,event_type,VARCHAR,YES,,,
8,is_online,BOOLEAN,YES,,,
9,num_going,BIGINT,YES,,,


In [29]:
import json

# Columns copied from each event row into the document's metadata, in the
# order they are serialized.
metadata_fields = ("event_date", "event_year", "event_month", "event_type", "is_online", "url")

# Pull every raw event out of DuckDB as a list of plain dicts (one per row).
events_df = con.execute("SELECT * FROM meetups_raw;").fetchdf()
events = events_df.to_dict('records')

vectara_documents = []

for event in events:
    # The metadata travels as a JSON string alongside the document.
    metadata = {field: event[field] for field in metadata_fields}
    metadata_json = json.dumps(metadata)

    # Each event becomes one document with a single section holding the
    # raw description text.
    section = {"text": event["description"]}
    to_index = {
        "document_id": event["id"],
        "title": event["title"],
        "metadata_json": metadata_json,
        "section": [section],
    }
    vectara_documents.append(to_index)

#logger.info(f"About to index:\n{json.dumps(to_index,indent=4)}")
#client.indexer_service.index_doc(corpus_id, to_index)
    

In [33]:
class SubIndexer:
    """Collect a share of the documents and index them sequentially.

    One instance is intended to be driven by one worker: documents are
    queued with ``add_doc`` and later sent, in order, by ``index_docs``.
    """

    def __init__(self, indexer_service, corpus_id):
        """Remember the indexing service and target corpus.

        Args:
            indexer_service: Object exposing ``index_doc(corpus_id, doc)``.
            corpus_id: Identifier of the corpus the documents are sent to.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.indexer_service = indexer_service
        self.corpus_id = corpus_id
        self.docs = []

    def add_doc(self, doc):
        """Queue ``doc`` for a later call to ``index_docs``."""
        self.docs.append(doc)

    def index_docs(self):
        """Index every queued document, logging failures instead of raising.

        Errors are handled per document so that one failure does not stop
        the remaining documents in this share from being indexed.
        """
        for doc in self.docs:
            try:
                self.indexer_service.index_doc(self.corpus_id, doc)
            except Exception as e:
                # Best effort for the lab: record the failure (with traceback)
                # and carry on with the next document.
                self.logger.exception("Error: %s", e)

# Number of workers the documents are spread across.
thread_count = 10
sub_indexers = [SubIndexer(client.indexer_service, corpus_id) for _ in range(thread_count)]

# Round-robin split: worker k receives documents k, k + thread_count,
# k + 2 * thread_count, ... in their original order.
for offset, sub_indexer in enumerate(sub_indexers):
    for doc in vectara_documents[offset::thread_count]:
        sub_indexer.add_doc(doc)



In [35]:
from threading import Thread

# One thread per sub-indexer, each running that sub-indexer's index_docs.
threads = [Thread(target=sub_indexer.index_docs) for sub_indexer in sub_indexers]
for thread in threads:
    thread.start()

# Wait for every worker to finish before moving on.
for index, thread in enumerate(threads):
    logger.info(f"Joining thread {index}")
    thread.join()
    


21:30:22 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-14 21:30:22.069200
21:30:22 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-14 21:30:22.075199
21:30:22 +1000:OAuthUtil                           INFO:Expiry            2024-05-14 22:11:12
21:30:22 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-14 21:30:22.079199
21:30:22 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-14 21:30:22.080201
21:30:22 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-14 21:30:22.081202
21:30:22 +1000:OAuthUtil                           INFO:Expiry            2024-05-14 22:11:12
21:30:22 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-14 21:30:22.082200
21:30:22 +1000:OAuthUtil                           INFO:Current timestamp 2024-05-14 21:30:22.083199
21:30:22 +1000:OAuthUtil                           INFO:Already authenticated with non-expired token, exp