## Configure connection to Elasticsearch

In [3]:
import os
from dotenv import load_dotenv
load_dotenv(".env", override=True)

from elasticsearch import Elasticsearch

es = None

# Decide which endpoint argument to pass; an Elastic Cloud ID takes precedence
# over a plain URL when both variables are present in the environment.
if 'ELASTIC_CLOUD_ID' in os.environ:
    _endpoint = {"cloud_id": os.environ['ELASTIC_CLOUD_ID']}
elif 'ELASTIC_URL' in os.environ:
    _endpoint = {"hosts": os.environ['ELASTIC_URL']}
else:
    _endpoint = None

if _endpoint is None:
    print("env needs to set either ELASTIC_CLOUD_ID or ELASTIC_URL")
else:
    # Credentials are only read once an endpoint has been found, so a missing
    # endpoint yields the message above rather than a KeyError.
    es = Elasticsearch(
        **_endpoint,
        basic_auth=(os.environ['ELASTIC_USER'], os.environ['ELASTIC_PASSWORD']),
        request_timeout=30,
    )

if es:
    # should return cluster info
    print(es.info()['tagline'])

You Know, for Search


## Retrieve the data from the pickle file

In [4]:
import pickle

# Location of the pickled speeches, relative to the current working directory.
PICKLE_FILE = "./STATE_OF_THE_UNION.pickle"

# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only run this cell against a pickle file you trust.
speeches = None
with open(PICKLE_FILE, 'rb') as f:
    speeches = pickle.load(f)

## let's look at the first speech
# (kept as the last expression so the notebook displays it)
speeches[0]

{'date': 'February 7, 2023',
 'administration': 'Biden',
 'url': 'https://www.govinfo.gov/content/pkg/CREC-2023-02-07/html/CREC-2023-02-07-pt1-PgS257-2.htm',
 'text': "\n\nCongressional Record, Volume 169 Issue 25 (Tuesday, February 7, 2023)\n\n\n[Congressional Record Volume 169, Number 25 (Tuesday, February 7, 2023)]\n[Senate]\n[Pages S257-S262]\nFrom the Congressional Record Online through the Government Publishing Office [www.gpo.gov]\n\n\n\n\n                          PRESIDENTIAL MESSAGE\n\n                                 ______\n                                 \n\n   REPORT ON THE STATE OF THE UNION DELIVERED TO A JOINT SESSION OF \n                   CONGRESS ON FEBRUARY 7, 2023--PM 1\n\n  The PRESIDING OFFICER laid before the Senate the following message \nfrom the President of the United States which was which was ordered to \nlie on the table:\n\nTo the Congress of the United States:\n  Mr. Speaker. Madam Vice President. Our First Lady and Second \nGentleman. Members of Con

## Insert the first document into a new index without setting the mapping

In [5]:
# Name of the scratch index used for this demonstration
index_name = 'genai_delete_me'

# let's start fresh: remove any index of that name left over from an earlier run
if es.indices.exists(index=index_name):
    print(f"Index '{index_name}' exists. Deleting...")
    # Delete the index
    es.indices.delete(index=index_name)
    print(f"Index '{index_name}' deleted.")

# Index the first speech without creating the index or a mapping beforehand.
response = es.index(index=index_name, document=speeches[0])
print(f"Document indexed with ID: {response['_id']}")

Document indexed with ID: CoCuzIoBsv39V8zkWYzW


It looks like things worked fine, but let's look at the mapping that was created

In [11]:
import json
# pretty printing JSON objects
def json_pretty(input_object, indent=4):
    """Print ``input_object`` to stdout as indented JSON.

    Args:
        input_object: Any value ``json.dumps`` can serialize (dict, list, ...).
        indent: Spaces per nesting level. Defaults to 4, the width this
            function has always used, so existing calls print the same text.

    Raises:
        TypeError: If ``input_object`` contains a value ``json.dumps`` cannot
            serialize.
    """
    print(json.dumps(input_object, indent=indent))

# Ask the cluster for the index's current mapping and print the response body as JSON.
mapping = es.indices.get_mapping(index=index_name)
json_pretty(mapping.body)

{
    "genai_delete_me": {
        "mappings": {
            "properties": {
                "administration": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "date": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                        }
                    }
                },
                "date_iso": {
                    "type": "date"
                },
                "text": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 256
                 

When Elasticsearch receives index requests for an index that has no mapping (schema) defined, it guesses the field types based on the first document received. This can be problematic.

You may need to expand the output to see the full mapping that was created. Observe the following

```json
"text": {
    "type": "text",
    "fields": {
        "keyword": {
            "type": "keyword",
            "ignore_above": 256
        }
    }
}
```

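The actual speech is being indexed as both full text and a keyword. The keyword sub-field is wasteful here: with `ignore_above: 256`, any value longer than 256 characters is not indexed as a keyword at all, so for full-length speeches it adds overhead without being usable.  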

```json
"date": {
    "type": "text",
    "fields": {
        "keyword": {
            "type": "keyword",
            "ignore_above": 256
        }
    }
},
"date_iso": {
    "type": "date"
}
```

The first date field, with values like ```'date': 'February 7, 2023'```, is guessed to be a string and gets the server default mapping. The keyword value here may actually be useful for fast search faceting, since each date is a distinct value as long as it is spelled and capitalized consistently across all records.

Let's do this again with an explicit mapping so that we are controlling how the data is indexed and not relying on the luck of Elasticsearch guessing mappings.

In [17]:
# let's start fresh
def delete_index(index_name):
    """Delete the index named ``index_name`` if it exists, printing progress.

    Returns without printing anything when the index is absent. Uses the
    notebook-level ``es`` client.
    """
    index_is_present = es.indices.exists(index=index_name)
    if not index_is_present:
        return

    print(f"Index '{index_name}' exists. Deleting...")
    es.indices.delete(index=index_name)
    print(f"Index '{index_name}' deleted.")

def create_index_with_mapping(index_name, properties):
    """Make sure ``index_name`` exists, then apply ``properties`` as its mapping.

    Uses the notebook-level ``es`` client.

    Args:
        index_name: Name of the index to create (when missing) and map.
        properties: Dict of field name -> field mapping definition, passed
            unchanged as the ``properties`` argument of ``put_mapping``.
    """
    # Check if the index exists, and if not, create it
    if not es.indices.exists(index=index_name):
        es.indices.create(index=index_name)

    # The mapping is applied whether or not the index was just created.
    es.indices.put_mapping(index=index_name, properties=properties)

# Explicit field mapping, used in place of Elasticsearch's guessed field types.
properties = {
    "administration": {"type": "keyword"},
    "date": {"type": "keyword"},
    "date_iso": {"type": "date"},
    "text": {"type": "text"},
    # The URL is mapped as text plus a keyword sub-field.
    "url": {
        "type": "text",
        "fields": {
            "keyword": {"type": "keyword", "ignore_above": 1024},
        },
    },
}

# Rebuild the index from scratch, this time with the explicit mapping.
delete_index(index_name=index_name)

try:
    create_index_with_mapping(index_name=index_name, properties=properties)

    response = es.index(index=index_name, document=speeches[0])
    print(f"Document indexed with ID: {response['_id']}")
    mapping = es.indices.get_mapping(index=index_name)
    json_pretty(mapping.body)
finally:
    ## clean up
    # In a finally block so the index is removed even when one of the
    # steps above raises; delete_index does nothing if the index is absent.
    delete_index(index_name=index_name)


Document indexed with ID: hR7RzIoBz9aOWw2c116t
{
    "genai_state_of_the_union": {
        "mappings": {
            "properties": {
                "administration": {
                    "type": "keyword"
                },
                "date": {
                    "type": "keyword"
                },
                "date_iso": {
                    "type": "date"
                },
                "text": {
                    "type": "text"
                },
                "url": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 1024
                        }
                    }
                }
            }
        }
    }
}
Index 'genai_state_of_the_union' exists. Deleting...
Index 'genai_state_of_the_union' deleted.


## Batch inserting all of the documents

Now we'll use the batch insertion commands of the elasticsearch Python library to insert documents into Elasticsearch.

Batch insertion is more efficient than inserting documents one at a time. If you need to go faster, adjusting the index settings to prevent near-real-time refreshes during a big batch insert will get you more performance. At this scale, we don't care.

In [18]:
from elasticsearch import Elasticsearch, helpers
from tqdm import tqdm

# Index that the full set of speeches is loaded into by this cell.
index_name = "genai_state_of_the_union"

# Start from an empty index that carries the explicit mapping in `properties`.
delete_index(index_name=index_name)
create_index_with_mapping(index_name=index_name, properties=properties)

# Number of documents per batch; read by bulkLoadIndex below.
BATCH_SIZE = 10  # Set your desired batch size here

def batchify(docs, batch_size):
    """Lazily yield consecutive slices of ``docs``, each up to ``batch_size`` long.

    ``docs`` must support ``len()`` and slicing; the final slice may be shorter
    than ``batch_size``.
    """
    batch_starts = range(0, len(docs), batch_size)
    yield from (docs[start:start + batch_size] for start in batch_starts)

def bulkLoadIndex(index_name, json_docs, batch_size=None):
    """Bulk-index ``json_docs`` into ``index_name``, one bulk call per batch.

    Uses the notebook-level ``es`` client. Errors returned by ``helpers.bulk``
    are printed rather than raised (``raise_on_error=False``).

    Args:
        index_name: Name of the target index.
        json_docs: Documents to index; each becomes the ``_source`` of one
            action. Must support ``len()`` and slicing (it is passed to
            ``batchify``).
        batch_size: Documents per bulk call. When omitted, the notebook-level
            ``BATCH_SIZE`` is used.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    # Materialized into a list so tqdm receives a sized sequence.
    batches = list(batchify(json_docs, batch_size))

    for batch in tqdm(batches, desc=f"Batches of size {batch_size}"):
        # Convert the JSON documents to the format required for bulk insertion
        bulk_docs = [
            {
                "_op_type": "index",
                "_index": index_name,
                "_source": doc
            }
            for doc in batch
        ]

        # Perform bulk insertion; the first element of the result is not needed here.
        _, errors = helpers.bulk(es, bulk_docs, raise_on_error=False)
        for error in errors:
            print(error)

# Load every speech into the index.
bulkLoadIndex(index_name=index_name, json_docs=speeches)

Batches of size 10: 100%|██████████| 4/4 [00:00<00:00,  6.20it/s]


Okay let's now retrieve some documents using a search

In [28]:
# Return at most this many hits, and only these fields of each hit's source.
size = 5
source_fields = ["administration", "url", "date"]

# query_string search for "Ukraine" with "*" as the default field.
query = {"query_string": {"query": "Ukraine", "default_field": "*"}}

response = es.search(index=index_name, query=query, source=source_fields, size=size)
json_pretty(response["hits"])

{
    "total": {
        "value": 11,
        "relation": "eq"
    },
    "max_score": 2.035987,
    "hits": [
        {
            "_index": "genai_state_of_the_union",
            "_id": "hx7SzIoBz9aOWw2cTF5M",
            "_score": 2.035987,
            "_source": {
                "date": "March 1, 2022",
                "administration": "Biden",
                "url": "https://www.govinfo.gov/content/pkg/DCPD-202200127/html/DCPD-202200127.htm"
            }
        },
        {
            "_index": "genai_state_of_the_union",
            "_id": "mB7SzIoBz9aOWw2cTV4z",
            "_score": 1.5143226,
            "_source": {
                "date": "February 7, 2005",
                "administration": "Bush43",
                "url": "https://www.govinfo.gov/content/pkg/WCPD-2005-02-07/html/WCPD-2005-02-07-Pg126.htm"
            }
        },
        {
            "_index": "genai_state_of_the_union",
            "_id": "jR7SzIoBz9aOWw2cTF5M",
            "_score": 1.4489859,
  

## We'll learn more about Streamlit

but we won't do it in the Python notebook. Let's go back to the README file for this week.

