# ELSER the State of the Union

## Make sure ELSER is deployed and started

Full documentation, including air-gap instructions, is here: https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-elser.html

However, the easiest way to do it is probably to run the following in Dev Tools:

```PUT _ml/trained_models/.elser_model_2_linux-x86_64```
```json
{
  "input": {
    "field_names": ["text_field"]
  }
}
```

and then, after the ELSER download from Elastic's cloud repo is complete:

```POST _ml/trained_models/.elser_model_2_linux-x86_64/deployment/_start?deployment_id=for_search```


## Our State of the Union speeches

Let's load the pickled speeches and take a look at the first one

In [None]:
! pip install icecream

import pickle
from icecream import ic


# Relative path of the pickle file that holds the speeches.
PICKLE_FILE = "./STATE_OF_THE_UNION.pickle"

# Start from None, then replace it with the unpickled contents of PICKLE_FILE.
# NOTE(review): pickle.load can run arbitrary code; only load a trusted file.
speeches = None
with open(PICKLE_FILE, mode='rb') as pickle_stream:
    speeches = pickle.load(pickle_stream)


# Show the first entry as a quick look at what was loaded.
print(speeches[0])

In [None]:
## Utility code
def write_strings_to_file(strings, file_path):
    """Write every string in *strings* to *file_path*, one per line.

    The file is opened in write mode, so existing content is replaced.

    Args:
        strings: Iterable of ``str``; a newline is appended to each item.
        file_path: Destination path.
    """
    # Explicit UTF-8 so non-ASCII text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in strings)

def write_docs_to_file(docs, file_path):
    """Write the ``page_content`` of every item in *docs* to *file_path*.

    Each entry is followed by a blank line. The file is opened in write
    mode, so existing content is replaced.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
        file_path: Destination path.
    """
    # Explicit UTF-8 so non-ASCII text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(doc.page_content + '\n\n' for doc in docs)

import json
# pretty printing JSON objects
def json_pretty(input_object):
    """Print *input_object* serialized as JSON with a four-space indent."""
    rendered = json.dumps(input_object, indent=4)
    print(rendered)


import textwrap
# wrap text when printing, because colab scrolls output to the right too much
def wrap_text(text, width):
    """Return *text* re-flowed so that no line is longer than *width*.

    The wrapped lines are joined with newline characters into one string.
    """
    return textwrap.fill(text, width)

def print_light_blue(text):
    """Print *text* between the ANSI escape codes 94 (colour on) and 0 (reset)."""
    colored = f'\033[94m{text}\033[0m'
    print(colored)

In [None]:
import os
from icecream import ic
from dotenv import load_dotenv
load_dotenv(".env", override=True)

from elasticsearch import Elasticsearch

def _version_tuple(version_string):
    """Return the leading numeric dotted components of a version as ints.

    Anything after a "-" (for example a "-SNAPSHOT" suffix) is ignored, and
    parsing stops at the first component that is not purely digits, so
    "8.13.0-SNAPSHOT" becomes (8, 13, 0).
    """
    numeric_parts = []
    for piece in version_string.split("-", 1)[0].split("."):
        if not piece.isdigit():
            break
        numeric_parts.append(int(piece))
    return tuple(numeric_parts)


# Stays None when neither connection setting is present in the environment.
es = None

if 'ELASTIC_CLOUD_ID' in os.environ:
    # Connect through an Elastic Cloud deployment ID.
    es = Elasticsearch(
        cloud_id=os.environ['ELASTIC_CLOUD_ID'],
        basic_auth=(os.environ['ELASTIC_USER'], os.environ['ELASTIC_PASSWORD']),
        request_timeout=30
    )
elif 'ELASTIC_URL' in os.environ:
    # Connect to an explicit cluster URL.
    es = Elasticsearch(
        os.environ['ELASTIC_URL'],
        basic_auth=(os.environ['ELASTIC_USER'], os.environ['ELASTIC_PASSWORD']),
        request_timeout=30
    )
else:
    print("env needs to set either ELASTIC_CLOUD_ID or ELASTIC_URL")

if es:
    # Fetch the cluster info once and reuse it for both values below.
    cluster_info = es.info()
    ic(cluster_info['tagline']) # should return cluster info
    version = ic(cluster_info['version']['number'])
    # Compare numerically: as plain strings "8.9.0" sorts after "8.13.0",
    # which would hide the warning on clusters that are actually too old.
    if _version_tuple(version) < (8, 13):
        print("WARNING THIS LAB ASSUMES You are on Elasticsearch Version 8.13 or higher")

In [None]:
! pip install -q langchain==0.1.13 langchain-elasticsearch==0.1.1

from langchain_elasticsearch import ElasticsearchStore

# Name of the Elasticsearch index that holds the chunked speeches; the later
# cells delete it, attach the vector store to it, and refresh it.
index_name = "sotu_chunks_elser"


### Function to Chunk the data 

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from elasticsearch.exceptions import NotFoundError
from langchain.docstore.document import Document
import re
import copy
from tqdm import tqdm

# Splitter used by processSpeech: chunk size 800 measured with len(), no
# overlap between consecutive chunks, separators interpreted as regexes.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=True,
)

# First speech of the loaded collection (not used elsewhere in the visible cells).
firstSpeech = speeches[0]


# Small copy.deepcopy demonstration on a nested dict (not used elsewhere in
# the visible cells).
original_dict = {
    'key1': 'value1',
    'key2': ['list', 'elements'],
    'key3': {'nested': 'dict'},
}
deep_copied_dict = copy.deepcopy(original_dict)

## this function processes a speech and turns it into Langchain Documents
def processSpeech(speech):
    """Split one speech into chunks and wrap each chunk in a Document.

    Reads the "date", "date_iso", "url", "administration" and "text" keys of
    *speech*. The stripped text is split with the module-level
    ``text_splitter``. Each returned Document carries the first four values,
    a derived "sotu_id" ("<administration>-<date_iso>") and its zero-based
    "chunk" position as metadata.

    Returns:
        A list of Documents, one per chunk, in chunk order.
    """
    date, date_iso, url, administration = (
        speech[key] for key in ("date", "date_iso", "url", "administration")
    )
    shared_metadata = {
        "date": date,
        "date_iso": date_iso,
        "url": url,
        "administration": administration,
        "sotu_id": f"{administration}-{date_iso}",
    }

    pieces = text_splitter.split_text(speech["text"].strip())

    # Every Document gets its own deep copy of the shared metadata, extended
    # with the position of its chunk within the speech.
    return [
        Document(
            page_content=piece,
            metadata={**copy.deepcopy(shared_metadata), "chunk": position},
        )
        for position, piece in enumerate(pieces)
    ]


### Index the data 

In [None]:
# Start from a clean slate: drop the index if an earlier run left one behind.
try:
    es.indices.delete(index=index_name)
except NotFoundError:
    print(f"The index '{index_name}' was not found, but that's okay ...moving on")

# Vector store bound to the index, using the sparse-vector retrieval strategy
# with the ELSER model named below.
elastic_vector_search = ElasticsearchStore(
    es_connection=es,
    index_name=index_name,
    strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(model_id=".elser_model_2_linux-x86_64")
)


# Running total of chunk documents sent to the store.
docs_inserted = 0

for speech in tqdm(speeches):
    chunkDocuments = processSpeech(speech)

    elastic_vector_search.add_documents(
        chunkDocuments,
        bulk_kwargs={
            "chunk_size": 16,
            "max_chunk_bytes": 200000000
        }
    )

    docs_inserted += len(chunkDocuments)

# Nothing reads the index while it is being filled, so one explicit refresh
# after the last speech is enough; refreshing per speech only adds requests.
elastic_vector_search.client.indices.refresh(index=index_name)



    

In [None]:
# Ask for the single best-matching chunk (k=1) for an unfiltered question.
elastic_vector_search.similarity_search(
    query="What did Biden say about the people of Ukraine?",
    k=1,
)

In [None]:
# Same single-hit search with a different question and no metadata filter.
elastic_vector_search.similarity_search(
    query="What did Clinton say about Ukraine?",
    k=1,
)

In [None]:
# Term filter on the administration metadata field with the value "Clinton".
# Named administration_filter so the builtin filter() is not shadowed for the
# rest of the notebook session.
administration_filter = [{"term": {"metadata.administration.keyword": "Clinton"}}]
elastic_vector_search.similarity_search(
    query="What did Clinton say about Ukraine?",
    filter=administration_filter,
    k=1,
)