# ELSER the State of the Union

## Make sure ELSER is deployed and started

Full documentation, including instructions for air-gapped environments, is here: https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-elser.html

The easiest way to deploy it, though, is probably from Dev Tools:

```PUT _ml/trained_models/.elser_model_1```
```json
{
  "input": {
	"field_names": ["text_field"]
  }
}
```

Then, once the ELSER download from Elastic's cloud repository is complete, start the deployment:

```POST _ml/trained_models/.elser_model_1/deployment/_start?deployment_id=for_search```


## Our State of the Union speeches

Let's clean up the text a bit

In [2]:
import pickle
from icecream import ic


# Pre-scraped State of the Union speeches, stored as a pickle next to the notebook.
PICKLE_FILE = "./STATE_OF_THE_UNION.pickle"

# SECURITY NOTE: unpickling executes arbitrary code embedded in the file.
# Only load a pickle that you created yourself or obtained from a trusted source.
# No `speeches = None` placeholder: if the load fails the cell raises here,
# instead of leaving a None behind for later cells to trip over.
with open(PICKLE_FILE, 'rb') as f:
    speeches = pickle.load(f)



In [3]:
## Utility code
def write_strings_to_file(strings, file_path):
    """Write each string in ``strings`` to ``file_path``, one per line.

    The file is created or truncated. A newline is appended after every
    string, including the last one.

    Args:
        strings: iterable of ``str`` values to write.
        file_path: destination path.
    """
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # raises UnicodeEncodeError on characters it cannot represent.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in strings)

def write_docs_to_file(docs, file_path):
    """Write the ``page_content`` of each document to ``file_path``.

    The file is created or truncated. Every document's text is followed by a
    blank line (two newline characters).

    Args:
        docs: iterable of objects exposing a ``page_content`` string attribute.
        file_path: destination path.
    """
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # raises UnicodeEncodeError on characters it cannot represent.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(doc.page_content + '\n\n' for doc in docs)

In [4]:
import os
from icecream import ic
from dotenv import load_dotenv
load_dotenv(".env", override=True)

from elasticsearch import Elasticsearch

# Connect to Elasticsearch using settings read from the environment.
# ELASTIC_CLOUD_ID takes precedence over ELASTIC_URL when both are set.
es = None

if 'ELASTIC_CLOUD_ID' in os.environ or 'ELASTIC_URL' in os.environ:
    # Both connection styles share the same credentials and timeout.
    es_options = {
        "basic_auth": (os.environ['ELASTIC_USER'], os.environ['ELASTIC_PASSWORD']),
        "request_timeout": 30,
    }
    if 'ELASTIC_CLOUD_ID' in os.environ:
        es = Elasticsearch(cloud_id=os.environ['ELASTIC_CLOUD_ID'], **es_options)
    else:
        es = Elasticsearch(os.environ['ELASTIC_URL'], **es_options)
else:
    print("env needs to set either ELASTIC_CLOUD_ID or ELASTIC_URL")

# Smoke-test the connection; left untouched when no client was configured.
if es:
    ic(es.info()['tagline']) # should return cluster info

ic| es.info()['tagline']: 'You Know, for Search'


In [5]:
from langchain.vectorstores.elasticsearch import ElasticsearchStore

# Index that will hold one document per speech paragraph.
index_name = "elser_sotu_paragraphs"

# Sparse-vector retrieval strategy for the store, built on the `es` client.
sparse_strategy = ElasticsearchStore.SparseVectorRetrievalStrategy()
elastic_vector_search = ElasticsearchStore(
    index_name=index_name,
    es_connection=es,
    strategy=sparse_strategy,
)

In [36]:
from tqdm import tqdm
import re
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Collects one Document per paragraph/chunk across all speeches.
documents = []

# Character-based chunker: pieces of at most 800 characters (measured with
# len), with no overlap between consecutive pieces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=True,
)

# A page-break marker is a section consisting solely of e.g. "[[Page H123]]".
PAGE_BREAK_PATTERN = re.compile(r'^\[\[Page ([A-Za-z0-9]+)\]\]$')
# A paragraph ends where a '.' or '!' is immediately followed by a newline.
# The lookbehind keeps the punctuation attached to the paragraph it closes
# (splitting on r'[.!]\n' would swallow it).
PARAGRAPH_BREAK_PATTERN = re.compile(r'(?<=[.!])\n')
# Speeches yielding fewer paragraphs than this are treated as a wall of text.
MIN_PARAGRAPHS = 5


def remove_page_breaks(text):
    """Return ``text`` with page-break marker sections removed.

    The text is split on blank lines, sections that are only a page marker
    are dropped, and the remaining sections are re-joined with a single
    newline.
    """
    # Filter into a new list: deleting from a list while enumerating it skips
    # the element after every deletion, so adjacent markers would survive.
    kept = [
        section
        for section in text.split("\n\n")
        if not PAGE_BREAK_PATTERN.match(section.strip())
    ]
    return "\n".join(kept)


def build_documents(pieces, speech_metadata):
    """Wrap each text piece in a Document tagged with its position in the speech.

    Args:
        pieces: ordered list of paragraph/chunk strings for one speech.
        speech_metadata: dict of per-speech fields copied onto every Document.

    Returns:
        A list of Documents whose metadata is ``{"chunk": <index>, **speech_metadata}``.
    """
    return [
        Document(page_content=piece, metadata={"chunk": chunk_no, **speech_metadata})
        for chunk_no, piece in enumerate(pieces)
    ]


for speech in tqdm(speeches):
    administration = speech["administration"]
    date_iso = speech["date_iso"]
    sotu_id = f"{administration}-{date_iso}"
    print(sotu_id)

    speech_metadata = {
        "date": speech["date"],
        "date_iso": date_iso,
        "url": speech["url"],
        "administration": administration,
        "sotu_id": sotu_id,
    }

    ## the speeches have page breaks that are surrounded by empty lines
    text = remove_page_breaks(speech["text"].strip())

    ## paragraphs of speech end with a '.' or a '!'
    sections = PARAGRAPH_BREAK_PATTERN.split(text)
    num_sections = len(sections)

    if num_sections < MIN_PARAGRAPHS:
        ## some of the documents are just a wall of text
        ## we'll parse them using a langchain chunker
        print(f"{sotu_id} has only {num_sections} sections")
        pieces = [
            chunk
            for sec in sections
            for chunk in text_splitter.split_text(sec)
        ]
    else:
        # get rid of the line breaks that are just formatting, and drop
        # anything that ends up empty so no blank documents are indexed
        cleaned = (sec.replace("\n", " ").strip() for sec in sections)
        pieces = [paragraph for paragraph in cleaned if paragraph]

    documents.extend(build_documents(pieces, speech_metadata))

# Give every chunk a deterministic _id ("<sotu_id>-<chunk>") so that re-running
# this cell overwrites the same documents instead of appending a second copy
# of the whole corpus. Documents indexed by earlier runs without explicit ids
# are not removed; delete/recreate the index once to get rid of them.
document_ids = [
    f"{doc.metadata['sotu_id']}-{doc.metadata['chunk']}" for doc in documents
]

results = elastic_vector_search.add_documents(
    documents,
    ids=document_ids,
    bulk_kwargs={
        "chunk_size": 16,
        "max_chunk_bytes": 200000000
    }
)
# Make the newly indexed documents searchable right away.
elastic_vector_search.client.indices.refresh(index=index_name)


100%|██████████| 31/31 [00:00<00:00, 215.86it/s]


Biden-2023-02-07
Biden-2022-03-01
Biden-2022-03-01 has only 1 sections
Biden-2021-04-28
Biden-2021-04-28 has only 1 sections
Trump-2020-02-04
Trump-2020-02-04 has only 1 sections
Trump-2019-02-05
Trump-2019-02-05 has only 1 sections
Trump-2018-01-30
Trump-2018-01-30 has only 1 sections
Trump-2017-02-28
Trump-2017-02-28 has only 1 sections
Obama-2016-01-12
Obama-2016-01-12 has only 1 sections
Obama-2015-01-20
Obama-2015-01-20 has only 1 sections
Obama-2014-01-28
Obama-2014-01-28 has only 1 sections
Obama-2013-02-12
Obama-2013-02-12 has only 1 sections
Obama-2012-01-24
Obama-2012-01-24 has only 1 sections
Obama-2011-01-25
Obama-2011-01-25 has only 1 sections
Obama-2010-01-27
Obama-2010-01-27 has only 1 sections
Obama-2009-02-24
Obama-2009-02-24 has only 1 sections
Bush43-2008-02-04
Bush43-2007-01-29
Bush43-2006-02-06
Bush43-2005-02-07
Bush43-2004-01-26
Bush43-2003-02-03
Bush43-2002-02-04
Bush43-2001-03-05
Clinton-2000-01-31
Clinton-1999-01-25
Clinton-1998-02-02
Clinton-1997-02-10
Clinton

ObjectApiResponse({'_shards': {'total': 2, 'successful': 2, 'failed': 0}})