# Loading Data into Elasticsearch
We are going to use Python to read the data and load it into *Elasticsearch*. The *elasticsearch* Python package must be installed to do that.

In [1]:
import json
import os
import re
from pprint import pprint

from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer


## Connecting to Elasticsearch 
# Connection settings can be overridden through environment variables so that
# credentials and machine-specific paths do not have to be edited in the
# notebook. The fallbacks are the original lab values, so the notebook still
# runs unchanged when no variable is set.
# NOTE(review): drop the password fallback before sharing this notebook.
ES_URL = os.environ.get('ES_URL', 'https://elk-single-node:9200')
ES_CA_CERTS = os.environ.get(
    'ES_CA_CERTS', '/home/vagrant/data/elasticsearch/certs/ca/ca.crt'
)
ES_USERNAME = os.environ.get('ES_USERNAME', 'elastic')
ES_PASSWORD = os.environ.get('ES_PASSWORD', 'Ucsc@1234')

es = Elasticsearch(
    ES_URL,
    ca_certs=ES_CA_CERTS,
    basic_auth=(ES_USERNAME, ES_PASSWORD),
)

# Sentence-embedding model; its encode() output is what gets stored in and
# searched against the 'embedding' field.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Printing the cluster info doubles as a connectivity check.
print(es.info())

  from .autonotebook import tqdm as notebook_tqdm


{'name': 'elk-single-node', 'cluster_name': 'elk-single-node', 'cluster_uuid': 'Gkdt5h9RSiC2fvIFhQUS7A', 'version': {'number': '8.11.1', 'build_flavor': 'default', 'build_type': 'deb', 'build_hash': '6f9ff581fbcde658e6f69d6ce03050f060d1fd0c', 'build_date': '2023-11-11T10:05:59.421038163Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


Let us now define the functions which we need to handle this operation. 

In [8]:
def create_index():
    """Recreate the ``my_documents-v2`` index from scratch.

    An existing index of that name is deleted first (a missing index is
    ignored), then a new one is created whose ``embedding`` field is mapped
    as a ``dense_vector``.
    """
    index_name = 'my_documents-v2'
    embedding_mapping = {'type': 'dense_vector'}

    es.indices.delete(index=index_name, ignore_unavailable=True)
    es.indices.create(
        index=index_name,
        mappings={'properties': {'embedding': embedding_mapping}},
    )

def insert_document(document):
    """Index one document together with an embedding of its summary.

    Args:
        document: Mapping describing the document; its ``'summary'`` value
            is passed to ``model.encode`` to build the ``embedding`` field.

    Returns:
        Whatever ``es.index`` returns for the request.
    """
    # Copy so the caller's mapping is not modified.
    enriched = dict(document)
    enriched['embedding'] = model.encode(document['summary'])
    return es.index(index='my_documents-v2', document=enriched)

def insert_documents(documents):
    """Bulk-index documents, attaching an embedding of each summary.

    Args:
        documents: Iterable of mappings. Each must have a ``'summary'`` key;
            its text is encoded with ``model.encode`` and stored under
            ``'embedding'``.

    Returns:
        The response of ``es.bulk``, or ``None`` when ``documents`` is empty.
        No request is sent in that case, because a bulk request without any
        operations is rejected by Elasticsearch.
    """
    # Materialise once so generators work and emptiness can be checked.
    documents = list(documents)
    if not documents:
        return None

    # Encode every summary in a single call so the model can batch the work
    # instead of being invoked once per document.
    embeddings = model.encode([document['summary'] for document in documents])

    operations = []
    for document, embedding in zip(documents, embeddings):
        # The bulk body alternates an action line and the document source.
        operations.append({'index': {'_index': 'my_documents-v2'}})
        operations.append({**document, 'embedding': embedding})
    return es.bulk(operations=operations)

def reindex(path='data.json'):
    """Recreate the index and load every document from a JSON file.

    Args:
        path: Location of the JSON file holding the documents to index.
            Defaults to ``'data.json'`` in the working directory.

    Returns:
        Whatever ``insert_documents`` returns for the loaded documents.
    """
    create_index()
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # locale-dependent default encoding.
    with open(path, 'rt', encoding='utf-8') as f:
        documents = json.load(f)
    return insert_documents(documents)

def search(**query_args):
    """Run a search against the ``my_documents-v2`` index.

    Every keyword argument is forwarded unchanged to ``es.search``, and the
    client's response is returned as-is.
    """
    response = es.search(index='my_documents-v2', **query_args)
    return response

def retrieve_document(id):
    """Fetch one document from the ``my_documents-v2`` index by its id.

    Returns the response of ``es.get`` unchanged.
    """
    document = es.get(index='my_documents-v2', id=id)
    return document


Let us now use the `reindex` function to create the index and load the data.

In [9]:
# Rebuild the index and bulk-load the documents; the returned bulk response
# is displayed as this cell's output.
reindex()

ObjectApiResponse({'errors': False, 'took': 139, 'items': [{'index': {'_index': 'my_documents-v2', '_id': 'IXLEW4wBBwUzeYz54M8L', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_documents-v2', '_id': 'InLEW4wBBwUzeYz54M8L', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_documents-v2', '_id': 'I3LEW4wBBwUzeYz54M8L', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_documents-v2', '_id': 'JHLEW4wBBwUzeYz54M8L', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'my_documents-v2', '_id': 'JXLEW4wBBwUzeYz54M8L', '_version': 1, 're

Let us now try to search for items in the Elasticsearch index.

In [25]:
def extract_filters(query):
    """Split a ``category:<value>`` filter out of a free-text query.

    Args:
        query: Raw search string, e.g. ``'category:hr leave policy'``.

    Returns:
        A ``(filters, text)`` tuple. ``filters`` is ``{}`` when the query has
        no ``category:`` token; otherwise it is a dict whose ``'filter'`` list
        holds one ``term`` clause on ``category.keyword`` for the first
        category value found. ``text`` is the query with every ``category:``
        token removed and surrounding whitespace stripped.
    """
    filter_regex = r'category:([^\s]+)\s*'
    m = re.search(filter_regex, query)
    if m is None:
        # Strip here too, so a whitespace-only query comes back as '' just
        # like it does on the filter path below.
        return {}, query.strip()  # no filters

    filters = {
        'filter': [{
            'term': {
                'category.keyword': {
                    'value': m.group(1)
                }
            }
        }]
    }

    # Only the first category value is used above, but every category token
    # is removed from the remaining text.
    query = re.sub(filter_regex, '', query).strip()

    return filters, query

def handle_search(query):
    """Run a combined full-text and kNN search for ``query`` and print it.

    The query is first split by ``extract_filters`` into an optional category
    filter and the remaining text. The text drives a ``multi_match`` over
    ``name``, ``summary`` and ``content`` (or ``match_all`` when no text is
    left) and is also encoded with ``model`` for a kNN search on the
    ``embedding`` field. Both searches carry the same filter, and the request
    asks for the ``rrf`` ranker with at most five results.

    Args:
        query: Raw search string, optionally containing ``category:<value>``.

    Returns:
        None. The query, the total hit count and the raw response are printed.
    """
    filters, parsed_query = extract_filters(query)

    if parsed_query:
        search_query = {
            'must': {
                'multi_match': {
                    'query': parsed_query,
                    'fields': ['name', 'summary', 'content'],
                }
            }
        }
    else:
        search_query = {
            'must': {
                'match_all': {}
            }
        }

    # NOTE(review): when the query holds only a category filter, parsed_query
    # is '' and the kNN vector is the embedding of an empty string; confirm
    # that this is the intended behaviour.
    results = search(
        query={
            'bool': {
                **search_query,
                **filters
            }
        },
        knn={
            'field': 'embedding',
            'query_vector': model.encode(parsed_query),
            'k': 10,
            'num_candidates': 50,
            **filters,
        },
        rank={
            'rrf': {}
        },
        size=5,
    )

    total = results['hits']['total']['value']

    print('Searching for Below Query: ')
    print(query)
    # Report the real hit count rather than claiming hits unconditionally,
    # and announce the results before printing them.
    print(f'We got {total} hits in the database!')
    print('Results are given below...')
    pprint(results)



In [26]:
# Example: a plain-text query with no `category:` filter token.
query = 'Working from home'

handle_search(query=query)


Searching for Below Query: 
Working from home
We got hits in the database!
ObjectApiResponse({'took': 6, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 13, 'relation': 'eq'}, 'max_score': None, 'hits': [{'_index': 'my_documents-v2', '_id': 'IXLEW4wBBwUzeYz54M8L', '_score': None, '_rank': 1, '_ignored': ['content.keyword', 'summary.keyword'], '_source': {'content': "Effective: March 2020\nPurpose\n\nThe purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\nScope\n\nThis policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\n