## Encode legal passages and create embeddings index

##### Prerequisites

In [2]:
%%capture 

!pip install PyYAML

#### Imports

In [4]:
from requests.auth import HTTPBasicAuth
from tqdm import tqdm
import requests
import logging 
import boto3
import yaml
import json
import os

##### Setup logging

In [5]:
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [6]:
logger.info(f'Using requests=={requests.__version__}')
logger.info(f'Using pyyaml=={yaml.__version__}')

Using requests==2.31.0
Using pyyaml==6.0


#### Setup essentials

In [7]:
TEXT_EMBEDDING_MODEL_ENDPOINT_NAME = 'huggingface-textembedding-gpt-j-6b-fp16-1691075868'
CHUNKS_DIR_PATH = './data/chunks'

sagemaker_client = boto3.client('runtime.sagemaker')

In [8]:
with open('config.yml', 'r') as file:
    config = yaml.safe_load(file)

es_username = config['credentials']['username']
es_password = config['credentials']['password']

domain_endpoint = config['domain']['endpoint']
domain_index = config['domain']['index']

In [9]:
URL = f'{domain_endpoint}/{domain_index}'
logger.info(f'URL for OpenSearch index = {URL}')

URL for OpenSearch index = https://search-sematic-search-4vgtrb5lpgqsss26pxewnosnjy.eu-west-1.es.amazonaws.com/legal-passages


#### Define the index mapping with a k-NN vector field

In [10]:
mapping = {
    'settings': {
        'index': {
            'knn': True  # Enable k-NN search for this index
        }
    },
    'mappings': {
        'properties': {
            'embedding': {  # k-NN vector field
                'type': 'knn_vector',
                'dimension': 4096  # Dimension of the vector
            },
            'passage_id': {
                'type': 'long'
            },
            'passage': {
                'type': 'text'
            },
            'doc_id': {
                'type': 'keyword'
            }
        }
    }
}

#### Create the index with the specified mapping

In [None]:
# Check if the index exists using an HTTP HEAD request
response = requests.head(URL, auth=HTTPBasicAuth(es_username, es_password))

# If the index does not exist (status code 404), create the index
if response.status_code == 404:
    response = requests.put(URL, auth=HTTPBasicAuth(es_username, es_password), json=mapping)
    logger.info(f'Index created: {response.text}')
else:
    logger.error('Index already exists!' + str(response.status_code))

#### Encode passages (chunks) using JumpStart's GPT-J text embedding model and ingest to OpenSearch

In [11]:
def chunk_iterator(dir_path: str):
    for root, _, filenames in os.walk(dir_path):
        for filename in filenames:
            file_path = os.path.join(root, filename)
            if os.path.isfile(file_path):
                with open(file_path, 'r') as file:
                    file_contents = file.read()
                    yield filename, file_contents

In [12]:
%%time

i = 1
os_bulk_documents_and_index = ''
os_bulk_documents = []; 
os_doc_id = []; 
for chunk_name, chunk in tqdm(chunk_iterator(CHUNKS_DIR_PATH)):
    doc_id, chunk_id = chunk_name.split('_')
    payload = {'text_inputs': [chunk]}
    payload = json.dumps(payload).encode('utf-8')
    
    response = sagemaker_client.invoke_endpoint(EndpointName=TEXT_EMBEDDING_MODEL_ENDPOINT_NAME, 
                                                ContentType='application/json',  
                                                Body=payload)
    
    model_predictions = json.loads(response['Body'].read())
    embedding = model_predictions['embedding'][0]
    
    #embedding ='test'
   
    document = { 
        'doc_id': doc_id, 
        'passage_id': chunk_id,
        'passage': chunk, 
        'embedding': embedding}
    index= { "index": { "_id": i} }
    
    #For bulk insert
    os_doc_id.append(index);
    os_bulk_documents.append(document);
    os_bulk_documents_and_index = os_bulk_documents_and_index + json.dumps(index)+'\\n'+ json.dumps(document) + '\\n'
    i += 1
    #bulk end
    
    '''response = requests.post(f'{URL}/_doc/{i}', auth=HTTPBasicAuth(es_username, es_password), json=document)
    i += 1
    
    if response.status_code not in [200, 201]:
        logger.error(response.status_code)
        logger.error(response.text)
        break'''

404it [01:53,  3.57it/s]

CPU times: user 5.69 s, sys: 4.61 s, total: 10.3 s
Wall time: 1min 53s





In [34]:
os_bulk_documents_and_index1 = ''
i = 1
for doc in os_bulk_documents:
    index= { "index": { "_id": i} }
    os_bulk_documents_and_index1 = os_bulk_documents_and_index1 + f'{json.dumps(index)}\n{json.dumps(doc)}\n'
    i += 1
    

In [35]:
#print (os_bulk_documents_and_index)
#print (os_bulk_documents)
#print (json.dumps(os_doc_id));
length = len(os_bulk_documents_and_index1)
#substring = os_bulk_documents_and_index[length-10000:length]
substring = os_bulk_documents_and_index1[0:1000]
print (substring)


{"index": {"_id": 1}}
{"doc_id": "005", "passage_id": "57", "passage": " rely upon the European Convention on Human Rights.\nThis will mean that the Commission can bring test cases without the need for a victim to do so personally.\nI reject the arguments that the Commission is obliged to identify a victim and that it must demonstrate that an unlawful act has actually taken place before it may bring proceedings to challenge the compatibility of legislation with ECHR.\nHRA contemplates two distinct and complementary mechanisms for the protection of Convention rights challenges to legislation under sections 3 5 of the Act and challenges to the acts of public authorities under sections 6 9 per Lord Rodger in Wilson v First County Trust Ltd (No 2) [2004] 1 AC 816, para 206.\nThe title to sections 3 5 of the Act is legislation, and to sections 6 9 public authorities.\nThere is every reason to conclude that the availability of two different species of challenge was in the contemplation of th

In [36]:
#Bulk insert docs in openSearch Index
#pecifying the index in the path means you don’t need to include it in the request body.


response = requests.post(f'{URL}/_bulk', auth=HTTPBasicAuth(es_username, es_password), data=os_bulk_documents_and_index1, headers={'Content-Type': 'application/x-ndjson'})
 
if response.status_code not in [200, 201]:
    logger.error(response.status_code)
    logger.error(response.text)
   

In [39]:
response = requests.get(f'{URL}/_doc/1', auth=HTTPBasicAuth(es_username, es_password))

print (response.text)

{"_index":"legal-passages","_id":"1","_version":1,"_seq_no":0,"_primary_term":1,"found":true,"_source":{"doc_id": "005", "passage_id": "57", "passage": " rely upon the European Convention on Human Rights.\nThis will mean that the Commission can bring test cases without the need for a victim to do so personally.\nI reject the arguments that the Commission is obliged to identify a victim and that it must demonstrate that an unlawful act has actually taken place before it may bring proceedings to challenge the compatibility of legislation with ECHR.\nHRA contemplates two distinct and complementary mechanisms for the protection of Convention rights challenges to legislation under sections 3 5 of the Act and challenges to the acts of public authorities under sections 6 9 per Lord Rodger in Wilson v First County Trust Ltd (No 2) [2004] 1 AC 816, para 206.\nThe title to sections 3 5 of the Act is legislation, and to sections 6 9 public authorities.\nThere is every reason to conclude that the 