In [2]:
# ! pip install openai langchain unstructured pdf2image  tabulate libmagic num2words tiktoken

In [4]:
# ! pip install opensearch-py boto3

In [6]:
# ! pip install -U sentence-transformers

In [16]:
import os

# RAW_FOLDER is not referenced by any cell shown here; presumably the
# raw-data directory — confirm before relying on it.
RAW_FOLDER = "rawdata/bg"
# Name of the sentence-transformers model used to embed documents and queries.
MODEL_NAME = 'all-MiniLM-L6-v2'

# OpenSearch connection settings. Each value can be overridden through an
# environment variable so real credentials never need to be saved inside the
# notebook; the fallbacks are the local-demo values.
os_host = os.environ.get("OPENSEARCH_HOST", "localhost")
os_port = int(os.environ.get("OPENSEARCH_PORT", "9200"))
os_auth = (
    os.environ.get("OPENSEARCH_USER", "admin"),
    os.environ.get("OPENSEARCH_PASSWORD", "admin"),
)


In [2]:
from sentence_transformers import SentenceTransformer
# Load the embedding model named by MODEL_NAME; it is reused for both the
# documents and the user query.
model = SentenceTransformer(MODEL_NAME)

In [3]:
import re
def custom_normalize_text(s, sep_token = " \n "):
    """Collapse whitespace and tidy stray punctuation before embedding.

    Args:
        s: Text to clean.
        sep_token: Unused; kept so callers that pass it keep working.

    Returns:
        The cleaned text on a single line, without leading or trailing
        whitespace.
    """
    # Fold every run of whitespace (spaces, tabs, newlines) into one space.
    s = re.sub(r"\s+", " ", s).strip()
    # Drop a literal ". ," sequence. The dot is escaped on purpose: unescaped
    # it matches any character and would eat the letter in front of " ,".
    s = re.sub(r"\. ,", "", s)
    # Collapse doubled full stops, with or without a space between them.
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    # No newline can survive the whitespace fold above, so no separate
    # newline removal is needed; only trim what the removals left behind.
    return s.strip()

In [4]:
# Sample corpus: each record carries its text under the "input" key.
docs = [
    {
        "input": "The AI Ethics Conference 2024 brought together leading experts, policymakers, and practitioners to discuss the ethical implications of artificial intelligence and machine learning technologies. Topics included bias mitigation, transparency, accountability, and the societal impacts of AI.",
    },
    {
        "input": "The AI in Healthcare Summit showcased the latest advancements in artificial intelligence and its applications in the healthcare industry. Speakers presented innovative AI-driven solutions for disease diagnosis, personalized treatment plans, medical imaging analysis, and patient care optimization.",
    },
    {
        "input": "The AI Startup Pitch Competition provided a platform for emerging AI startups to showcase their innovative products and ideas to a panel of investors and industry experts. Participants presented AI-powered solutions spanning various domains, including finance, e-commerce, cybersecurity, and smart cities.",
    },
]


In [5]:
# Sanity check: number of sample records.
len(docs)


3

In [6]:
# Display the raw records before normalisation.
docs

[{'input': 'The AI Ethics Conference 2024 brought together leading experts, policymakers, and practitioners to discuss the ethical implications of artificial intelligence and machine learning technologies. Topics included bias mitigation, transparency, accountability, and the societal impacts of AI.'},
 {'input': 'The AI in Healthcare Summit showcased the latest advancements in artificial intelligence and its applications in the healthcare industry. Speakers presented innovative AI-driven solutions for disease diagnosis, personalized treatment plans, medical imaging analysis, and patient care optimization.'},
 {'input': 'The AI Startup Pitch Competition provided a platform for emerging AI startups to showcase their innovative products and ideas to a panel of investors and industry experts. Participants presented AI-powered solutions spanning various domains, including finance, e-commerce, cybersecurity, and smart cities.'}]

In [7]:
# Clean each record's text and keep its position as the document id,
# echoing every raw record along the way.
documents, ids = [], []

for index, doc in enumerate(docs):
    print(doc)
    documents.append(custom_normalize_text(doc["input"]))
    ids.append(index)

print("documents:", documents)

# Encode all cleaned texts in a single call; rows follow the order of `documents`.
document_embeddings = model.encode(documents)

{'input': 'The AI Ethics Conference 2024 brought together leading experts, policymakers, and practitioners to discuss the ethical implications of artificial intelligence and machine learning technologies. Topics included bias mitigation, transparency, accountability, and the societal impacts of AI.'}
{'input': 'The AI in Healthcare Summit showcased the latest advancements in artificial intelligence and its applications in the healthcare industry. Speakers presented innovative AI-driven solutions for disease diagnosis, personalized treatment plans, medical imaging analysis, and patient care optimization.'}
{'input': 'The AI Startup Pitch Competition provided a platform for emerging AI startups to showcase their innovative products and ideas to a panel of investors and industry experts. Participants presented AI-powered solutions spanning various domains, including finance, e-commerce, cybersecurity, and smart cities.'}
documents: ['The AI Ethics Conference 2024 brought together leading 

In [8]:
# Inspect the encoded vectors returned by model.encode.
document_embeddings

array([[-0.04231712,  0.04941342,  0.00344948, ...,  0.05364861,
        -0.0328605 , -0.08284619],
       [-0.02934069, -0.00590096,  0.00774114, ...,  0.03387276,
        -0.01205208, -0.05009643],
       [-0.02112303, -0.03839754,  0.02066777, ..., -0.0071109 ,
         0.03650481,  0.02511657]], dtype=float32)

In [9]:
# Number of embeddings, then the length of the first embedding vector.
print(len(document_embeddings))
print(len(document_embeddings[0]))

3
384


In [10]:
# Vector width measured from the first encoded document rather than assumed.
DERIVED_EMBEDDING_SIZE = len(document_embeddings[0])
DERIVED_EMBEDDING_SIZE

384

In [19]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

# Client for the cluster described by os_host / os_port / os_auth.
# Every TLS-related switch is turned off.
os_client = OpenSearch(
    hosts=[dict(host=os_host, port=os_port)],
    http_auth=os_auth,
    http_compress=True,  # enables gzip compression for request bodies
    use_ssl=False,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)
 

In [20]:
# Fetch cluster metadata; a successful response confirms the connection works.
os_client.info()

{'name': 'opensearch-node1',
 'cluster_name': 'opensearch-cluster',
 'cluster_uuid': 'cJDQkhh2ROSIsK0-J6m1DA',
 'version': {'distribution': 'opensearch',
  'number': '2.13.0',
  'build_type': 'tar',
  'build_hash': '7ec678d1b7c87d6e779fdef94e33623e1f1e2647',
  'build_date': '2024-03-26T00:02:39.659767978Z',
  'build_snapshot': False,
  'lucene_version': '9.10.0',
  'minimum_wire_compatibility_version': '7.10.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'The OpenSearch Project: https://opensearch.org/'}

In [25]:
# Disabled by default: change the condition to True to delete the "demo"
# index (for example before recreating it with a different mapping).
if False:
    index_name = "demo"
    response = os_client.indices.delete(index_name)

In [28]:
index_name = "demo"

# k-NN index definition: a single HNSW-backed vector field sized to the
# embedding model's output.
index_body = {
    "settings": {
        "index": {
            "knn": True,
            "knn.algo_param.ef_search": 100,
        }
    },
    "mappings": {
        "properties": {
            "my_vector1": {
                "type": "knn_vector",
                # Taken from the embeddings computed earlier instead of a
                # literal 384, so the mapping follows MODEL_NAME if it changes.
                "dimension": DERIVED_EMBEDDING_SIZE,
                "method": {
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "nmslib",
                    "parameters": {
                        "ef_construction": 128,
                        "m": 24,
                    },
                },
            }
        }
    },
}

# Creating an index that already exists is rejected by the server; when
# re-running, enable the guarded delete cell first.
response = os_client.indices.create(index=index_name, body=index_body)
response

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'demo'}

In [30]:
# Store each text with its embedding, using the position as the document id.
# refresh=True makes each document searchable as soon as the call returns.
for index, item in enumerate(document_embeddings):
    response = os_client.index(
        index=index_name,
        id=str(index),
        body={"id": index, "text": documents[index], "my_vector1": item},
        refresh=True,
    )

In [31]:
# Show the response for the last document written by the indexing loop.
response

{'_index': 'demo',
 '_id': '2',
 '_version': 1,
 'result': 'created',
 'forced_refresh': True,
 '_shards': {'total': 2, 'successful': 2, 'failed': 0},
 '_seq_no': 2,
 '_primary_term': 1}

In [32]:
def get_embedding_for_user_query(user_query):
    """Return the embedding vector for a single free-text query.

    The query is cleaned with ``custom_normalize_text`` (the same clean-up
    applied to the indexed documents) and then encoded by the module-level
    ``model`` as a one-element batch.
    """
    cleaned_query = custom_normalize_text(user_query)
    encoded_batch = model.encode([cleaned_query])
    return encoded_batch[0]


user_query_embedding = get_embedding_for_user_query(
    "When is the AI Ethics Conference 2024 taking place?"
)


In [47]:
# Nearest-neighbour lookup: ask for the single stored vector closest to the
# query embedding.
query = {
    "size": 1,
    "query": {
        "knn": {
            "my_vector1": {
                "vector": user_query_embedding,
                "k": 1,
            }
        }
    },
}

response = os_client.search(body=query, index=index_name)
hits = response["hits"]["hits"]
print("max score =", response["hits"]["max_score"])
if hits:
    print(hits[0]["_source"]["text"])
else:
    # A search can legitimately return no hits (e.g. nothing indexed yet);
    # report that instead of failing with IndexError on hits[0].
    print("No matching document found.")

max score = 0.6991057
The AI Ethics Conference 2024 brought together leading experts, policymakers, and practitioners to discuss the ethical implications of artificial intelligence and machine learning technologies. Topics included bias mitigation, transparency, accountability, and the societal impacts of AI.
