# Prepare environment

## Init base libs

* Install libs

In [None]:
%%capture
%pip install \
    python-dotenv \
    pyyaml

In [None]:
import os
from dotenv import load_dotenv

# Run python-dotenv's loader with its default arguments before any other cell;
# presumably this pulls settings from a local .env file into the environment
# (TODO confirm which variables later cells rely on).
load_dotenv()

In [None]:
import yaml

def read_yaml_file(file_path):
    """
    Reads a YAML file and returns its parsed content.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the locale encoding of the machine running the notebook.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file content.
    """
    # Without an explicit encoding, open() uses the platform default, which
    # is not UTF-8 everywhere and would garble non-ASCII values.
    with open(file_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)
    
# config = read_yaml_file("./config.yml")
# config["cloud_id"], api_key=config["api_key"]

In [None]:
import json

def read_json_file(file_path):
    """
    Reads and loads a JSON file.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the locale encoding of the machine running the notebook.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file content.
    """
    # Without an explicit encoding, open() uses the platform default, which
    # is not UTF-8 everywhere and would garble non-ASCII values.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

## Init Elasticsearch

In [None]:
%%capture
%pip install \
    elasticsearch

In [None]:
import elasticsearch
from elasticsearch import Elasticsearch, helpers

def get_client_es(hosts: str = "http://elasticsearch:9200", max_retries: int = 5):
    """
    Initializes an Elasticsearch client for the given host URL.

    Args:
        hosts: URL of the Elasticsearch node to connect to.
        max_retries: Retry limit applied to the returned client through
            ``Elasticsearch.options``.

    Returns:
        The client object returned by ``es.options(max_retries=max_retries)``.
    """
    es = Elasticsearch(hosts=hosts)
    # Per-client transport settings are applied via options(); its result,
    # not the client constructed above, is what callers receive.
    return es.options(max_retries=max_retries)

# Build a client with the default host and retry settings and keep it in the
# notebook-level name `es`.
es = get_client_es()
es.info() # should return cluster info

In [None]:
# https://www.elastic.co/search-labs/tutorials/search-tutorial/full-text-search/create-index
def create_index(index_name: str, mappings):
    """
    Creates an index unless one with the same name already exists.

    Args:
        index_name: Name of the index to create or look up.
        mappings: Request body passed unchanged to ``indices.create``.

    Returns:
        The ``indices.create`` response for a new index, otherwise the
        ``indices.get`` response for the existing one.

    Raises:
        RuntimeError: If the create response status is not 200.
    """
    indices = get_client_es().indices

    # Existing index: report it and hand back its current definition.
    if indices.exists(index=index_name):
        print(f"Index '{index_name}' already exists.")
        return indices.get(index=index_name)

    created = indices.create(index=index_name, body=mappings)
    if created.meta.status != 200:
        raise RuntimeError("failed to create index")
    print(f"Index '{index_name}' created successfully.")
    return created

In [None]:
def count_index(index_name: str) -> int:
    """
    Returns the number of documents in an index.

    Args:
        index_name: Name of the index to count.

    Returns:
        The document count reported by Elasticsearch, as an ``int``.
    """
    es = get_client_es()
    # Use the count API: the cat APIs are documented as intended for human
    # consumption and return the count as a string inside a one-row table.
    return int(es.count(index=index_name)["count"])

In [None]:
def search_index(index_name: str, body):
    """
    Runs a search request against an index.

    Args:
        index_name: Name of the index to search.
        body: Request body passed unchanged to ``search``.

    Returns:
        The response object returned by the client's ``search`` call.
    """
    return get_client_es().search(index=index_name, body=body)

## Init Transformers

In [None]:
%%capture
%pip install \
    transformers \
    sentence-transformers

### Semantic Search

#### Sentence transformer

https://sbert.net/docs/quickstart.html

In [None]:
from sentence_transformers import SentenceTransformer

import functools


@functools.lru_cache(maxsize=4)
def _load_sentence_transformer(model_name, device_type):
    """
    Loads a Sentence Transformer model once per (model_name, device_type) pair.
    """
    return SentenceTransformer(model_name, device=device_type)


def get_text_vector(sentences, model_name="sentence-transformers/all-MiniLM-L6-v2", device_type="cpu"):
    """
    Generates sentence embeddings using a pre-trained model.

    Args:
        sentences: Input passed unchanged to ``model.encode``; the notebook
            calls this with a list of strings.
        model_name: Name of the pretrained Sentence Transformer model.
        device_type: Value passed as ``device`` when the model is loaded.

    Returns:
        Whatever ``model.encode`` returns for ``sentences``.
    """
    # 1. Load the pretrained model. Loading is cached so repeated calls
    #    with the same model and device do not re-instantiate it.
    model = _load_sentence_transformer(model_name, device_type)
    # 2. Calculate embeddings by calling model.encode()
    return model.encode(sentences)

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]
# Smoke test: embed the sample sentences with the default model and device,
# then print the shape of the result.
embeddings = get_text_vector(sentences)
print(embeddings.shape)

## Loading dataset

https://huggingface.co/docs/datasets/en/quickstart

In [None]:
%%capture
%pip install \
    datasets

In [None]:
from datasets import load_dataset

# https://huggingface.co/datasets/quora
# Load the "quora" dataset by name; no split or configuration is passed.
dataset = load_dataset('quora')
# Last expression of the cell, so the notebook displays the loaded object.
dataset

In [None]:
# load_dataset() without a `split` argument returns a dict of splits, which
# has no "questions" key; pick the "train" split in that case. A dataset that
# was already loaded as a single split is used as is.
quora_split = dataset["train"] if isinstance(dataset, dict) else dataset

# Each record holds a list of question texts under "text"; flatten them all.
questions = []
for record in quora_split["questions"]:
    questions.extend(record["text"])

# dict.fromkeys() drops duplicates while keeping first-seen order, so the
# list (and the preview below) is the same on every run; set() order is not.
questions = list(dict.fromkeys(questions))
print('\n'.join(questions[:3]))
print(len(questions))

# Semantic Search

## Sentence transformer

https://sbert.net/docs/quickstart.html

* max_seq_length : maximum number of tokens encoded into a single embedding vector; any input beyond this limit is truncated
* word_embedding_dimension : dimensionality of the embedding vector
* Normalize : the final step of the model normalizes the embedding vector

In [None]:
# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]
# Embed the sample sentences with the default model and device.
embeddings = get_text_vector(sentences)
print(embeddings.shape)
# Expected output: (3, 384) -- one 384-dimensional vector per sentence

# => Calculate the embedding similarities
# NOTE: `model` is not defined at notebook level (get_text_vector keeps the
# model local), so a SentenceTransformer must be loaded before enabling this.
# similarities = model.similarity(embeddings, embeddings)
# print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])