# Context

Contains everything for local BERT embeddings and their integration with Elasticsearch (ES). Still to be done (TBD):

* use the new `dense_vector` field format
* tie the vector search to the query using `search(..., knn=...)`
* optionally use a [search_template](https://www.elastic.co/docs/solutions/search/search-templates)
* create a helper function to unpack `hits.hits` from search responses

See other notebook for more : `Semantic - ElasticSearch with OpenAI.ipynb`

# Prepare environment

In [None]:
%load_ext autoreload
%autoreload 2

## Init base libs

* Install libs

In [None]:
%%capture
%pip install \
    python-dotenv \
    pyyaml

In [None]:
from libs.os import OsUtil

# Smoke-test the project's OsUtil helper: look up "PATH" (presumably the PATH
# environment variable — confirm against libs.os) and print the result.
os_util = OsUtil()
result = os_util.get_env("PATH")
print(result)

In [None]:
import yaml

def read_yaml_file(file_path):
    """
    Read a YAML file and return its parsed content.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    # Pin the encoding so the result does not depend on the platform's locale
    # default (e.g. cp1252 on Windows), which would garble non-ASCII values.
    with open(file_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)
    
# config = read_yaml_file("./config.yml")
# config["cloud_id"], api_key=config["api_key"]

In [None]:
import json

def read_json_file(file_path):
    """
    Read a JSON file and return its decoded content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object decoded by ``json.load``.
    """
    # Pin the encoding so non-ASCII content decodes the same on every platform
    # instead of depending on the locale's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

## Init ElasticSearch

In [None]:
%%capture
%pip install \
    elasticsearch==8.15.1

In [None]:
from libs.elasticsearch import ElasticsearchClient

# Create the project's Elasticsearch client wrapper and check that the cluster
# is reachable before running the cells below.
es_client = ElasticsearchClient()
print("Ping:", es_client.ping())
print("ES Info:", es_client.info()) # should return cluster info

In [None]:
def search_index(index_name: str, body):
    """
    Run a search request against an index and return the raw response.

    Args:
        index_name: Name of the index to search.
        body: Request body passed unchanged to the client's ``search`` call.

    Returns:
        The response object returned by the client's ``search`` call.
    """
    return es_client.get_client().search(index=index_name, body=body)

def search_knn_index(
    index_name: str,
    field_name: str,
    query,
    k: int = 2,
    num_candidates: int = 500,
):
    """
    Run an approximate kNN search against a vector field.

    Args:
        index_name: Name of the index to search.
        field_name: Name of the vector field to search against.
        query: Query vector (a sequence of floats).
        k: Number of nearest neighbours to return.
        num_candidates: Number of candidates considered per shard.

    Returns:
        The response object returned by the client's ``search`` call.
    """
    es = es_client.get_client()
    # The kNN clause expects the neighbour count under "k" (the original sent it
    # under "knn"). A separate local name avoids overwriting `query`.
    knn = {
        "field": field_name,
        "query_vector": query,
        "k": k,
        "num_candidates": num_candidates,
    }
    # The standalone knn_search API is deprecated; the `knn` option of the
    # regular search API replaces it. Pass `source=[...]` here to trim the hits.
    return es.search(index=index_name, knn=knn)

## Init Transformers

In [None]:
%%capture
%pip install \
    transformers \
    sentence-transformers

### Semantic Search

#### Sentence transformer

* Select any [model](https://sbert.net/docs/sentence_transformer/pretrained_models.html#original-models) by performance
* https://sbert.net/docs/quickstart.html

In [None]:
from sentence_transformers import SentenceTransformer

class Tokenizer:
    """Wrap a pretrained Sentence Transformer model to turn text into embeddings."""

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", device_type="cpu"):
        """
        Load the pretrained model.

        Args:
            model_name: Name of the Sentence Transformer model to load.
            device_type: Device passed to ``SentenceTransformer`` (e.g. "cpu").
        """
        # 1. Load a pretrained Sentence Transformer model
        self.model = SentenceTransformer(model_name, device=device_type)

    def get_text_vector(self, sentences: list):
        """
        Encode a list of sentences.

        Args:
            sentences: Sentences to encode.

        Returns:
            Whatever ``model.encode`` returns for the sentences; callers in
            this notebook use it as an array with one row per sentence.
        """
        # 2. Calculate embeddings by calling model.encode()
        return self.model.encode(sentences)

    def get_tokens(self, documents) -> list:
        """
        Encode a single document and return its embedding as a flat list.

        Args:
            documents: The text to encode; it is encoded as one sentence.

        Returns:
            The flattened embedding values as a list.
        """
        # Use this instance's already-loaded model. The original called the
        # module-level get_text_vector(), which built a new default Tokenizer
        # (reloading the model) on every call and ignored self.model.
        return list(self.get_text_vector([documents]).flatten())

def get_text_vector(sentences: list, model_name="sentence-transformers/all-MiniLM-L6-v2", device_type="cpu"):
    """
    Generate sentence embeddings using a pre-trained model.

    A new ``Tokenizer`` is built on every call, so the model is loaded each
    time; for repeated encoding create one ``Tokenizer`` and reuse it.

    Args:
        sentences: Sentences to encode.
        model_name: Name of the Sentence Transformer model to load.
        device_type: Device to load the model on (e.g. "cpu").

    Returns:
        The embeddings returned by ``Tokenizer.get_text_vector``.
    """
    # Forward the requested model and device; the original ignored both
    # arguments and always used the Tokenizer defaults.
    tokenizer = Tokenizer(model_name=model_name, device_type=device_type)
    return tokenizer.get_text_vector(sentences)

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]
# Encode all sentences in one call and print the shape of the result.
embeddings = get_text_vector(sentences)
print(embeddings.shape)

## Loading dataset from Kaggle

In [None]:
%%capture
%pip install \
    opendatasets

In [None]:
import opendatasets as od 

# Download the Kaggle job-posts dataset; the next cell reads it from the local
# `jobposts/` folder.
od.download("https://www.kaggle.com/datasets/madhab/jobposts")

In [None]:
import pandas as pd

# Load the downloaded job posts into a DataFrame and preview the first row.
df = pd.read_csv('jobposts/data job posts.csv')
df.head(1)

In [None]:
import numpy as np

# Shuffle the rows by indexing the frame with a random permutation of its row
# positions (unseeded, so the order differs between runs).
sampleDf = df.iloc[np.random.permutation(len(df))]
sampleDf

In [None]:
# Count the distinct job titles among the first 500 shuffled rows.
len(sampleDf.head(500)["Title"].unique())

In [None]:
# Build one Tokenizer (default model, CPU); the cells below reuse it.
tokenizer = Tokenizer()

# Encode a single phrase and display its embedding as a flat list.
sentence_embedding = tokenizer.get_tokens("Software Engineer")
sentence_embedding

In [None]:
# Take an explicit copy of the first five rows: adding a column to the bare
# `df.head(5)` slice triggers pandas' SettingWithCopyWarning and leaves it
# ambiguous whether `df` itself is modified.
dataset = df.head(5).copy()
# Embed each job post with the shared tokenizer (one flat list per row).
dataset['vector'] = dataset['jobpost'].apply(tokenizer.get_tokens)
dataset

In [None]:
def wrapper(sentence_embeddings):
    """
    Convert an embedding to plain Python lists.

    The input is passed through ``np.array`` and ``tolist``, so NumPy scalar
    elements come back as built-in Python numbers.
    """
    return np.array(sentence_embeddings).tolist()

# Convert every stored embedding to a plain Python list (via NumPy `tolist`).
dataset['vector'] = dataset['vector'].apply(wrapper)

In [None]:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search-api.html
# https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html

index_name = "job-posts"
# Elasticsearch stores embeddings in `dense_vector` fields. The previous mapping
# used OpenSearch-only constructs (`knn_vector`, `dimension`, `index.knn*`
# settings), which Elasticsearch does not recognise.
mapping = {
  "settings": {
    "index": {
      "number_of_shards": 20,
      "number_of_replicas": 1
    }
  },
  "mappings": {
    "properties": {
      "id": {"type": "keyword"},
      # NOTE(review): the DataFrame column is named `jobpost`; confirm which
      # field name the indexing code actually writes.
      "jobposts": {
        "type": "text"
      },
      "vector": {
        "type": "dense_vector",
        # Must match the embedding size of the model (384 in this notebook).
        "dims": 384,
        "index": True,
        "similarity": "cosine",
        # HNSW build parameters carried over from the previous mapping. There
        # is no index-level `ef_search`; the query's `num_candidates` plays
        # that role at search time.
        "index_options": {
          "type": "hnsw",
          "m": 4,
          "ef_construction": 40
        }
      }
    }
  }
}
es_client.create_index(index_name, mapping)

## Loading dataset

https://huggingface.co/docs/datasets/en/quickstart

In [None]:
%%capture
%pip install \
    datasets

In [None]:
from datasets import load_dataset

# https://huggingface.co/datasets/quora
# Rebinds `dataset`, which earlier cells used for the job-posts DataFrame.
# NOTE(review): no `split` is passed, so this presumably returns a DatasetDict
# keyed by split name rather than a single Dataset — confirm before indexing it
# by column name.
dataset = load_dataset('quora')
dataset

In [None]:
# `load_dataset` without `split=` returns a DatasetDict (a dict keyed by split
# name), so the "questions" column must be read from a split. Also accept an
# already-selected split so this cell works either way.
question_records = dataset["train"] if isinstance(dataset, dict) else dataset

questions = []

# Each record's 'text' entry is iterated and its items appended one by one.
for record in question_records['questions']:
    questions.extend(record['text'])

# De-duplicate while keeping first-seen order, so the result is reproducible
# between runs (set() ordering is not).
questions = list(dict.fromkeys(questions))
print('\n'.join(questions[:3]))
print(len(questions))

# Semantic Search

## Sentence transformer

https://sbert.net/docs/quickstart.html

* max_seq_length : maximum number of tokens encoded into a single embedding vector; anything beyond it is truncated
* word_embedding_dimension : dimensionality of the embedding vector
* Normalize : the final step, which normalizes the embedding

In [None]:
# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]
# Encode all sentences in one call and print the shape of the result.
embeddings = get_text_vector(sentences)
print(embeddings.shape)
# Expected output: [3, 384]

# => Calculate the embedding similarities
# similarities = model.similarity(embeddings, embeddings)
# print(similarities)
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])