In [1]:
import sys

sys.path.append("../")

import cohere
import json

import numpy as np

from functools import cache
from pathlib import Path
from config import Paths, CohereConfig 

from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import CrossEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the merged handbook records from <data>/handbook_merged/handbook.json.
# Later cells treat `output_merged` as a list of dicts with a "text" key.
handbook_merged = Path(f"{Paths.data}/handbook_merged")
full_filename = handbook_merged / "handbook.json"

# JSON text is UTF-8; pin the encoding so the read does not depend on the
# platform's locale default (the handbook contains non-ASCII characters such
# as typographic apostrophes).
with open(full_filename, "r", encoding="utf-8") as f:
    output_merged = json.load(f)

In [3]:
# Cohere client shared by every embedding call; the key comes from the
# project configuration.
co = cohere.Client(CohereConfig.api_key)

@cache
def get_embeddings(batch: tuple):
    """Embed a batch of texts through the Cohere client.

    Args:
        batch: Texts to embed. It has to be a tuple (hashable) because the
            function is memoised with ``functools.cache``; asking for the
            same batch again returns the stored result without calling
            ``co.embed`` a second time.

    Returns:
        The ``embeddings`` attribute of the Cohere response (presumably one
        embedding per input text, in input order -- confirm against the SDK).
    """
    return co.embed(texts=batch).embeddings

In [4]:
# Embed every sufficiently long record, sending the texts to the API in
# chunks of at most `batch_size`. `vectors[k]` holds the embeddings returned
# for chunk k and `vector_ids[k]` the matching indices into `output_merged`.
batch_size = 96
vectors, vector_ids = [], []
batch, batch_ids = [], []

for i, elem in enumerate(output_merged):
    # Coarse progress report every 500 records.
    if not i % 500:
        print(f"{i/len(output_merged) * 100: .2f}%")

    text = elem["text"]
    if len(text) > 5:  # texts of 5 characters or fewer are not embedded
        batch_ids.append(i)
        batch.append(text)

    if len(batch) < batch_size:
        continue

    # The chunk is full: embed it and start collecting the next one.
    vectors.append(get_embeddings(tuple(batch)))
    vector_ids.append(batch_ids)
    batch, batch_ids = [], []

# Embed whatever is left over after the last full chunk.
if batch:
    vectors.append(get_embeddings(tuple(batch)))
    vector_ids.append(batch_ids)
    batch, batch_ids = [], []

 0.00%
 1.83%
 3.66%
 5.49%
 7.33%
 9.16%
 10.99%
 12.82%
 14.65%
 16.48%
 18.32%
 20.15%
 21.98%
 23.81%
 25.64%
 27.47%
 29.30%
 31.14%
 32.97%
 34.80%
 36.63%
 38.46%
 40.29%
 42.12%
 43.96%
 45.79%
 47.62%
 49.45%
 51.28%
 53.11%
 54.95%
 56.78%
 58.61%
 60.44%
 62.27%
 64.10%
 65.93%
 67.77%
 69.60%
 71.43%
 73.26%
 75.09%
 76.92%
 78.75%
 80.59%
 82.42%
 84.25%
 86.08%
 87.91%
 89.74%
 91.58%
 93.41%
 95.24%
 97.07%
 98.90%


In [6]:
# Collapse the per-chunk lists into two flat lists that are meant to be
# index-aligned: flatten_vectors[k] belongs to record flatten_vectors_ids[k].
flatten_vectors_ids = []
for chunk_ids in vector_ids:
    flatten_vectors_ids.extend(chunk_ids)

flatten_vectors = []
for chunk_vectors in vectors:
    flatten_vectors.extend(chunk_vectors)

# Sanity checks: equal lengths, and the first/last ids survived flattening.
assert len(flatten_vectors_ids) == len(flatten_vectors)
assert flatten_vectors_ids[0] == vector_ids[0][0]
assert flatten_vectors_ids[-1] == vector_ids[-1][-1]
# Element-wise vector checks, left disabled as in the original cell:
# assert (vectors[0][0] == flatten_vectors[0]).all()
# assert (vectors[-1][-1] == flatten_vectors[-1]).all()

In [7]:
# Display both counts side by side; they should be equal.
tuple(len(seq) for seq in (flatten_vectors_ids, flatten_vectors))

(26384, 26384)

In [8]:
# Attach each embedding to a *new* record dict instead of writing into the
# loaded ones. `output_merged.copy()` is only a shallow copy: its dicts are
# shared with `output_merged`, so adding "vector" through it mutates the
# loaded data and leaves stale vectors behind when the notebook is re-run.
# Records that were never embedded are simply not included. The result is
# index-aligned with `flatten_vectors`, which the search cells rely on.
output_with_embeddings = [
    {**output_merged[i], "vector": vec}
    for i, vec in zip(flatten_vectors_ids, flatten_vectors)
]

assert output_with_embeddings, "no records were embedded"

# Consistency check: embedding the first *embedded* record on its own must
# reproduce the vector stored for it. The record is looked up through
# `flatten_vectors_ids[0]` because record 0 of `output_merged` may have been
# skipped by the length filter, in which case it would not correspond to
# `output_with_embeddings[0]`.
first_embedded_id = flatten_vectors_ids[0]
assert (
    cosine_similarity(
        [get_embeddings((output_merged[first_embedded_id]["text"],))[0]],
        [output_with_embeddings[0]["vector"]],
    )[0][0]
    > 0.99999
)

In [9]:
# Persist the embedded records as
# <data>/handbook_merged/handbook_with_embeddings.json.
handbook_merged = Path(f"{Paths.data}/handbook_merged")
full_filename = str(handbook_merged) + "/handbook_with_embeddings.json"

with open(full_filename, mode="w") as fw:
    json.dump(output_with_embeddings, fp=fw)

In [None]:
# Embed the query through the same helper that embedded the corpus.
query_text = "How many vacation days do I get?"
query_embeddings = get_embeddings((query_text,))
embedding = query_embeddings[0]

# Single row of cosine similarities: the query against every stored vector.
similarity_matrix = cosine_similarity([embedding], flatten_vectors)
scores = similarity_matrix[0]

# Positions (into `flatten_vectors`) of the 30 best matches, best first.
top_scores_ids = np.argsort(scores)[::-1][:30]

In [None]:
# Re-rank the candidates from the embedding search with a cross-encoder.
# NOTE(review): device="mps" presumably targets PyTorch's Apple-silicon
# backend; adjust it when running on other hardware.
model_ce = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2", device="mps")

# (text, title) of each candidate. `top_scores_ids` are positions in
# `flatten_vectors`, which is index-aligned with `output_with_embeddings`.
corpus = [
    (output_with_embeddings[i]["text"], output_with_embeddings[i]["title"])
    for i in top_scores_ids
]
sentence_combinations = [[query_text, text] for text, _title in corpus]

similarity_scores = model_ce.predict(sentence_combinations)

# Positions into `corpus`, highest cross-encoder score first. This is kept as
# an array rather than `reversed(...)`: a reversed iterator is exhausted by
# the first loop over it, so re-running a cell that iterates the ranking
# would resume mid-way or show nothing.
sim_scores_argsort = np.argsort(similarity_scores)[::-1]

In [None]:
# Show at most the five best re-ranked passages: rank, title and
# cross-encoder score on one line, followed by the passage text.
for i, idx in zip(range(1, 6), sim_scores_argsort):
    text, title = corpus[idx]
    print(f"{i}. {title} ({similarity_scores[idx]:.2f})")
    print(f"{text}")

1. "GitLab LTD (UK) Benefits" (5.58)
Statutory Vacation Leave
Team members are entitled to at least 28 vacation days which consist of 20 days of standard annual leave plus 8 public holidays. These days will accrue from the team member’s start date.

Entitlement to vacation leave is pro rata for any part-time team members based on hours worked.
It is important for UK team members to utilize time off for public holidays as the total amount of annual leave is accrued here inclusive of public holidays.
Team members must designate any vacation time taken as Vacation in PTO by Deel to ensure that vacation entitlement is properly tracked.
Under normal circumstances, there is no carryover for unused vacation days, with exceptions where a team member cannot take annual leave due to sickness or maternity leave; however, given the situation surrounding COVID-19, the UK government has relaxed measures on the carry over of annual leave for the following two calendar years: 2020-2021 and 2021-2022.
