In [209]:
import sys

sys.path.append("../")

import cohere
import json

from pathlib import Path
from config import Paths, APIKeys 

from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity

In [210]:
# Load the merged handbook. Later cells iterate `output_merged` by position
# and read a "text" field from every element.
handbook_merged = Path(f"{Paths.data}/handbook_merged")
full_filename = f"{handbook_merged}/handbook.json"

# Pin the encoding: without it `open` falls back to the platform default,
# which is not UTF-8 everywhere, and the handbook text contains non-ASCII
# characters (e.g. typographic apostrophes).
with open(full_filename, "r", encoding="utf-8") as f:
    output_merged = json.load(f)

In [211]:
from functools import cache

# Local sentence-transformers model. It is not referenced by any of the cells
# visible in this notebook section; the embeddings below come from `co.embed`.
# NOTE(review): device="mps" presumably requires PyTorch's Apple Metal
# backend, so this line may fail on other hardware — confirm.
model = SentenceTransformer("msmarco-MiniLM-L-12-v3", device="mps")

# Cohere client used by `get_embeddings`; the key is read from the project config.
co = cohere.Client(APIKeys.cohere)

@cache
def get_embeddings(batch: tuple):
    """Return the Cohere embeddings for a batch of texts.

    Args:
        batch: The texts to embed. It must be a tuple (hashable) because the
            function is memoised with ``functools.cache``; calling it again
            with an equal tuple returns the stored result without a new
            request.

    Returns:
        The ``embeddings`` attribute of the ``co.embed`` response. The same
        object is handed back on every cache hit, so callers should not
        mutate it.
    """
    return co.embed(texts=batch).embeddings

In [212]:
# Embed the handbook in batches: every entry whose text is longer than five
# characters is queued, and each full queue is sent to `get_embeddings`.
batch_size = 96

vectors = []  # one list of embeddings per request
vector_ids = []  # indices into `output_merged`, grouped exactly like `vectors`

batch = []
batch_ids = []

for i, elem in enumerate(output_merged):
    # progress line every 500 entries
    if i % 500 == 0:
        print(f"{i/len(output_merged) * 100: .2f}%")

    # very short texts are skipped and therefore never get a vector
    if len(elem["text"]) > 5:
        batch_ids.append(i)
        batch.append(elem["text"])

    if len(batch) < batch_size:
        continue

    # the queue is full: embed it and start a fresh one
    vectors.append(get_embeddings(tuple(batch)))
    vector_ids.append(batch_ids)
    batch, batch_ids = [], []

# embed whatever is left after the last full batch
if batch:
    vectors.append(get_embeddings(tuple(batch)))
    vector_ids.append(batch_ids)
    batch, batch_ids = [], []

 0.00%
 1.89%
 3.78%
 5.66%
 7.55%
 9.44%
 11.33%
 13.21%
 15.10%
 16.99%
 18.88%
 20.76%
 22.65%
 24.54%
 26.43%
 28.31%
 30.20%
 32.09%
 33.98%
 35.87%
 37.75%
 39.64%
 41.53%
 43.42%
 45.30%
 47.19%
 49.08%
 50.97%
 52.85%
 54.74%
 56.63%
 58.52%
 60.40%
 62.29%
 64.18%
 66.07%
 67.96%
 69.84%
 71.73%
 73.62%
 75.51%
 77.39%
 79.28%
 81.17%
 83.06%
 84.94%
 86.83%
 88.72%
 90.61%
 92.49%
 94.38%
 96.27%
 98.16%


In [213]:
# Collapse the per-batch lists into two flat, position-aligned lists:
# `flatten_vectors[k]` is the embedding of `output_merged[flatten_vectors_ids[k]]`.
flatten_vectors_ids = []
for id_group in vector_ids:
    flatten_vectors_ids.extend(id_group)

flatten_vectors = []
for embedding_group in vectors:
    flatten_vectors.extend(embedding_group)

# Both lists must have the same length, and flattening must keep the first
# and last ids in place.
assert len(flatten_vectors_ids) == len(flatten_vectors)
assert flatten_vectors_ids[0] == vector_ids[0][0]
assert flatten_vectors_ids[-1] == vector_ids[-1][-1]
# Disabled checks kept from the original cell; `.all()` only works if the
# embeddings are NumPy arrays (presumably they are plain lists here — confirm).
# assert (vectors[0][0] == flatten_vectors[0]).all()
# assert (vectors[-1][-1] == flatten_vectors[-1]).all()

In [214]:
# Pair every embedding with its handbook entry.
#
# A new dict is built per entry so `output_merged` is left untouched:
# `list.copy()` is shallow, so assigning into the copied list's elements
# would also add "vector" to the dicts held by `output_merged`.
#
# Only embedded entries are emitted, in the order of `flatten_vectors`, so
# position k here always corresponds to `flatten_vectors[k]`.
output_with_embeddings = [
    {**output_merged[i], "vector": vec}
    for i, vec in zip(flatten_vectors_ids, flatten_vectors)
]

assert output_with_embeddings, "no handbook entry was embedded"

# Sanity check: re-embedding the first stored text on its own must reproduce
# the stored vector. The text is taken from `output_with_embeddings[0]`
# rather than `output_merged[0]`, because entry 0 of the source is skipped
# (and therefore absent here) when its text is too short.
assert (
    cosine_similarity(
        [get_embeddings((output_with_embeddings[0]["text"],))[0]],
        [output_with_embeddings[0]["vector"]],
    )[0][0]
    > 0.99999
)

In [215]:
# Persist the embedded entries next to the source handbook file.
handbook_merged = Path(f"{Paths.data}/handbook_merged")
full_filename = f"{handbook_merged}/handbook_with_embeddings.json"

# An existing file at this path is overwritten.
with open(full_filename, mode="w") as fw:
    json.dump(obj=output_with_embeddings, fp=fw)

In [258]:
import numpy as np

# Embed the question with the same helper that embedded the handbook, then
# score it against every stored vector.
query_text = "How many vacation days do I get?"
embedding = get_embeddings(tuple([query_text]))[0]

# `scores[k]` is the cosine similarity between the query and `flatten_vectors[k]`.
scores = cosine_similarity([embedding], flatten_vectors)[0]

# Positions of the 30 best-scoring vectors, best first.
top_scores_ids = np.argsort(scores)[::-1][:30]

In [262]:
from sentence_transformers import CrossEncoder

# Re-rank the retrieved candidates with a cross-encoder that scores each
# (query, passage) pair.
model_ce = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2", device="mps")

# (text, title) of every candidate, in the order of `top_scores_ids`
# (descending cosine similarity).
corpus = [
    (output_with_embeddings[i]["text"], output_with_embeddings[i]["title"])
    for i in top_scores_ids
]
sentence_combinations = [[query_text, text] for text, _title in corpus]

similarity_scores = model_ce.predict(sentence_combinations)

# Indices into `corpus`, best cross-encoder score first. This is kept as an
# array instead of `reversed(...)`: `reversed` returns a one-shot iterator,
# so a cell that consumes part of it and is then re-run would continue from
# where it stopped (or print nothing) instead of starting at the best match.
sim_scores_argsort = np.argsort(similarity_scores)[::-1]

In [263]:
# Show the five best re-ranked passages: rank, title, cross-encoder score,
# then the passage text. `zip` stops after rank 5 without pulling a sixth
# index from `sim_scores_argsort`.
for i, idx in zip(range(1, 6), sim_scores_argsort):
    text, title = corpus[idx]
    print(f"{i}. {title} ({similarity_scores[idx]:.2f})")
    print(text)

1. "GitLab LTD (UK) Benefits" (5.58)
Statutory Vacation Leave
Team members are entitled to at least 28 vacation days which consist of 20 days of standard annual leave plus 8 public holidays. These days will accrue from the team member’s start date.

Entitlement to vacation leave is pro rata for any part-time team members based on hours worked.
It is important for UK team members to utilize time off for public holidays as the total amount of annual leave is accrued here inclusive of public holidays.
Team members must designate any vacation time taken as Vacation in PTO by Deel to ensure that vacation entitlement is properly tracked.
Under normal circumstances, there is no carryover for unused vacation days, with exceptions where a team member cannot take annual leave due to sickness or maternity leave; however, given the situation surrounding COVID-19, the UK government has relaxed measures on the carry over of annual leave for the following two calendar years: 2020-2021 and 2021-2022.


In [232]:
# Print score and title for every retrieved candidate — this walks all of
# `top_scores_ids`, not just the first ten.
for i in top_scores_ids:
    print(
        f"Score: {scores[i]}",
        f"Title: {output_with_embeddings[i]['title']}",
        # f"{output_with_embeddings[i]['text']}",
        "---",
        sep="\n",
    )

Score: 0.5644156453540482
Title: Global Benefits Survey
---
Score: 0.5369252081613023
Title: Global Benefits Survey
---
Score: 0.5344941143131154
Title: "Equity Compensation"
---
Score: 0.5307654908566717
Title: Global Benefits Survey
---
Score: 0.5271772157015868
Title: "Interviewing at GitLab - Customer Support"
---
Score: 0.52033062071507
Title: "Dev Career Framework"
---
Score: 0.5195088159842348
Title: General & Entity Specific Benefits & Information
---
Score: 0.5090301117623416
Title: "Hiring"
---
Score: 0.5045339882940838
Title: "Engineering Career Development"
---
Score: 0.5009925827224038
Title: General & Entity Specific Benefits & Information
---
Score: 0.49952712364937935
Title: None
---
Score: 0.49312540109098213
Title: Manage Stage
---
Score: 0.49196924048525636
Title: Global Benefits Survey
---
Score: 0.4908798249387107
Title: Global Benefits Survey
---
Score: 0.48488581962097976
Title: "Engineering IC Leadership"
---
Score: 0.4846556904654701
Title: Manager Notes
---
Sc