In [57]:
import sys

sys.path.append("../")

import cohere
import json

from pathlib import Path
from config import Paths, APIKeys 

from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [58]:
handbook_merged = Path(f"{Paths.data}/handbook_merged")
full_filename = f"{handbook_merged}/handbook.json"

with open(full_filename, "r") as f:
    output_merged = json.load(f)

In [61]:
from functools import cache

model = SentenceTransformer('msmarco-MiniLM-L-12-v3', device="mps")

@cache
def get_embeddings(batch: tuple):
    return model.encode(list(batch))

In [68]:
vectors = []
vector_ids = []

batch_size = 512
batch = []
batch_ids = []

for i, elem in enumerate(output_merged):
    if i % 500 == 0:
        print(f"{i/len(output_merged) * 100: .2f}%")

    if len(elem["text"]) > 5:
        batch_ids.append(i)
        batch.append("\n".join(elem["text"]))

    if len(batch) >= batch_size:
        vectors.append(get_embeddings(tuple(batch)))
        vector_ids.append(batch_ids)
        batch = []
        batch_ids = []
        break

if len(batch) > 0:
    vectors.append(get_embeddings(tuple(batch)))
    vector_ids.append(batch_ids)
    batch = []
    batch_ids = []

 0.00%
 1.89%


In [86]:
flatten_vectors_ids = [item for sublist in vector_ids for item in sublist]
flatten_vectors = [item for sublist in vectors for item in sublist] 

assert len(flatten_vectors) == len(flatten_vectors_ids)
assert vector_ids[0][0] == flatten_vectors_ids[0]
assert vector_ids[-1][-1] == flatten_vectors_ids[-1]
assert (vectors[0][0] == flatten_vectors[0]).all()
assert (vectors[-1][-1] == flatten_vectors[-1]).all()

In [87]:
output_with_embeddings = output_merged.copy()

for i, vec in zip(flatten_vectors_ids, flatten_vectors):
    output_with_embeddings[i]["vector"] = vec

output_with_embeddings = [elem for elem in output_with_embeddings if "vector" in elem]

In [91]:
output_with_embeddings[0]

{'title': '"Auxiliary Solution Resource: Agile"',
 'path': '/handbook/marketing/brand-and-product-marketing/product-and-solution-marketing/usecase-gtm/agile',
 'section': 'Agile Planning and Management',
 'text': '\nBy empowering teams, embracing change, and focusing on delivering value, Agile methodologies have transformed software development. Agile teams create more relevant, valuable, customer-centric products, more quickly than ever.\nDevelopment teams accelerate the delivery of value with iterative, incremental, and lean project methodologies including Scrum, Kanban, Extreme Programming (XP), and more. Large enterprises have adopted Agile at enterprise scale through a variety of frameworks, including Scaled Agile Framework (SAFe), Spotify, Large Scale Scrum (LeSS), and others. GitLab enables teams to apply Agile practices and principles to organize and manage their work, whatever their chosen methodology. These new methodologies bring new challenges.\nAgile is not a single method

In [37]:
handbook_merged = Path(f"{Paths.data}/handbook_merged")
full_filename = f"{handbook_merged}/handbook_with_embeddings.json"

with open(full_filename, "w") as fw:
    json.dump(output_with_embeddings, fw)

In [88]:
len(output_with_embeddings)

512

In [94]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

query_text = "Impact of agile on the software development process"
embedding = get_embeddings((query_text,))[0] 

scores = cosine_similarity([embedding], flatten_vectors)[0] 
top_scores_ids = np.argsort(scores)[-30:][::-1]

In [95]:
# print top 10 results
for i in top_scores_ids:
    print(f"Score: {scores[i]}")
    print(f"Title: {output_with_embeddings[i]['title']}")
    print(f"{output_with_embeddings[i]['text']}")
    print("---")

Score: 0.10807240754365921
Title: "Talent Acquisition Process Framework"


Acquisitions
Candidate Experience Specialist Responsibilities
Candidate Management Processes
Evergreen Requisitions
Executive Search Process (E-Group Only)
Hiring Manager Processes
Job Offer Process
How to Complete a Contract - CES Process
Internal Hiring Process
People Technology & Insights Processes
Req Creation Process
Req Overview Processes
Triad Process


---
Score: 0.09306518733501434
Title: "Auxiliary Solution Resource: Agile"
Manage stage:
Manage stage:
Value Stream Analytics
Insights

Plan stage:
Plan stage:
Scoped Labels


4. Role-specific interfaces
Offers a variety of interfaces aimed at role-based contexts, including Security Dashboards, Roadmaps, Value Stream Analytics, Group- and Project-level boards, and aggregation of all relevant information in the Merge Request.
Manage stage: Value Stream AnalyticsPlan stage:Roadmaps, BoardsSecure stage: Security Dashboard.


4. Role-specific interfaces
Offers