Similarity search

In [None]:
import random
import torch
import numpy as np
import pandas as pd
# Run on the GPU when torch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the saved table of text chunks and their embeddings.
text_chunks_and_embeddings_df = pd.read_csv("text_chunks_and_embeddings_df.csv")


def _parse_embedding(serialized):
    """Parse one stored embedding string (brackets stripped, space separated) into a NumPy array."""
    return np.fromstring(serialized.strip("[]"), sep=" ")


# The CSV holds every embedding as text, so rebuild the numeric arrays.
text_chunks_and_embeddings_df["embedding"] = (
    text_chunks_and_embeddings_df["embedding"].apply(_parse_embedding)
)

# Stack the per-row arrays into one matrix and move it to `device` as float32.
embedding_matrix = np.stack(text_chunks_and_embeddings_df["embedding"].tolist(), axis=0)
embeddings = torch.tensor(embedding_matrix, dtype=torch.float32).to(device)

# One dict per row, used later to look up the text that belongs to an index.
files_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")

text_chunks_and_embeddings_df

In [None]:
# Show the dimensions of the `embeddings` tensor.
embeddings.shape

In [None]:
# Instantiate the sentence-embedding model on the selected device.
from sentence_transformers import util, SentenceTransformer

EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
embedding_model = SentenceTransformer(
    model_name_or_path=EMBEDDING_MODEL_NAME,
    device=device,
)

In [None]:
# 1. Define the query.
query = "merge request"
print(f"Query: {query}")

# 2. Embed the query with the SAME model that produced the stored embeddings.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# 3. Score the query against every stored embedding (dot product) and time it.
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(query_embedding, embeddings)[0]
end_time = timer()
print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Keep the five highest-scoring entries.
top_results_dot_product = torch.topk(input=dot_scores, k=5)
top_results_dot_product

In [None]:
# Inspect a single stored record by position.
# NOTE(review): 98 is hard-coded — presumably an index taken from the top-k
# output above; confirm it is still relevant after re-running.
files_and_chunks[98]

In [None]:
#displaying the vector search result nicer
import textwrap
def print_wrapped(text, wrap_lenght=80):
    """Print ``text`` re-flowed by ``textwrap.fill`` to a width of ``wrap_lenght`` columns."""
    print(textwrap.fill(text, width=wrap_lenght))

In [None]:
query = "merge request"
print(f"Query: '{query}'\n")
print("Results:")

# The torch.topk result holds the scores at position 0 and the matching
# indices at position 1; walk both in lockstep.
top_scores = top_results_dot_product[0]
top_indices = top_results_dot_product[1]
for score, idx in zip(top_scores, top_indices):
    print(f"Score: {score:.4f}")
    print("Text:")
    chunk_text = files_and_chunks[idx]["sentence_chunk"]
    print_wrapped(chunk_text)
    print(f"File name: {files_and_chunks[idx]['filename']}")
    print("\n")

In [None]:
from IPython.display import display, Markdown
# Read a Markdown file from disk and render it as rich notebook output.
file_path = 'content_pull_request/approving-a-pull-request-with-required-reviews.md'
with open(file_path, mode='r', encoding='utf-8') as md_file:
    markdown_text = md_file.read()

display(Markdown(markdown_text))

Vector similarity

Turning the semantic search pipeline into functions

In [None]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.Tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """Embed ``query`` with ``model`` and return the best dot-product matches.

    Args:
        query: Text to search for.
        embeddings: Stored embeddings to score the query against (one per row).
        model: Model used to embed the query; it must be the same model that
            produced ``embeddings``. Defaults to the notebook's ``embedding_model``.
        n_resources_to_return: Maximum number of matches to return. It is
            capped at the number of available embeddings, because
            ``torch.topk`` raises when ``k`` exceeds the input size.
        print_time: When True, print how long the scoring step took.

    Returns:
        The ``(scores, indices)`` pair produced by ``torch.topk`` on the
        dot-product scores.
    """
    # Imported locally so the function does not rely on a `timer` name that
    # happens to be defined by another notebook cell.
    from time import perf_counter

    query_embedding = model.encode(query, convert_to_tensor=True)

    # Dot product between the query and every stored embedding.
    start_time = perf_counter()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = perf_counter()

    if print_time:
        print(f"[INFO] Time taken to get scores on ({len(embeddings)}) embeddings: {end_time-start_time:.5f} seconds.")

    # Never ask topk for more entries than there are scores.
    k = min(n_resources_to_return, dot_scores.shape[0])
    scores, indices = torch.topk(input=dot_scores, k=k)
    return scores, indices
def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 files_and_chunks: list[dict]=files_and_chunks,
                                 n_resources_to_return: int=5):
    """Retrieve the best matches for ``query`` and print each with its score.

    For every match this prints the score, the wrapped ``sentence_chunk`` text
    and the ``filename`` of the corresponding ``files_and_chunks`` record.
    """
    top_scores, top_indices = retrieve_relevant_resources(
        query=query,
        embeddings=embeddings,
        n_resources_to_return=n_resources_to_return,
    )

    # Scores and indices come back in matching order, so pair them up.
    for score, idx in zip(top_scores, top_indices):
        print(f"Score: {score:.4f}")
        print("Text:")
        chunk_text = files_and_chunks[idx]["sentence_chunk"]
        print_wrapped(chunk_text)
        print(f"File name: {files_and_chunks[idx]['filename']}")
        print("\n")

In [None]:
# Try the search helpers on a different query.
# (retrieve_relevant_resources(query=query, embeddings=embeddings) would
# return the raw scores and indices instead of printing the matches.)
query = "merge comments"
print_top_results_and_scores(query, embeddings)

Local generative LLM

In [None]:
import torch
# Report the PyTorch build, whether CUDA is usable, and the CUDA version torch reports.
for label, value in (
    ("PyTorch Version:", torch.__version__),
    ("CUDA Available:", torch.cuda.is_available()),
    ("CUDA Version:", torch.version.cuda),
):
    print(label, value)

In [None]:
import torch
# Read the total memory of CUDA device 0 and report it rounded to whole GiB.
gpu_properties = torch.cuda.get_device_properties(0)
gpu_memory_bytes = gpu_properties.total_memory
gpu_memory_gb = round(gpu_memory_bytes / 1024**3)
print(f"Available GPU memory: {gpu_memory_gb} GB")

Load the LLM locally

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
#quantization config
from transformers import BitsAndBytesConfig
# Load the weights in 4-bit, using float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# "sdpa" = scaled dot product attention.
attn_implementation = "sdpa"

# The model is read from a local directory (logging in to the Hugging Face CLI
# was not possible).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
model_directory = "C:/Users/Márk/google_gemma"

# Tokenizer that belongs to the model.
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Collect the loading options first, then instantiate the LLM.
llm_load_options = {
    "torch_dtype": torch.float16,
    "quantization_config": quantization_config if quantization_config else None,
    "low_cpu_mem_usage": False,
    "attn_implementation": attn_implementation,
}
llm_model = AutoModelForCausalLM.from_pretrained(model_directory, **llm_load_options)

# Only an unquantized model is moved to the GPU explicitly.
if not quantization_config:
    llm_model.to("cuda")

In [None]:
# Display the loaded model object.
llm_model

In [None]:
#get the number of parameters of the model gemma-2b-it
def get_model_num_params(model: torch.nn.Module):
    """Return the total number of elements across all parameters of ``model``."""
    total = 0
    for parameter in model.parameters():
        total += parameter.numel()
    return total
# Show the parameter count of the loaded LLM.
get_model_num_params(llm_model)

Generate text with the LLM

In [None]:
input_text = "How to comment on a pull request?"
print(f"Input text:\n{input_text}")

# A conversation consisting of a single user turn.
user_turn = {"role": "user", "content": input_text}
dialogue_template = [user_turn]

# Let the tokenizer render the conversation as a text prompt (no tokenization
# yet) with the generation prompt appended.
prompt = tokenizer.apply_chat_template(
    conversation=dialogue_template,
    tokenize=False,
    add_generation_prompt=True,
)
print(f"\nPrompt (formatted): \n{prompt}")

Tokenization

In [None]:
# Display the tokenizer object.
tokenizer

In [None]:
%%time
# Tokenize the prompt into PyTorch tensors and move them to the GPU.
tokenized_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = tokenized_prompt.to("cuda")

# Let the LLM generate at most 256 new tokens.
outputs = llm_model.generate(
    **input_ids,
    max_new_tokens=256,
)
print(f"Model output (tokens):\n{outputs[0]}\n")

In [None]:
# Turn the first generated token sequence back into text.
generated_ids = outputs[0]
outputs_decoded = tokenizer.decode(generated_ids)
print(f"Model output (decoded):\n{outputs_decoded}\n")

In [None]:
# Sample questions used to exercise the model.
question_list = [
    "What is a pull request in GitHub, and how does it contribute to the collaborative development process?",
    "Can you explain the steps to resolve merge conflicts in a GitHub pull request?",
    "What are the best practices for reviewing a pull request on GitHub?",
    "How can you link a pull request to an issue in GitHub to automate issue closure upon merging?",
    "What are some common mistakes to avoid when creating a pull request on GitHub?",
]

In [None]:
import random
# Pick one of the sample questions at random.
query = random.choice(question_list)
print(f"Query: {query}")

# Look at the scores and indices of the best-matching chunks for it.
top_matches = retrieve_relevant_resources(query=query, embeddings=embeddings)
scores, indices = top_matches
scores, indices

Augmenting our prompt with context items

In [None]:
def prompt_formatter(query: str,
                     context_items: list[dict],
                     use_chat_template: bool = False) -> str:
    """Build the augmented prompt for ``query`` from the retrieved context items.

    Args:
        query: The user's question.
        context_items: Retrieved records; the ``"sentence_chunk"`` value of
            each one becomes a bullet in the prompt.
        use_chat_template: When False (default, the behavior callers have
            relied on so far) the plain prompt text is returned. When True the
            prompt is wrapped as a single user turn and rendered with the
            module-level ``tokenizer``'s chat template. Callers that strip the
            prompt from decoded model output should take into account that the
            rendered template may contain extra template tokens.

    Returns:
        The prompt string.
    """
    # One "- " bullet per retrieved chunk.
    context = "- " + "\n- ".join(item["sentence_chunk"] for item in context_items)

    # The f-string already interpolates `query` and `context`. Calling
    # str.format() on the result again would re-interpret any "{" or "}" that
    # appears in the retrieved text and raise or corrupt it, so no second
    # formatting pass is done.
    base_prompt = f"""Based on the following context items, please answer the query.
The answers should be explanatory and comprehensive.
Query: {query}
Context items:
{context}
Relevant parts: <extract relevant passages from the context>
Answer:
"""

    if not use_chat_template:
        return base_prompt

    # Wrap the prompt as a single user turn and render it with the chat template.
    dialogue_template = [
        {"role": "user",
         "content": base_prompt}
    ]
    return tokenizer.apply_chat_template(conversation=dialogue_template,
                                         tokenize=False,
                                         add_generation_prompt=True)

In [None]:
# Pick a sample question and build its augmented prompt.
query = random.choice(question_list)
print(f"Query: {query}")

# Retrieve the best-matching chunks for THIS query. The result must be bound
# to `indices`: with a misspelled name the comprehension below would silently
# reuse indices left over from an earlier cell, i.e. from a different query.
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
context_items = [files_and_chunks[i] for i in indices]

prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)

In [None]:
%%time
# Tokenize the augmented prompt and move the tensors to the GPU.
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Sample an answer; the higher the temperature, the more creative the answer is.
outputs = llm_model.generate(
    **input_ids,
    temperature=0.7,
    do_sample=True,
    max_new_tokens=256,
)

# Decode the tokens and print the text with the prompt removed.
output_text = tokenizer.decode(outputs[0])
print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

Wrap retrieval, augmentation and generation in a single function

In [None]:
def ask(query: str,
        temperature: float=0.5,
        max_new_tokens: int=256,
        format_answer_text=True,
        detailed_output=True):
    """Answer ``query`` with the retrieve -> augment -> generate pipeline.

    Args:
        query: The user's question.
        temperature: Sampling temperature passed to ``llm_model.generate``.
        max_new_tokens: Maximum number of tokens to generate.
        format_answer_text: When True, print the generated text with the
            prompt removed.
        detailed_output: When True, return the context items that were used.

    Returns:
        The list of context item dicts (each with an added ``"score"`` entry)
        when ``detailed_output`` is True, otherwise ``None``.
    """
    # RETRIEVAL
    # Scores and indices of the best-matching chunks.
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)

    # Work on shallow copies: adding "score" or dropping "embedding" on the
    # original dicts would permanently alter the shared `files_and_chunks`
    # records and leak into every later call.
    context_items = [dict(files_and_chunks[i]) for i in indices]
    for item, score in zip(context_items, scores):
        item["score"] = score.cpu()
        if not detailed_output:
            item.pop("embedding", None)  # drop embeddings if detailed_output=False

    # AUGMENTATION
    # Build the prompt from the query and the retrieved context.
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # GENERATION
    # Tokenize the prompt and move it to the GPU.
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
    # Generate the output tokens.
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)
    # Decode the tokens back to text and strip the begin/end markers.
    output_text = tokenizer.decode(outputs[0]).replace("<bos>", '').replace("<eos>", '')

    # Print the answer without the echoed prompt.
    if format_answer_text:
        print(f"RAG answer:\n{output_text.replace(prompt, '')}")

    if detailed_output:
        return context_items
    return None

In [None]:
# Run the whole pipeline on a randomly chosen sample question.
query = random.choice(question_list)
print(f"Query: {query}")
ask(query)

Testing:

Question that uses an abbreviation (short form)

In [None]:
give_query = "How to merge a PR?"

# Use the supplied question; fall back to a random sample question when it is
# blank. Empty and whitespace-only strings both count as blank (not only the
# exact single-space string).
query = give_query if give_query.strip() else random.choice(question_list)
print(f"Query: {query}")
ask(query=query)

Irrelevant question

In [None]:
give_query = "How long does a koala live?"

# Use the supplied question; fall back to a random sample question when it is
# blank. Empty and whitespace-only strings both count as blank (not only the
# exact single-space string).
query = give_query if give_query.strip() else random.choice(question_list)
print(f"Query: {query}")
ask(query=query)