In [None]:
import fitz

def load_pdf(pdf_file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        str: The text of each page prefixed with a newline, with the pages
        joined by a newline.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction fails part-way through.
    with fitz.open(pdf_file_path) as doc:
        # The join consumes the generator fully while the document is open.
        return "\n".join("\n" + page.get_text() for page in doc)

In [None]:
# Read the full text of the PDF.
# NOTE(review): this is a hardcoded absolute path on one user's machine;
# point it at your own file before running.
docs = load_pdf("C:/Users/gaurav/Downloads/resume.pdf")

In [None]:
def chunk_documents(text, chunk_size, overlap):
    """Split text into fixed-size character chunks that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size`` and can lie entirely
    inside the overlap region of the chunk before it.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: Chunks in document order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A step of zero or less would never advance the start position, so the
    # loop would never terminate; reject such arguments up front.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    step = chunk_size - overlap
    # Slicing clamps at the end of the string, so no explicit min() is needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

In [None]:
# Split the text into chunks of up to 500 characters, with 50 characters
# shared between neighbouring chunks.
chunks = chunk_documents(docs, 500, 50)

In [None]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch

# Torch-saved vector store: compute an embedding for each document chunk, build a chunk -> embedding dict, and save that dict as a Torch (.pt) file

def store_embeddings_as_dict(embeddings_list, text_list):
    """Pair each text with its embedding in a dictionary.

    Texts become the keys and embeddings the values, matched by position.
    Pairing stops at the shorter of the two inputs, and a repeated text keeps
    only the embedding from its last occurrence.
    """
    return dict(zip(text_list, embeddings_list))

def save_embeddings_to_pt(embeddings_dict, filename):
    """Serialize the embeddings dictionary to ``filename`` with ``torch.save``."""
    torch.save(obj=embeddings_dict, f=filename)

def create_vectorstore(embed_model, chunks, output_path="embeddings.pt"):
    """Embed every chunk and save the chunk-to-embedding mapping to disk.

    Args:
        embed_model: Model name or path passed to ``SentenceTransformer``.
        chunks: Sequence of text chunks to embed. It is iterated twice (once
            to embed, once to build the dictionary), so it must not be a
            one-shot iterator.
        output_path: Destination file for the saved dictionary. Defaults to
            ``"embeddings.pt"``, the location that used to be hard-coded.

    Returns:
        None. The result is written to ``output_path``.
    """
    embedder = SentenceTransformer(embed_model)
    chunk_embeddings = [
        # Each embedding is moved to the CPU before it is stored.
        embedder.encode(chunk, convert_to_tensor=True).to("cpu")
        for chunk in tqdm(chunks, desc="TestCases Embeddings...")
    ]
    embeddings_dict = store_embeddings_as_dict(chunk_embeddings, chunks)
    save_embeddings_to_pt(embeddings_dict, output_path)

In [None]:
# Embed every chunk with the all-MiniLM-L6-v2 model; create_vectorstore
# writes the resulting dictionary to "embeddings.pt".
create_vectorstore("sentence-transformers/all-MiniLM-L6-v2", chunks)

In [None]:
from sentence_transformers import util

def retrieve_relevant_docs(embed_model, embeddings_path, query, top_k):
    """Return the stored chunks most similar to a query.

    Args:
        embed_model: Model name or path passed to ``SentenceTransformer``;
            presumably the same model that produced the stored embeddings
            (verify against the caller).
        embeddings_path: File holding a ``{chunk_text: embedding}`` dictionary
            saved with ``torch.save``.
        query: Query text to embed and compare against the stored chunks.
        top_k: Maximum number of chunks to return; capped at the number of
            stored chunks.

    Returns:
        list[str]: Up to ``top_k`` chunk texts, highest cosine similarity
        first. Empty when nothing is stored or ``top_k`` is not positive.
    """
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code; only load embedding files that you created or otherwise trust.
    embeddings_dict = torch.load(embeddings_path, map_location="cpu")
    chunks = list(embeddings_dict.keys())

    top_k = min(top_k, len(chunks))
    if top_k <= 0:
        return []

    embedder = SentenceTransformer(embed_model)
    # Keep the query on the CPU alongside the loaded embeddings; comparing
    # tensors that live on different devices raises an error.
    query_encoded = embedder.encode(query, convert_to_tensor=True).to("cpu")

    # Score all chunks with one cosine-similarity call instead of one call per
    # chunk. Assumes every stored embedding has the same shape - TODO confirm.
    corpus_embeddings = torch.stack([embeddings_dict[chunk] for chunk in chunks])
    scores = util.cos_sim(query_encoded, corpus_embeddings)[0]

    top_k_chunk_indices = torch.topk(scores, k=top_k).indices.tolist()
    return [chunks[i] for i in top_k_chunk_indices]

In [None]:
# Fetch the 3 stored chunks that score highest against the question.
top_k_docs = retrieve_relevant_docs("sentence-transformers/all-MiniLM-L6-v2", "embeddings.pt", "What are the skills specified", 3)

In [None]:
def stuff_docs(relevant_chunks):
    """Concatenate chunks into one numbered context string.

    Each chunk is rendered as ``Document<N>: <chunk>`` followed by a blank
    line, with numbering starting at 1. An empty input yields an empty string.
    """
    pieces = []
    for position, chunk in enumerate(relevant_chunks, start=1):
        pieces.append(f"Document{position}: {chunk}\n\n")
    return "".join(pieces)

In [None]:
# Merge the retrieved chunks into a single numbered context string.
stuffed_doc = stuff_docs(top_k_docs)

In [None]:
# Prompt template: qa() fills the {context} and {question} placeholders
# through str.format before sending the text to the model.
prompt = """Answer the question based on the given context alone.
context: {context}
question: {question}
answer:"""


In [None]:
import os
from groq import Groq
def qa(llm_name, context, question):
    """Ask a Groq-hosted chat model a question about the given context.

    The module-level ``prompt`` template is filled with ``context`` and
    ``question`` and sent as a single user message.

    Args:
        llm_name: Model identifier passed to the chat-completions call.
        context: Text inserted into the ``{context}`` slot of the template.
        question: Text inserted into the ``{question}`` slot of the template.

    Returns:
        The message content of the first completion choice.

    Raises:
        RuntimeError: If the ``GROQ_API_KEY`` environment variable is unset
            or empty.
    """
    # Credentials must never live in source control: read the key from the
    # environment instead of embedding it in the notebook.
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError("GROQ_API_KEY environment variable is not set")

    llm_inp = prompt.format(context=context, question=question)
    client = Groq(api_key=api_key)
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": llm_inp,
            }
        ],
        model=llm_name,
    )

    return chat_completion.choices[0].message.content

In [None]:
# Pass the stuffed context string rather than the raw list of chunks, so the
# prompt holds the numbered documents instead of a Python list repr.
qa("llama3-70b-8192", stuffed_doc, "What are the skills specified")