In [1]:
from langchain_openai import OpenAIEmbeddings
from langchain.evaluation import load_evaluator
from dotenv import load_dotenv
import openai
import os
import time

# Load environment variables from the project's config.env file (not the default .env).
# The file is expected to define OPENAI_API_KEY.
load_dotenv("config.env")
# Fails fast with KeyError if OPENAI_API_KEY is still unset after loading.
openai.api_key = os.environ["OPENAI_API_KEY"]

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain.schema import Document
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
import shutil

# Directory handed to Chroma as persist_directory (on-disk vector store).
CHROMA_PATH = "input_data/chroma"
# Directory that DirectoryLoader scans for the source *.txt documents.
DATA_PATH = "input_data/"



In [2]:
def CompareTwoWords(w1, w2):
    """Print the embedding of ``w1`` and the embedding distance between ``w1`` and ``w2``.

    Args:
        w1: First word/string; its embedding vector and the vector length are printed.
        w2: Second word/string, compared against ``w1``.

    Returns:
        None. All results are printed.
    """
    # Get embedding for the first word.
    embedding_function = OpenAIEmbeddings()
    vector = embedding_function.embed_query(w1)
    # Label the output with the actual input rather than a hard-coded 'apple'.
    print(f"Vector for '{w1}': {vector}")
    print(f"Vector length: {len(vector)}")

    # Compare the two words with the "pairwise_embedding_distance" evaluator.
    evaluator = load_evaluator("pairwise_embedding_distance")
    x = evaluator.evaluate_string_pairs(prediction=w1, prediction_b=w2)
    print(f"Comparing ({w1}, {w2}): {x}")

In [3]:
def load_documents():
    """Load the files in ``DATA_PATH`` matching the ``*.txt`` glob and return the loaded documents."""
    return DirectoryLoader(DATA_PATH, glob="*.txt").load()

In [4]:
import nltk

# One-time setup: uncomment the downloads below if loading fails because these
# NLTK resources are missing (presumably required by DirectoryLoader's text
# parsing -- TODO confirm).
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')

# Load the source documents via load_documents().
documents = load_documents()

In [5]:
# Inspect the loaded Document objects (source metadata and page content).
documents

[Document(metadata={'source': 'input_data/overview.txt'}, page_content="Overview\n\nHackerEarth is an online technical-recruitment tool that enables you to automate your hiring process to select the best fit for your team.\n\nUsing HackerEarth Assessment, you can create tests to evaluate candidates. By automating your recruitment process, HackerEarth Assessment saves the time and effort required to go through hundreds of resumes manually. It allows you to assess and shortlist a targeted pool of candidates who suit your requirements.\n\nThis article gives you an in-depth understanding of HackerEarth Assessment and its benefits. By using HackerEarth, you can streamline the recruitment process and find the perfect candidate for your team.\n\nWhy should you use HackerEarth Assessment?\n\nHackerEarth provides you with industry-leading features which are as follows:\n\n1. HackerEarth's rich library\n\nHackerEarth's library is a repository of pre-built questions that can be used to create cus

In [6]:
# Split the loaded documents into chunks of at most 800 characters (measured
# with len), letting neighbouring chunks overlap by 40 characters.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=40,
    chunk_size=800,
)
chunks = text_splitter.split_documents(documents)

print(f"Length of chunks = {len(chunks)}")

Length of chunks = 10


In [7]:
# Show the text of every chunk to sanity-check where the splits landed.
[chunk.page_content for chunk in chunks]

['Overview\n\nHackerEarth is an online technical-recruitment tool that enables you to automate your hiring process to select the best fit for your team.\n\nUsing HackerEarth Assessment, you can create tests to evaluate candidates. By automating your recruitment process, HackerEarth Assessment saves the time and effort required to go through hundreds of resumes manually. It allows you to assess and shortlist a targeted pool of candidates who suit your requirements.\n\nThis article gives you an in-depth understanding of HackerEarth Assessment and its benefits. By using HackerEarth, you can streamline the recruitment process and find the perfect candidate for your team.\n\nWhy should you use HackerEarth Assessment?\n\nHackerEarth provides you with industry-leading features which are as follows:',
 "1. HackerEarth's rich library\n\nHackerEarth's library is a repository of pre-built questions that can be used to create customized tests for assessing candidates. The library contains a vast c

In [8]:
def calculate_chunk_ids(chunks):
    """Assign an ID to every chunk, stored in ``chunk.metadata["id"]``.

    IDs have the form ``"<source>:<page>:<chunk index>"``, e.g.
    ``"data/monopoly.pdf:6:2"``. ``source`` and ``page`` are read from the
    chunk's metadata; a missing key is rendered as ``None`` (so a chunk without
    a ``page`` entry gets an ID like ``"input_data/overview.txt:None:3"``).

    The chunk index is counted per ``source:page`` pair across the whole input,
    so chunks of the same page receive distinct IDs even when they are not
    adjacent in ``chunks``. For input where each page's chunks are contiguous
    the indices are simply 0, 1, 2, ... in order of appearance.

    Args:
        chunks: Iterable of objects exposing a mutable ``metadata`` dict.

    Returns:
        The same ``chunks`` object, with each chunk's metadata updated in place.
    """
    # Next free chunk index for each "source:page" key seen so far.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        # A per-page counter (rather than "same as previous chunk?") keeps IDs
        # unique when chunks from one page are interleaved with another's.
        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Add the ID to the chunk meta-data.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

In [9]:
# Optional reset (disabled): uncomment the lines below to delete the persisted Chroma store before re-ingesting.
# if os.path.exists(CHROMA_PATH):
#     shutil.rmtree(CHROMA_PATH)

In [10]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embed with the sentence-transformers all-MiniLM-L6-v2 model and open the
# Chroma store persisted under CHROMA_PATH.
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedder)

# Only referenced by the disabled per-chunk ingestion loop kept further down.
last_request_time = 0
RATE_LIMIT_INTERVAL = 10

chunks_with_ids = calculate_chunk_ids(chunks)

# Add or Update the documents.
existing_items = db.get(include=[])  # IDs are always included by default
existing_ids = set(existing_items["ids"])
print(f"Number of existing documents in DB: {len(existing_ids)}")

# Only add documents that don't exist in the DB, so re-running this cell does
# not insert the same chunk twice.
new_chunks = [
    chunk for chunk in chunks_with_ids if chunk.metadata["id"] not in existing_ids
]

if new_chunks:
    print(f"Adding new documents: {len(new_chunks)}")
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
    db.add_documents(new_chunks, ids=new_chunk_ids)
    db.persist()
else:
    print("No new documents to add")

# Disabled legacy ingestion path (one chunk per request, rate-limited with
# retries); kept for reference.
# for i, chunk in enumerate(chunks):
#     print(i)
#     current_time = time.time()
#     if current_time - last_request_time < RATE_LIMIT_INTERVAL:
#         time.sleep(RATE_LIMIT_INTERVAL - (current_time - last_request_time))
#     last_request_time = current_time
#     # Create a new DB from the documents.
#     max_retries = 2
#     for attempt in range(max_retries):
#         try:
#             db = Chroma.from_documents(
#                 [chunk], embedder, persist_directory=CHROMA_PATH
#                 )
#         except Exception as e:
#             if attempt == max_retries - 1:
#                 raise e
#             time.sleep(1 * (2**attempt))

# db.persist()

# Report what this run actually wrote: only the new chunks are added, so on a
# re-run with nothing new this prints 0 rather than the total chunk count.
print(f"Saved {len(new_chunks)} chunks to {CHROMA_PATH}.")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


Number of existing documents in DB: 0
Adding new documents: 10
Saved 10 chunks to input_data/chroma.


  warn_deprecated(


In [11]:
# Query the RAG: re-open the persisted store with the embedder used for indexing.
embedding_function = embedder
db = Chroma(embedding_function=embedding_function, persist_directory=CHROMA_PATH)

# Look up the 5 best-matching chunks (with their scores) for the question.
query_text = "Why should you use HackerEarth Assessment?"
results = db.similarity_search_with_score(query_text, k=5)
print(results)

[(Document(metadata={'id': 'input_data/overview.txt:None:3', 'source': 'input_data/overview.txt'}, page_content="HackerEarth Assessment is designed to help you assess a large pool of candidates in a concise span of time. This means that you can quickly filter through a high volume of resumes and identify the best fit for your team. With HackerEarth Assessment, you can increase the number of candidates you assess, which can help you find the perfect candidate for your organization. This is particularly useful when you have multiple positions to fill or when you are looking to scale your team quickly. By automating the recruitment process, HackerEarth Assessment saves you time and effort and enables you to assess more candidates efficiently. This not only helps you find the right candidate faster but also ensures that you don't miss out on any potential hires.\n\n3. Diverse platform"), 0.5475894099918224), (Document(metadata={'id': 'input_data/overview.txt:None:5', 'source': 'input_data/

In [12]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

# Baseline prompt: only the question is filled in -- no retrieved context is
# passed to the model in this cell.
PROMPT_TEMPLATE = """
Answer the question based only on the following question:

{question}
"""

prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(question=query_text)
# print(prompt)

# model = Ollama(base_url='http://192.168.0.200:11434', model="qwen:1.8b")
model = Ollama(base_url="http://localhost:11434", model="qwen:1.8b")
# Use invoke(): calling the LLM object directly (model(prompt)) is deprecated
# in LangChain and emits a deprecation warning.
response_text = model.invoke(prompt)

# sources = [doc.metadata.get("id", None) for doc, _score in results]
formatted_response = f"Response: {response_text}"
print(formatted_response)

  warn_deprecated(


Response: You should use HackerEarth Assessment because it offers a variety of programming challenges that cater to different levels of experience and skill.

Here are some reasons why you should consider using HackerEarth Assessment:

1. Real-world problems: HackerEarth provides a wide range of coding challenges that simulate real-world problems, such as optimizing website performance, building mobile applications, and developing algorithms for data mining and machine learning.

2. Difficulty level: HackerEarth assessments vary in difficulty level from beginner to advanced levels. This allows you to select an assessment that matches your skill level and experience.

3. Accessibility: HackerEarth is accessible from anywhere with internet access, making it convenient for students who are studying remotely or who have other work commitments.

4. Collaboration: HackerEarth provides a collaborative platform where students can submit their coding challenges in real-time, allowing them to co

In [13]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

# Concatenate the retrieved chunks into one context string, separated by a
# "---" rule so chunk boundaries stay visible in the prompt.
context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])


RAG_PROMPT_TEMPLATE = """
Answer the question based on the following context:

{context}

---

Answer the question based on the above context: {question}
Do not repeat any information and be precise in your response.
"""

prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query_text)
# print(prompt)

# model = Ollama(model="mistral")
model = Ollama(base_url="http://localhost:11434", model="qwen:1.8b")
# Use invoke(): calling the LLM object directly (model(prompt)) is deprecated
# in LangChain and emits a deprecation warning.
response_text = model.invoke(prompt)

# Report the chunk IDs the answer was grounded on (None for chunks without an ID).
sources = [doc.metadata.get("id", None) for doc, _score in results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Response: HackerEarth Assessments provide several advantages for organizations seeking to streamline their recruitment processes and find the perfect candidate for their team.

  1. Industry- Leading Features: HackerEarth Assessments offer industry-leading features that are designed to streamline recruitment processes, reduce time spent reviewing resumes, and identify the best candidates for a company's team.
  
  2. Automated Recruitment Process: HackerEarth Assessments provide an automated recruitment process that allows organizations to shortlist potential candidates based on their specific requirements and preferences.
  
  3. Candidate Matching: HackerEarth Assessments provide candidate matching capabilities that allow organizations to quickly identify and shortlist potential candidates based on their specific requirements, preferences, and experience levels.
  

Therefore, using HackerEarth Assessments provides several advantages for organizations seeking to streamline their recr

In [14]:
# Print the retrieved context that was injected into the RAG prompt, for manual inspection.
print(context_text)

HackerEarth Assessment is designed to help you assess a large pool of candidates in a concise span of time. This means that you can quickly filter through a high volume of resumes and identify the best fit for your team. With HackerEarth Assessment, you can increase the number of candidates you assess, which can help you find the perfect candidate for your organization. This is particularly useful when you have multiple positions to fill or when you are looking to scale your team quickly. By automating the recruitment process, HackerEarth Assessment saves you time and effort and enables you to assess more candidates efficiently. This not only helps you find the right candidate faster but also ensures that you don't miss out on any potential hires.

3. Diverse platform

---

Advanced proctoring settings for assessments ensure that the tests are conducted in a fair and unbiased manner. HackerEarth Assessment provides best-in-class proctoring mechanisms to prevent cheating and ensure the 

In [15]:
# embeddings = OllamaEmbeddings(
#     base_url="http://192.168.0.200:11434", model="nomic-embed-text"
# )