In [16]:
import os

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Initialize the sentence transformer model to generate embeddings.
# 'all-MiniLM-L6-v2' is a smaller transformer model, chosen for speed.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the Hugging Face question answering pipeline.
# SECURITY: an access token must never be hard-coded in source. It is read from
# the HF_TOKEN environment variable instead; any token that was previously
# committed in this file should be treated as leaked and revoked.
# When HF_TOKEN is unset, None is passed and no token is sent.
# NOTE(review): newer transformers releases appear to name this argument
# `token`; `use_auth_token` is kept to match the original call - confirm
# against the installed version.
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    tokenizer="deepset/roberta-base-squad2",
    use_auth_token=os.environ.get("HF_TOKEN"),
)

# Step 1: Load employee information from Employee_info.txt
def load_employee_info(file_path):
    """Read employee records from a text file, one record per line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per non-blank line, with leading and trailing
        whitespace removed. Blank (or whitespace-only) lines are skipped so
        that they do not end up as empty documents in the search index.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [line for line in stripped_lines if line]

# Load the employee information from the text file
employee_info = load_employee_info('Employee_info.txt')
if not employee_info:
    # Fail early with a clear message: with no records there is nothing to
    # embed, and the shape lookup below would otherwise fail obscurely.
    raise ValueError("Employee_info.txt contains no employee records to index")

# Step 2: Generate embeddings for employee info
# FAISS works on C-contiguous float32 matrices, so enforce that layout here
# instead of relying on the encoder's default output format.
employee_embeddings = np.ascontiguousarray(
    embedder.encode(employee_info), dtype=np.float32
)

# Step 3: Create a FAISS index
dimension = employee_embeddings.shape[1]  # Dimensions of the embeddings
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for simplicity
index.add(employee_embeddings)

# Step 4: Function to retrieve the most relevant documents using FAISS
def retrieve_documents(query, index, embedder, k=3, documents=None):
    """Return the documents whose embeddings are nearest to the query.

    Args:
        query: The query text to embed and search for.
        index: Search index exposing ``ntotal`` and ``search(x, k=...)``
            (a FAISS index in this file).
        embedder: Object exposing ``encode(list_of_texts)`` that returns one
            embedding row per text.
        k: Maximum number of documents to return. Defaults to 3.
        documents: Sequence that index positions map into. Defaults to the
            module-level ``employee_info`` list.

    Returns:
        A list of at most ``k`` documents, in the order returned by the index.
        The list is empty when the index holds no vectors or ``k`` is not
        positive.
    """
    if documents is None:
        documents = employee_info

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, which would silently select the *last* document.
    top_k = min(k, index.ntotal)
    if top_k <= 0:
        return []

    # Generate the query embedding as a (1, dim) float32 matrix for FAISS
    query_embedding = np.asarray(
        embedder.encode([query]), dtype=np.float32
    ).reshape(1, -1)

    _distances, indices = index.search(query_embedding, k=top_k)

    # Skip any remaining -1 placeholders rather than indexing from the end
    return [documents[idx] for idx in indices[0] if idx >= 0]

# Step 5: Use Hugging Face API to get answer from relevant docs
def generate_answer(query, relevant_docs, qa=None):
    """Extract an answer to ``query`` from the retrieved documents.

    Args:
        query: The question text.
        relevant_docs: Iterable of document strings to use as context.
        qa: Callable invoked as ``qa(question=..., context=...)`` that returns
            a mapping with an ``'answer'`` key. Defaults to the module-level
            ``qa_pipeline``.

    Returns:
        The ``'answer'`` value from the QA result, or an empty string when
        there is no non-empty context to search.
    """
    if qa is None:
        qa = qa_pipeline

    # Combine the relevant documents into a single context, dropping empties
    context = " ".join(doc for doc in relevant_docs if doc)
    if not context.strip():
        # Nothing to search; skip the model call instead of failing inside it
        return ""

    result = qa(question=query, context=context)
    return result['answer']


# Step 6: Putting everything together
def answer_query(query, index, embedder, qa_pipeline):
    """Answer ``query`` by retrieving documents and running extractive QA.

    Args:
        query: The question text.
        index: Search index passed through to ``retrieve_documents``.
        embedder: Embedding model passed through to ``retrieve_documents``.
        qa_pipeline: Question-answering callable used to produce the answer.

    Returns:
        The answer returned by ``generate_answer``.
    """
    # Retrieve relevant documents based on the query
    relevant_docs = retrieve_documents(query, index, embedder)

    # Use the pipeline the caller supplied rather than the module-level one
    return generate_answer(query, relevant_docs, qa=qa_pipeline)

# Sample question used to exercise the retrieval + QA flow end to end
user_query = "Who is the highest-paid employee?"

# Run the query through the pipeline and show the extracted answer
answer = answer_query(
    query=user_query,
    index=index,
    embedder=embedder,
    qa_pipeline=qa_pipeline,
)
print(f"Answer: {answer}")


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Answer: Sonia Mittal
