# In [17]:
import os

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Sentence-embedding model used to embed both the documents and the queries.
# all-MiniLM-L6-v2 is a small model, chosen here for speed.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Hugging Face question-answering pipeline; its result dict is read via the
# 'answer' key further down.
#
# SECURITY: access tokens must never be committed to source control. The token
# is read from the HF_TOKEN environment variable; when the variable is unset
# None is passed, which is the parameter's default value.
# NOTE: any token that was previously committed in this file should be treated
# as leaked and revoked.
# NOTE(review): newer transformers releases name this argument `token` and
# deprecate `use_auth_token` - confirm against the installed version.
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    tokenizer="deepset/roberta-base-squad2",
    use_auth_token=os.environ.get("HF_TOKEN"),
)

# Step 1: Load content from the provided text files
def load_text_files(file_paths, encoding="utf-8"):
    """Read each file and return its text with surrounding whitespace removed.

    Args:
        file_paths: Iterable of paths to text files.
        encoding: Text encoding used to decode every file. Defaults to UTF-8
            so the result does not depend on the platform's locale encoding.

    Returns:
        A list of strings, one per path, in the same order as ``file_paths``.

    Raises:
        OSError: If a file cannot be opened or read.
        UnicodeDecodeError: If a file is not valid text in ``encoding``.
    """
    content = []
    for file_path in file_paths:
        # The context manager closes the handle even if reading fails.
        with open(file_path, 'r', encoding=encoding) as file:
            content.append(file.read().strip())
    return content

# Source articles that make up the searchable corpus.
files = [
    '05-03-checks-the-ai-powered-data-protection-project-incubated-in-area-120-officially-exits-to-google.txt',
    '05-06-amazon-launches-free-channels-check-marks-come-to-gmail-and-openai-raises-more-moolah.txt',
    '05-07-fintech-space-continues-to-be-competitive-and-drama-filled.txt'
]

# Read every article into memory; document_texts[i] is the text of files[i].
document_texts = load_text_files(files)

# Step 2: Embed each document as a single vector.
# FAISS only accepts C-contiguous float32 matrices, so coerce explicitly
# instead of relying on the encoder's default output type.
# NOTE(review): each whole file is embedded in one call; sentence-transformers
# models truncate input past their maximum sequence length - confirm the
# articles fit, or split them into chunks before embedding.
document_embeddings = np.ascontiguousarray(
    embedder.encode(document_texts), dtype=np.float32
)

# Step 3: Build an exact (brute-force) L2 index over the document vectors.
# Vectors are added in order, so index row i corresponds to document_texts[i].
dimension = document_embeddings.shape[1]  # Width of one embedding vector
index = faiss.IndexFlatL2(dimension)
index.add(document_embeddings)

# Step 4: Function to retrieve the most relevant document based on the query
def retrieve_documents(query, index, embedder, documents=None):
    """Return the single document whose embedding is nearest to the query.

    Args:
        query: The question text to search with.
        index: Index object exposing ``search(vectors, k)`` that returns a
            ``(distances, indices)`` pair, e.g. a FAISS index.
        embedder: Object exposing ``encode(list_of_str)`` that returns one
            embedding per input string.
        documents: Sequence of document texts aligned with the index rows.
            Defaults to the module-level ``document_texts``.

    Returns:
        The text of the top-ranked document.

    Raises:
        IndexError: If the index reports no match, or a position that does
            not exist in ``documents``.
    """
    if documents is None:
        documents = document_texts

    # FAISS expects a float32 matrix of shape (n_queries, dim).
    query_embedding = np.asarray(
        embedder.encode([query]), dtype=np.float32
    ).reshape(1, -1)

    # Only the neighbour positions are needed; distances are discarded.
    _, indices = index.search(query_embedding, k=1)
    best = int(indices[0][0])

    # FAISS uses -1 as the label when it has no result (e.g. an empty index).
    # Without this check -1 would silently pick the last document through
    # Python's negative indexing.
    if not 0 <= best < len(documents):
        raise IndexError(
            f"index returned no usable match for the query (position {best})"
        )

    return documents[best]

# Step 5: Use Hugging Face API to get the answer from the retrieved document
def generate_answer(query, relevant_doc, qa=None):
    """Extract an answer to ``query`` from ``relevant_doc``.

    Args:
        query: The question text.
        relevant_doc: The context text the answer is taken from.
        qa: Callable accepting ``question=`` and ``context=`` keyword
            arguments and returning a mapping with an ``'answer'`` key.
            Defaults to the module-level ``qa_pipeline``.

    Returns:
        The value stored under ``'answer'`` in the callable's result.
    """
    if qa is None:
        qa = qa_pipeline

    result = qa(question=query, context=relevant_doc)
    return result['answer']

# Step 6: Putting everything together
def answer_query(query, index, embedder, qa_pipeline):
    """Answer ``query`` by retrieving the best document and running QA on it.

    Args:
        query: The question text.
        index: Index passed through to ``retrieve_documents``.
        embedder: Embedding model passed through to ``retrieve_documents``.
        qa_pipeline: Callable accepting ``question=`` and ``context=`` keyword
            arguments and returning a mapping with an ``'answer'`` key.

    Returns:
        The value stored under ``'answer'`` in the pipeline's result.
    """
    # Retrieve the document that best matches the query.
    relevant_doc = retrieve_documents(query, index, embedder)

    # Invoke the pipeline that was passed in. Routing this through a helper
    # that reads the module-level pipeline would silently ignore the argument.
    result = qa_pipeline(question=query, context=relevant_doc)

    return result['answer']

# Demo: ask a single question against the indexed articles.
user_query = "What is the main focus of the Checks AI project?"

# Run retrieval followed by question answering, then show the result.
answer = answer_query(
    user_query,
    index=index,
    embedder=embedder,
    qa_pipeline=qa_pipeline,
)
print(f"Answer: {answer}")


# Output of the cell above:
# Answer: privacy rules and regulations
