# Task 2:
### Interactive QA Bot Interface
#### Problem Statement:
#### Develop an interactive interface for the QA bot from Part 1, allowing users to input queries and retrieve answers in real time. The interface should enable users to upload documents and ask questions based on the content of the uploaded document.
Task Requirements:
1. Build a simple frontend interface using Streamlit or Gradio, allowing users to
upload PDF documents and ask questions.
2. Integrate the backend from Part 1 to process the PDF, store document embeddings,
and provide real-time answers to user queries.
3. Ensure that the system can handle multiple queries efficiently and provide accurate,
contextually relevant responses.
4. Allow users to see the retrieved document segments alongside the generated
answer.


In [None]:
# Setup and Requirements
# Notebook shell command (the leading "!" hands the line to the shell).
# Installs the third-party packages imported below: PyPDF2 (PDF text
# extraction), pinecone-client (vector index), cohere (answer generation),
# and transformers + torch (embedding model). "-q" keeps pip's output quiet.
!pip install -q PyPDF2 pinecone-client cohere transformers torch

In [None]:
# import libraries
import PyPDF2
import pinecone
import cohere
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Extract pdf
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts; the notebook passes
            a file path string.

    Returns:
        A single string with the text of all pages in page order, joined
        without any separator (same as the original behaviour).
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` guards against pages for which no text could be extracted
    # (e.g. scanned images), so one such page cannot abort the whole document.
    return "".join(page.extract_text() or "" for page in reader.pages)

# Initialize pinecone
import os
from pinecone import Pinecone, ServerlessSpec
# Prefer a key supplied through the environment so the real secret never has
# to be committed with the notebook; the placeholder literal is kept as a
# fallback so the cell behaves as before when the variable is not set.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
)

# Reset index: drop any previous 'rag' index so vectors from an earlier run
# cannot leak into this session. The existence check matters on a first run,
# where deleting a non-existent index fails with a not-found error.
if 'rag' in pc.list_indexes().names():
    pc.delete_index('rag')

# Create index
if 'rag' not in pc.list_indexes().names():
    pc.create_index(
        name='rag',
        dimension=384,  # must match the size of the vectors upserted later
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        ),
    )

# Vector Database Setup
index = pc.Index('rag')

# Initialize Cohere for text generation (alternatively, a GPT-3/4 API can be used).
# The key is read from the environment when available so the real secret does
# not have to live in the notebook; the placeholder literal is the fallback.
cohere_client = cohere.Client(
    api_key=os.environ.get("COHERE_API_KEY", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
)

# Load a pre-trained embedding model from Hugging Face (e.g., sentence-transformers).
# The name is defined once so tokenizer and model can never drift apart.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

def generate_embeddings(texts):
    """Embed one or more texts as L2-normalised sentence vectors.

    Uses the module-level ``tokenizer`` and ``model``. Token embeddings are
    mean-pooled using the attention mask, so padding added when several texts
    of different lengths are batched does not distort the result. For a
    single text (no padding) this equals a plain mean over tokens.

    Args:
        texts: A string or a list of strings to embed.

    Returns:
        A float32 ``numpy.ndarray`` with one row per input text. Each row has
        unit L2 norm (an all-zero row is left as zeros rather than NaN).
    """
    # The tokenizer cannot batch an empty list; return an empty matrix instead.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        model_output = model(**inputs)

    # Masked average pooling: only real tokens contribute to the mean.
    token_embeddings = model_output.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    embeddings = summed / token_counts

    # Convert to numpy array and cast to float32
    embeddings_np = embeddings.numpy().astype(np.float32)

    # L2 normalization; the floor on the norm avoids division by zero.
    norms = np.linalg.norm(embeddings_np, axis=1, keepdims=True)
    normalized_embeddings = embeddings_np / np.maximum(norms, 1e-12)

    return normalized_embeddings

# Step 1: Upload PDF
uploaded_file = 'Patten_Audio.pdf'
document_text = extract_text_from_pdf(uploaded_file)
print("Document uploaded successfully!")

# Step 2: Generate embeddings and store in Pinecone
# Split the document into sentence-like segments. Blank segments are dropped
# so that no empty strings are embedded and stored as retrievable "documents".
document_segments = [
    piece.strip() for piece in document_text.split(". ") if piece.strip()
]

# Insert documents into Pinecone with their embeddings. Vectors are sent in
# batches rather than one request per sentence to cut network round-trips.
# Each segment is still embedded on its own, so the vectors are unchanged.
UPSERT_BATCH_SIZE = 100
pending = []
for i, segment in enumerate(document_segments):
    embedding = generate_embeddings([segment])[0].tolist()
    # The id encodes the position in document_segments; retrieval relies on it.
    pending.append((f"doc_{i}", embedding, {"text": segment}))
    if len(pending) >= UPSERT_BATCH_SIZE:
        index.upsert(pending)
        pending = []
if pending:
    index.upsert(pending)  # flush the final partial batch

print(f"Stored {len(document_segments)} document segments in Pinecone.")

# Step 3: Query Input
query = input("Ask a question based on the document:")

Document uploaded successfully!
Stored 621 document segments in Pinecone.
Ask a question based on the document:What is canonical babbling ?


In [None]:
# The helpers are defined unconditionally so that later cells can call
# qa_bot() even when this cell's guard below evaluates to False.


def retrieve_relevant_docs(query, top_k=3):
    """Return the document segments most similar to ``query``.

    Args:
        query: The user's question as a string.
        top_k: Maximum number of segments to retrieve from Pinecone.

    Returns:
        A list of segment strings taken from ``document_segments``; empty
        when the index returns no matches.
    """
    query_embedding = generate_embeddings([query])[0].tolist()
    # Pinecone expects a single flat list of floats for ``vector``.
    results = index.query(vector=query_embedding, top_k=top_k)

    if 'matches' not in results or not results['matches']:
        print("No matches found in the query results.")
        return []

    relevant_docs = []
    for match in results['matches']:
        # Ids were written as "doc_<position>" at ingest time.
        doc_index = int(match['id'].split("_")[1])
        relevant_docs.append(document_segments[doc_index])
    return relevant_docs


def generate_answer(query, relevant_docs):
    """Ask the Cohere model to answer ``query`` using ``relevant_docs``.

    Args:
        query: The user's question.
        relevant_docs: List of context strings placed in the prompt.

    Returns:
        The generated answer text with surrounding whitespace removed.
    """
    context = "\n".join(relevant_docs)
    prompt = f"Question: {query}\n\nContext:\n{context}\n\nAnswer:"
    response = cohere_client.generate(
        model="command-nightly",
        prompt=prompt,
        max_tokens=700,
        temperature=0.5
    )
    return response.generations[0].text.strip()


def qa_bot(query):
    """Run retrieval followed by generation for one question.

    Args:
        query: The user's question.

    Returns:
        A tuple ``(answer, relevant_docs)`` so callers can show the retrieved
        segments alongside the generated answer.
    """
    relevant_docs = retrieve_relevant_docs(query)
    answer = generate_answer(query, relevant_docs)
    return answer, relevant_docs


if query and uploaded_file:
    # Display the generated answer
    answer, relevant_docs = qa_bot(query)
    print("Answer :", answer)
    print("\nRelevant Documents :\n", relevant_docs)

[0.07201150804758072, -0.09017466008663177, 0.021662145853042603, -0.021200956776738167, -0.04220250993967056, 0.02661280520260334, 0.05614350363612175, -0.02027919702231884, 0.026782838627696037, -0.03853462263941765, 0.004435716662555933, -0.0008672524127177894, 0.05640381574630737, 0.007774443831294775, -0.005962664261460304, 0.0549413226544857, -0.022355131804943085, 0.019142666831612587, 0.02189999260008335, 0.013394000008702278, 0.058959830552339554, 0.0788196325302124, -0.004398447461426258, 0.031646035611629486, 0.0011076133232563734, 0.04837500676512718, -0.003229165682569146, -0.09509169310331345, 0.08223120868206024, -0.01595013402402401, -0.055324163287878036, 0.11205848306417465, 0.04486626759171486, 0.010925263166427612, -0.00704124616459012, 0.007775326259434223, 0.017140228301286697, 0.04820593073964119, 0.04448217153549194, 0.009454267099499702, -0.001314355293288827, 0.007862546481192112, -0.052833523601293564, -0.0014009616570547223, 0.05132395774126053, 0.0360013283

In [None]:
# Display the generated answer
# Runs the QA pipeline again for a second, hard-coded question. This cell
# relies on qa_bot (and the index/model state it uses) having been set up by
# the earlier cells; it overwrites the notebook-level query, answer and
# relevant_docs variables.
query = 'What is ASD?'
answer, relevant_docs = qa_bot(query)  # qa_bot returns (answer, retrieved segments)
print("Answer :", answer)
print("\nRelevant Documents :\n", relevant_docs)

[-0.08737751096487045, -0.0333377979695797, -0.03209114074707031, 0.058546263724565506, -0.022375989705324173, 0.011384046636521816, 0.022838765755295753, 0.007417604792863131, -0.0005669491947628558, -0.005664408672600985, 0.057224906980991364, 0.08533212542533875, -0.052032146602869034, -0.0006254636682569981, -0.09298144280910492, -0.03991258889436722, 0.07842227816581726, -0.043576374650001526, -0.06257059425115585, 0.01376315113157034, 0.009178567677736282, 0.11279145628213882, 0.008263758383691311, 0.014043862000107765, 0.07279136031866074, -0.01552728284150362, 0.021861186251044273, -0.10999882221221924, 0.025942817330360413, -0.04288375377655029, 0.04842459782958031, -0.024141423404216766, 0.05838273465633392, 0.012754484079778194, -0.014084471389651299, 0.009031779132783413, 0.06440770626068115, 0.002429636660963297, -0.020844003185629845, 0.057898830622434616, -0.04483760520815849, -0.01785344071686268, 0.003112099366262555, 0.022858988493680954, -0.025895366445183754, -0.009