In [17]:
import pandas as pd
from PyPDF2 import PdfReader
from docx import Document as DocxDocument
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.schema.document import Document
import faiss
from openai import OpenAI
import numpy as np


In [18]:

# Set your OpenAI API key (assumed to be stored in an environment variable)
from dotenv import load_dotenv
import os


In [19]:

load_dotenv()  # Load environment variables from .env file

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # Retrieve the key



In [20]:
openai_client=OpenAI(api_key=OPENAI_API_KEY)

In [21]:
def normalize(vectors):
    """
    Normalize a list of vectors.

    Args:
        vectors (list): A list of vectors to normalize.

    Returns:
        list: A list of normalized vectors.
    """
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / norms

In [22]:
import PyPDF2

def extract_pdf_text_and_page_info(pdf_path):
    """
    Extracts text from each page of a PDF and returns a dictionary with page number and text.

    Args:
        pdf_path: The path to the PDF file.

    Returns:
        A dictionary where keys are page numbers (starting from 1) and values are the text of that page.
    """
    pdf_text_by_page = {}
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        num_pages = len(pdf_reader.pages)

        for page_num in range(1, num_pages + 1):  # Page numbers start from 1
            page_obj = pdf_reader.pages[page_num - 1]  # Adjust for zero-based indexing
            page_text = page_obj.extract_text()
            pdf_text_by_page[page_num] = page_text

    return pdf_text_by_page


# Function to extract text from Word files
def extract_docx_text_and_page_info(docx_path):
    doc = DocxDocument(docx_path)
    text_by_page = {}

    paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]  # Remove empty lines
    full_text = "\n".join(paragraphs)

    # Treat the document as a single "page"
    text_by_page[1] = full_text
    return text_by_page


In [23]:
#creating a FAISS index
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
index = faiss.IndexFlatL2(len(embeddings.embed_query("")))
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [24]:

# List your documents here (adjust paths as needed)
"""document_paths = [os.path.join("data", f) for f in os.listdir("data") if f.endswith((".pdf"))]"""

document_paths = [
    os.path.join("data", f) for f in os.listdir("data") if f.endswith((".pdf", ".docx"))
]


In [25]:
"""for pdf_file_path in document_paths:
    # Extract text and page info from the PDF
    pdf_data = extract_pdf_text_and_page_info(pdf_file_path)

    for key, value in pdf_data.items():
        docs = [Document(page_content=value, metadata={"page_number": key, "source": pdf_file_path})]
        embedding = embeddings.embed_query(value)
        normalized_embedding = normalize([embedding])[0]
        vector_store.add_documents(documents=docs, embeddings=[normalized_embedding])"""


for file_path in document_paths:
    # Extract text based on file type
    if file_path.endswith(".pdf"):
        file_data = extract_pdf_text_and_page_info(file_path)
    elif file_path.endswith(".docx"):
        file_data = extract_docx_text_and_page_info(file_path)

    # Process extracted text and add to vector store
    for page_num, text in file_data.items():
        docs = [Document(page_content=text, metadata={"page_number": page_num, "source": file_path})]
        embedding = embeddings.embed_query(text)
        normalized_embedding = normalize([embedding])[0]
        vector_store.add_documents(documents=docs, embeddings=[normalized_embedding])

In [26]:
def query_response(query):
    embedding = embeddings.embed_query(query)
    normalized_embedding = normalize([embedding])[0]
    #result_with_score = vector_store.similarity_search_with_score(query, k=1, embeddings=[normalized_embedding])
    result_with_score = vector_store.similarity_search_by_vector(normalized_embedding, k=1)
    prompt =f"""
    You are a helpful assistan. Your task is to understand the rag context along with the user query and then reply to the user based on the context.
    The user query is: {query}
    The context from doing rag based on the user query is: {result_with_score[0].page_content}
    Your task is to reply to the user based on the context and the user query.
    """
    output_response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=50
    )
    llm_response = output_response.choices[0].message.content
    return llm_response


In [28]:
#### RUN QUERY HERE ####
query = "Summarise Transcript of Mandar Bandve"
response = query_response(query)
print(response)  # Print the response from the model

The transcript of Mandar Badve's conversation primarily revolves around his experience and insights into using the autogen framework for building agent frameworks. Mandar has 14 years of experience as a software engineer and is currently focused on developing agent frameworks and tools using
