In [38]:
###Task 1: RAG QA Bot Implementation

In [76]:
!pip install -qU langchain-openai langchain-pinecone tiktoken fpdf
!pip install -q "unstructured[md]" pdf2image pypdf

In [77]:
import os
import time
from fpdf import FPDF
from google.colab import userdata
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from pinecone import Pinecone,ServerlessSpec

In [78]:
os.environ['PINECONE_API_KEY'] = userdata.get('PINECONE_API_KEY')
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [79]:
# SAMPLE DOCUMENT
def create_sample_policy_pdf():
    """Generate a sample business policy PDF"""
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Title
    pdf.set_font("Arial", 'B', 16)
    pdf.cell(200, 10, txt="GlobalTech Solutions - Company Policies", ln=1, align='C')
    pdf.ln(10)

    # Policies
    sections = [
        ("1. Refund Policy", [
            "1.1 Standard refund window: 30 days from purchase date",
            "1.2 Exceptions require manager approval (case-by-case basis)",
            "1.3 Refund processing time: 5-7 business days"
        ]),
        ("2. Shipping Policy", [
            "2.1 Standard shipping: 3-5 business days",
            "2.2 Express shipping: 1-2 business days ($15 extra)",
            "2.3 International shipping: 7-10 business days"
        ]),
        ("3. Employee Benefits", [
            "3.1 Health insurance: Cigna PPO plan",
            "3.2 Vacation: 15 days/year + public holidays",
            "3.3 Remote work: Up to 3 days/week after probation"
        ]),
        ("4. Contact Information", [
            "Support: support@globaltech.example (24/7)",
            "HR: hr@globaltech.example",
            "Office: 123 Tech Blvd, San Francisco"
        ]),
        ("5. Service Level Agreement", [
            "5.1 Uptime guarantee: 99.9% monthly availability",
            "5.2 Response time: <1 hour for critical issues",
            "5.3 Resolution time: <24 hours for P1 incidents"
        ])
    ]

    for title, content in sections:
        pdf.set_font("Arial", 'B', 14)
        pdf.cell(200, 10, txt=title, ln=1)
        pdf.set_font("Arial", size=12)
        for item in content:
            pdf.multi_cell(0, 8, txt=item)
        pdf.ln(5)

    pdf_file = "business_policies.pdf"
    pdf.output(pdf_file)
    return pdf_file

In [80]:
# DOCUMENT PROCESSING
def load_and_split_documents(file_path):
    """Load and split PDF documents"""
    loader = PyPDFLoader(file_path)
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True
    )
    return text_splitter.split_documents(docs)

In [85]:
# VECTOR DATABASE SETUP
def setup_vector_store(index_name, documents):
    """Create Pinecone vector store"""
    embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=1024,
    encoding_format="float"
    )
    return PineconeVectorStore.from_documents(
        documents,
        embeddings,
        index_name=index_name
    )


In [86]:
# QA SYSTEM SETUP
def create_qa_system(vector_store):
    """Create Retrieval QA system"""
    prompt_template = """You're a business assistant for GlobalTech Solutions.
    Answer questions using ONLY the provided context. Be professional and concise.
    If the answer isn't in the context, say "I don't have that information".

    Context:
    {context}

    Question: {question}
    Answer:"""

    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 4}),
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True
    )



In [87]:
# UTILITY FUNCTIONS
def format_source_documents(docs):
    """Format retrieved documents for display"""
    formatted = []
    for i, doc in enumerate(docs, 1):
        page = doc.metadata.get('page', 0) + 1
        content = doc.page_content.replace('\n', ' ')[:120] + "..."
        formatted.append(f"{i}. 📄 Page {page}: {content}")
    return "\n".join(formatted)

def get_followup_suggestion(query):
    """Generate context-aware follow-up suggestions"""
    query = query.lower()
    if "refund" in query:
        return "💡 Try asking: 'What are the refund exceptions?'"
    elif "shipping" in query:
        return "💡 Try asking: 'What are international shipping times?'"
    elif "employee" in query or "benefit" in query:
        return "💡 Try asking: 'What is our remote work policy?'"
    elif "contact" in query:
        return "💡 Try asking: 'How do I reach HR?'"
    elif "sla" in query or "service" in query:
        return "💡 Try asking: 'What's our uptime guarantee?'"
    return "💡 Try asking about our policies: refunds, shipping, or employee benefits"


In [88]:
# MAIN EXECUTION
if __name__ == "__main__":
    print("🚀 Initializing Business QA System...")

    # Step 1: sample document
    print("📄 Creating sample business policy document...")
    policy_pdf = create_sample_policy_pdf()
    print(f"✅ Created: {policy_pdf}")

    # Step 2: Initialize Pinecone
    print("🌲 Connecting to Pinecone vector database...")
    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

    # Step 3: Create/load index
    INDEX_NAME = "business-qa-rag"

    # Check if index exists
    if INDEX_NAME not in pc.list_indexes().names():
        print("🆕 Creating new Pinecone index...")
        pc.create_index(
            name=INDEX_NAME,
            dimension=1024,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
        print("⏳ Waiting for index initialization...")
        time.sleep(60)
    print(f"✅ Using index: {INDEX_NAME}")

    # Step 4: Process document
    print("🔍 Processing document...")
    document_chunks = load_and_split_documents(policy_pdf)
    print(f"📚 Loaded {len(document_chunks)} document chunks")

    # Step 5: Create vector store
    print("🧠 Creating vector store...")
    vector_db = setup_vector_store(INDEX_NAME, document_chunks)

    # Step 6: Create QA system
    print("⚙️ Initializing QA engine...")
    qa_bot = create_qa_system(vector_db)
    print("✅ System ready!\n")

    # CONVERSATION INTERFACE
    sample_questions = [
        "What's the standard refund window?",
        "How much does express shipping cost?",
        "What health insurance do we offer?",
        "How many vacation days do employees get?",
        "What's our uptime guarantee?",
        "Where is our office located?"
    ]

    print("="*60)
    print("🤖 Welcome to the GlobalTech Business Assistant!")
    print("="*60)
    print("\nTry these sample questions or ask your own:\n")
    for i, q in enumerate(sample_questions, 1):
        print(f"{i}. {q}")

    conversation_history = []

    while True:
        print("\n" + "-"*60)
        try:
            # Get user input
            query = input("\nYour business question (type 'exit' to quit): ").strip()

            # Exit condition
            if query.lower() in ['exit', 'quit', 'bye']:
                print("\n👋 Session ended. Have a productive day!")
                break

            if not query:
                print("⚠️ Please enter a question")
                continue

            # Process query
            start_time = time.time()
            result = qa_bot.invoke({"query": query})
            response_time = time.time() - start_time

            # Store conversation
            conversation_history.append({
                "query": query,
                "answer": result['result'],
                "sources": result['source_documents'],
                "response_time": f"{response_time:.2f}s"
            })

            # Display response
            print(f"\n🤖 ANSWER ({response_time:.2f}s): {result['result']}")

            # Show sources
            if result['source_documents']:
                print("\n🔍 SOURCES:")
                print(format_source_documents(result['source_documents']))

            # Follow-up suggestion
            print(f"\n{get_followup_suggestion(query)}")

        except KeyboardInterrupt:
            print("\n🛑 Session interrupted")
            break
        except Exception as e:
            print(f"\n❌ Error: {str(e)}")
            print("Please try a different question or rephrase")

    # Optional: Show conversation history
    if conversation_history:
        print("\n📜 Conversation History:")
        for i, exchange in enumerate(conversation_history, 1):
            print(f"\n{i}. Q: {exchange['query']}")
            print(f"   A: {exchange['answer']} ({exchange['response_time']})")

🚀 Initializing Business QA System...
📄 Creating sample business policy document...
✅ Created: business_policies.pdf
🌲 Connecting to Pinecone vector database...
✅ Using index: business-qa-rag
🔍 Processing document...
📚 Loaded 1 document chunks
🧠 Creating vector store...


                    encoding_format was transferred to model_kwargs.
                    Please confirm that encoding_format is what you intended.


⚙️ Initializing QA engine...
✅ System ready!

🤖 Welcome to the GlobalTech Business Assistant!

Try these sample questions or ask your own:

1. What's the standard refund window?
2. How much does express shipping cost?
3. What health insurance do we offer?
4. How many vacation days do employees get?
5. What's our uptime guarantee?
6. Where is our office located?

------------------------------------------------------------

Your business question (type 'exit' to quit): What's the standard refund window?

🤖 ANSWER (1.57s): I don't have that information.

💡 Try asking: 'What are the refund exceptions?'

------------------------------------------------------------

Your business question (type 'exit' to quit): What health insurance do we offer?

🤖 ANSWER (2.86s): GlobalTech Solutions offers a Cigna PPO plan for health insurance.

🔍 SOURCES:
1. 📄 Page 1.0: GlobalTech Solutions - Company Policies 1. Refund Policy 1.1 Standard refund window: 30 days from purchase date 1.2 Exce...

💡 Try askin