In [None]:
# pip install openai pinecone-client tqdm sentence-transformers

In [None]:
# 📁 Standard libraries
import os
import re
import json
import socket
import ipaddress

# 🧪 Third-party libraries
import numpy as np
from dotenv import load_dotenv 
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# 🌐 Selenium (Web Automation)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager

# 🧠 AI/NLP
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 🔍 Vector DB
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm



In [None]:

# Directory where your scraped text files are saved
data_dir = "scraped_pages"

# This will store all your documents as dicts with the keys
# "filename", "url" (None when the file has no URL header) and "content".
documents = []

# Loop through all text files in that folder.
# The listing is sorted because os.listdir returns entries in arbitrary order;
# a stable document order keeps the position-based vector IDs generated later
# ("chunk-{i}") reproducible across runs.
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith(".txt"):
        continue

    filepath = os.path.join(data_dir, filename)
    with open(filepath, "r", encoding="utf-8") as f:
        raw_text = f.read().strip()

    # The scraper is expected to write "URL: <address>" as the first line.
    first_line, _newline, remainder = raw_text.partition("\n")

    if first_line.startswith("URL:"):
        # Strip only the leading marker; str.replace would also remove any
        # later "URL:" occurrence inside the address itself.
        url = first_line[len("URL:"):].strip()
        content = remainder.strip()
    else:
        # Fallback if the URL line is not found: keep the whole text.
        url = None
        content = raw_text

    # Save as structured dict
    documents.append({
        "filename": filename,
        "url": url,
        "content": content
    })

In [None]:
# documents

In [None]:
# Split every document into overlapping pieces (500 characters, 100 overlap).
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# One record per piece, carrying the originating file name and URL so each
# chunk can be traced back to its source document.
chunks = [
    {
        "chunk": piece,
        "source": doc["filename"],
        "url": doc["url"]
    }
    for doc in documents
    for piece in splitter.split_text(doc["content"])
]

In [None]:
# Sanity check: total number of chunk records built from the documents.
len(chunks)

In [None]:

# ✅ Load the embedding model.
# The passages indexed here and the user queries embedded at chat time must
# come from the SAME model: vectors from two different models are not
# comparable even when both are 768-dimensional. The chat cell embeds queries
# with "intfloat/e5-base-v2" and the E5 "query: " prefix, so the passages are
# embedded with that same model and the matching "passage: " prefix.
embedding_model = SentenceTransformer("intfloat/e5-base-v2")

# ✅ Prepare text with "passage: " prefix (required by E5)
texts_to_embed = [f"passage: {chunk['chunk']}" for chunk in chunks]

# ✅ Batch encode all chunks in one call
vectors = embedding_model.encode(
    texts_to_embed,
    batch_size=64,                 # ⚡ Increase if you have more RAM/GPU
    show_progress_bar=True,
    normalize_embeddings=True      # ✅ Normalize for cosine similarity
)

# ✅ Attach embeddings to your chunks (as plain lists so they are
# JSON-serialisable and accepted as vector values when upserting)
for chunk_record, vec in zip(chunks, vectors):
    chunk_record["embedding"] = vec.tolist()


In [None]:

# Load variables from the local .env file; override=True lets values from the
# file take precedence over ones already set in the environment.
load_dotenv(override=True)

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Fail fast when the key is missing: the Pinecone client cannot be created
# without it.
if PINECONE_API_KEY:
    print("API key found and looks good so far!")
else:
    raise ValueError("PINECONE_API_KEY environment variable not set")


In [None]:

# Pinecone client authenticated with the key loaded above
pc = Pinecone(api_key=PINECONE_API_KEY)

# Index settings
index_name = "chatbot-index"
dimension = 768  # must equal the embedding vector length (768 for E5-base-v2)
metric = "cosine"

# ✅ Create the serverless index only when it does not exist yet
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(
        cloud="aws",             # Based on Pinecone environment info
        region="us-east-1"       # your region from Pinecone Console
    )
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=serverless_spec
    )

# ✅ Handle used by the following cells to upsert, fetch and query vectors
index = pc.Index(index_name)


In [None]:

batch_size = 100  # Upsert in batches (efficient & safe)
batch = []

for i, chunk in enumerate(tqdm(chunks)):
    metadata = {
        "text": chunk["chunk"],     # Original text chunk
        "source": chunk["source"],  # File or page name
    }
    # Documents without a "URL:" header carry url=None. Pinecone does not
    # accept null metadata values, so the key is left out instead of being
    # sent as None (which would make the whole upsert request fail).
    if chunk["url"] is not None:
        metadata["url"] = chunk["url"]

    batch.append({
        "id": f"chunk-{i}",            # Unique ID for Pinecone
        "values": chunk["embedding"],  # Embedding computed for this chunk
        "metadata": metadata
    })

    # ⬆️ Upload every time a full batch has accumulated
    if len(batch) == batch_size:
        index.upsert(vectors=batch)
        batch = []  # Reset for next batch

# ⬆️ Upload whatever is left over after the last full batch
if batch:
    index.upsert(vectors=batch)
    batch = []


In [None]:
# Check index statistics (e.g. to confirm the upsert above landed).
stats = index.describe_index_stats()
print(stats)


In [None]:
# Example: fetch the first 3 vectors by ID (chunk-0, chunk-1, chunk-2) to
# spot-check what is stored in the index.
ids_to_fetch = [f"chunk-{i}" for i in range(3)]

# Retrieve those records from the index and print the raw response.
fetched = index.fetch(ids=ids_to_fetch)
print(fetched)


In [None]:
# Reload the .env file so the Groq credentials are available even when this
# cell is run on its own.
load_dotenv(override=True)

API_KEY = os.getenv("GROQ_API_KEY")
BASE_URL = "https://api.groq.com/openai/v1"  # Groq's OpenAI-compatible endpoint

if not API_KEY:
    # Raise instead of calling exit(1): inside a notebook exit() acts on the
    # kernel session rather than just failing this cell, and raising matches
    # how the missing Pinecone key is reported. The message names the actual
    # environment variable that has to be set.
    raise ValueError("GROQ_API_KEY environment variable not set")
elif API_KEY.strip() != API_KEY:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


In [None]:
# Chat model requested from Groq in every completion call
MODEL = "llama3-70b-8192"

# The OpenAI SDK client is reused as-is, only pointed at Groq's base URL
groq_client = OpenAI(
    base_url=BASE_URL,
    api_key=API_KEY,
)

In [None]:
# Interactive retrieval-augmented chat loop: embed the question, retrieve the
# closest chunks from Pinecone, and ask the LLM to answer from that context.
# Type "exit" to stop.
chat_history = []

# Query-side embedding model. E5 models expect a "query: " prefix on search
# queries. NOTE: retrieval is only meaningful when the passages stored in the
# index were embedded with this same model.
model = SentenceTransformer("intfloat/e5-base-v2")

TOP_K = 10          # number of chunks retrieved from Pinecone per question
HISTORY_TURNS = 3   # number of previous Q&A pairs replayed to the LLM

while True:
    # Strip the input so "exit " also leaves the loop, and skip blank lines
    # instead of spending a retrieval + LLM call on an empty question.
    query = input("You: ").strip()
    if not query:
        continue
    if query.lower() == "exit":
        break

    query_vector = model.encode(f"query: {query}", normalize_embeddings=True).tolist()

    results = index.query(
        vector=query_vector,
        top_k=TOP_K,  # Get the TOP_K most relevant chunks
        include_metadata=True
    )

    # Concatenate the retrieved chunk texts, separated by a visible delimiter
    context = ""
    for match in results["matches"]:
        context += match["metadata"]["text"].strip() + "\n---\n"

    # 🔁 Conversation memory: replay the last few Q&As so follow-up questions
    # ("and what about ...?") can be interpreted.
    history_string = ""
    for turn in chat_history[-HISTORY_TURNS:]:
        history_string += f"User: {turn['question']}\nAssistant: {turn['answer']}\n"
    history_block = history_string if history_string else "(none)\n"

    # Final prompt for the LLM. The history is actually inserted here; it was
    # previously built but never sent, so the bot had no memory.
    prompt = f"""
    You are a highly accurate and honest assistant.
    Use ONLY the context below to answer the question.
    If the answer is not found in the context, say "I don't know."
    The previous conversation is provided only to help interpret follow-up questions.

    Previous conversation:
    {history_block}
    Context:
    {context}

    Question: {query}
    Answer:
    """

    try:
        response = groq_client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant who answers clearly and concisely."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0,
        )
        answer = response.choices[0].message.content

        # Save to memory
        chat_history.append({"question": query, "answer": answer})

        print("\n🤖 Bot:", answer)

    except Exception as e:
        # Best effort: report the failure and keep the chat session alive
        print(f"[LLM Error] {e}")



In [None]:
# def get_response(query: str, index, chat_history: list) -> str:
#     """Encode query, retrieve context, call LLM, and return answer."""
#     query_vec = model.encode(f"query: {query}", normalize_embeddings=True).tolist()
#     results = index.query(vector=query_vec, top_k=5, include_metadata=True)
#     context = "\n---\n".join(match["metadata"]["text"] for match in results["matches"])
#     prompt = f"Use the following context to answer the question:\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"
#     try:
#         resp = groq_client.chat.completions.create(
#             model=MODEL,
#             messages=[{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": prompt}]
#         )
#         answer = resp.choices[0].message.content.strip()
#     except Exception as e:
#         answer = f"An error occurred: {e}"
#     chat_history.append({"question": query, "answer": answer})
#     return answer

# # Usage:
# chat_history = []
# while True:
#     query = input("You: ")
#     if query.lower() == "exit":
#         break
#     answer = get_response(query, index, chat_history)
#     print("🤖 Bot:", answer)
