In [None]:
# 📁 Standard libraries
import os
import re
import json
import socket
import ipaddress

# 🧪 Third-party libraries
import numpy as np
from dotenv import load_dotenv 
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# 🌐 Selenium (Web Automation)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager

# 🧠 AI/NLP
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 🔍 Vector DB
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm



In [4]:

# Directory where your scraped text files are saved
data_dir = "scraped_pages"

# This will store all your documents
documents = []

# Loop through all text files in that folder
for filename in os.listdir(data_dir):
    if filename.endswith(".txt"):
        filepath = os.path.join(data_dir, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            raw_text = f.read().strip()
            
            # Split by lines
            lines = raw_text.split("\n")
            
            # Extract URL from the first line
            url = None
            if lines and lines[0].startswith("URL:"):
                url = lines[0].replace("URL:", "").strip()
                content = "\n".join(lines[1:]).strip()
            else:
                content = raw_text  # fallback if URL line not found
            
            # Save as structured dict
            documents.append({
                "filename": filename,
                "url": url,
                "content": content
            })

In [5]:
# documents

In [6]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

chunks = []

for doc in documents:
    split_chunks = splitter.split_text(doc["content"])
    for chunk in split_chunks:
        chunks.append({
            "chunk": chunk,
            "source": doc["filename"],
            "url": doc["url"]
        })

In [7]:
len(chunks)

7339

In [27]:

# ✅ Load the model
embedding_model = SentenceTransformer("intfloat/e5-base-v2")

# ✅ Prepare text with "passage: " prefix (required by E5)
texts_to_embed = [f"passage: {chunk['chunk']}" for chunk in chunks]

# ✅ Batch encode all 7,339 chunks at once
vectors = embedding_model.encode(
    texts_to_embed,
    batch_size=64,                 # ⚡ Increase if you have more RAM/GPU
    show_progress_bar=True,
    normalize_embeddings=True      # ✅ Normalize for cosine similarity
)

# ✅ Attach embeddings to your chunks
for i, vec in enumerate(vectors):
    chunks[i]["embedding"] = vec.tolist()  # Optional: convert to list if saving to JSON


Batches:   0%|          | 0/115 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:

load_dotenv(override=True)
# if not load_dotenv():
#     print("Warning: .env file not loaded. Make sure it exists.")

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Replace with your real values
PINECONE_API_KEY = PINECONE_API_KEY

if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY environment variable not set")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [None]:

# Initialize client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Configuration
index_name = "chatbot-index"
dimension = 768  # for E5-base-v2
metric = "cosine"

# ✅ Check if index exists, else create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(
            cloud="aws",             # Based on Pinecone environment info
            region="us-east-1"       # your region from Pinecone Console
        )
    )

# ✅ Connect to index
index = pc.Index(index_name)


In [None]:

batch_size = 100  # Upsert in batches (efficient & safe)
batch = []

for i, chunk in enumerate(tqdm(chunks)):
    vector_id = f"chunk-{i}"  # Unique ID for Pinecone

    vector = {
        "id": vector_id,
        "values": chunk["embedding"],  # The 768-dimensional vector from E5
        "metadata": {
            "text": chunk["chunk"],     # Original text chunk
            "source": chunk["source"],  # File or page name
            "url": chunk["url"]         # Optional if you have it
        }
    }

    batch.append(vector)

    # ⬆️ Upload in batches
    if len(batch) == batch_size or i == len(chunks) - 1:
        index.upsert(vectors=batch)
        batch = []  # Reset for next batch


100%|██████████| 7339/7339 [01:25<00:00, 85.72it/s] 


In [20]:
# Check index statistics
stats = index.describe_index_stats()
print(stats)


{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 7339}},
 'total_vector_count': 7339,
 'vector_type': 'dense'}


In [21]:
# Example: fetch first 3 vectors
ids_to_fetch = [f"chunk-{i}" for i in range(3)]

fetched = index.fetch(ids=ids_to_fetch)
print(fetched)


FetchResponse(namespace='', vectors={'chunk-0': Vector(id='chunk-0', values=[-0.0106099034, -0.00864486303, -0.0387433507, -0.043938525, 0.0633867756, -0.0390194878, 0.0066082906, 0.048261147, -0.0313188434, -0.0260302089, 0.0164223239, 0.0581865571, -0.0173711032, -0.00773743819, -0.0483874753, 0.0336582065, 0.0331584737, 0.022745844, 0.0498974845, -0.0338597298, -0.0186276268, -0.0377601311, 0.0167887714, 0.0185805745, -0.0166405775, 0.0276364014, 0.00794698764, 0.0644459426, -0.0754811838, -0.0448967293, 0.0353211276, 0.0438374244, -0.0116759185, -0.0616721176, -0.0451578908, 0.00647079479, -0.0518077575, -0.0151456399, -0.0379987471, -0.0182747412, 0.00688091805, -0.0344539, -0.0240712017, -0.0131164482, -0.0345140547, -0.0188526567, -0.0644629523, 0.0324888118, -0.0386358537, -0.0400037, -0.0221779328, 0.0447198711, 0.0410682708, 0.0119176675, -0.0240652524, 0.0281524453, -0.025440773, 0.0131647671, -0.035822086, -0.0175059903, 0.0604489222, 5.32358717e-05, 0.0549181849, 0.0242440

In [22]:
load_dotenv(override=True)

API_KEY = os.getenv("GROQ_API_KEY")
BASE_URL = "https://api.groq.com/openai/v1"

if not API_KEY:
    print("No API_Key found, Please set the API_KEY.")
    exit(1)
elif API_KEY.strip() != API_KEY:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [23]:
groq_client = OpenAI(api_key=API_KEY, base_url =BASE_URL)
MODEL = "llama3-70b-8192"

In [26]:
chat_history = []

model = SentenceTransformer("intfloat/e5-base-v2")

while True:
    query = input("You: ")
    if query.lower() == "exit":
        break
    
    query_vector = model.encode(f"query: {query}", normalize_embeddings=True).tolist()

    results = index.query(
        vector=query_vector,
        top_k=5,  # Get top 5 most relevant chunks
        include_metadata=True
    )

    context = ""
    for match in results["matches"]:
        context += match["metadata"]["text"].strip() + "\n---\n"

    
    # 🔁 Add memory to prompt
    history_string = ""
    for turn in chat_history[-3:]:  # Use last 3 Q&As (for brevity)
        history_string += f"User: {turn['question']}\nAssistant: {turn['answer']}\n"

    # Final prompt for the LLM
    prompt = f"""
    Use the following context to answer the question.

    Context:
    {context}

    Question: {query}
    Answer:
    """

    try:
        streamed_response = groq_client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant who answers clearly and concisely."},
                {"role": "user", "content": prompt}
            ],
            # stream=True   
        )
        answer = streamed_response.choices[0].message.content
        # return answer

        # Save to memory
        chat_history.append({"question": query, "answer": answer})
        
        print("\n🤖 Bot:", answer)

        # result = ""
        # for chunk in streamed_response:
        #     content_piece = chunk.choices[0].delta.content or ""
        #     result += content_piece
        #     cleaned_result = result.replace("```", "").replace("markdown", "")
        #     yield cleaned_result  # <- Streaming to Gradio

    except Exception as e:
        print(f"[LLM Error] {e}")



🤖 Bot: EverythingBegins With A Hello!
🤖 Bot: Based on the given context, the service offered by Microweb is:

* Hire a Bookkeeper Services
🤖 Bot: Based on the context, Microweb provides "Hire a Bookkeeper Services".
🤖 Bot: Based on the context, the services offered by this company are:

1. Organizational design
2. Change management
3. Culture transformation
4. Leadership development
5. Strategic planning
🤖 Bot: I'm happy to help! However, I notice that there is no specific question asked. The provided context appears to be a repetitive sequence of links and a input field with a message to leave it empty. Could you please clarify or provide an actual question related to this context? I'll do my best to assist you.
🤖 Bot: I'm happy to help! However, I notice that there is no specific question provided. The context appears to be a repeated pattern of website footer links and a input field with a placeholder text. Could you please clarify or provide the actual question you'd like me to ans