In [None]:
# 📁 Standard libraries
import os
import re
import json
import socket
import ipaddress

# 🧪 Third-party libraries
import numpy as np
from dotenv import load_dotenv 

# 🧠 AI/NLP
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 🔍 Vector DB
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm


from langchain.vectorstores import Pinecone as LangchainPinecone
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

from langchain_groq import ChatGroq


In [2]:
load_dotenv(override=True)

API_KEY = os.getenv("GROQ_API_KEY")
BASE_URL = os.getenv("GROQ_BASE_URL")

client = OpenAI(api_key=API_KEY, base_url =BASE_URL)
MODEL = "llama3-70b-8192"

if not API_KEY:
    print("No API_Key found, Please set the API_KEY.")
    exit(1)
elif API_KEY.strip() != API_KEY:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [3]:

load_dotenv(override=True)
# if not load_dotenv():
#     print("Warning: .env file not loaded. Make sure it exists.")

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Replace with your real values
PINECONE_API_KEY = PINECONE_API_KEY

if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY environment variable not set")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [4]:

# Directory where your scraped text files are saved
data_dir = "scraped_pages"

# This will store all your documents
documents = []

# Loop through all text files in that folder
for filename in os.listdir(data_dir):
    if filename.endswith(".txt"):
        filepath = os.path.join(data_dir, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            raw_text = f.read().strip()
            
            # Split by lines
            lines = raw_text.split("\n")
            
            # Extract URL from the first line
            url = None
            if lines and lines[0].startswith("URL:"):
                url = lines[0].replace("URL:", "").strip()
                content = "\n".join(lines[1:]).strip()
            else:
                content = raw_text  # fallback if URL line not found
            
            # Save as structured dict
            documents.append({
                "filename": filename,
                "url": url,
                "content": content
            })

In [5]:
# documents

In [6]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

chunks = []

for doc in documents:
    split_chunks = splitter.split_text(doc["content"])
    for chunk in split_chunks:
        chunks.append({
            "chunk": chunk,
            "source": doc["filename"],
            "url": doc["url"]
        })

In [7]:
len(chunks)

7733

In [8]:

# ✅ Load the model
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# ✅ Prepare text with "passage: " prefix (required by E5)
texts_to_embed = [f"passage: {chunk['chunk']}" for chunk in chunks]

# ✅ Batch encode all 7,339 chunks at once
vectors = embedding_model.encode(
    texts_to_embed,
    batch_size=64,                 # ⚡ Increase if you have more RAM/GPU
    show_progress_bar=True,
    normalize_embeddings=True      # ✅ Normalize for cosine similarity
)

# ✅ Attach embeddings to your chunks
for i, vec in enumerate(vectors):
    chunks[i]["embedding"] = vec.tolist()  # Optional: convert to list if saving to JSON


Batches:   0%|          | 0/121 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:

# Initialize client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Configuration
index_name = "chatbot-index"
dimension = 768  # for E5-base-v2
metric = "cosine"

# ✅ Check if index exists, else create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(
            cloud="aws",             # Based on Pinecone environment info
            region="us-east-1"       # your region from Pinecone Console
        )
    )

# ✅ Connect to index
index = pc.Index(index_name)


In [None]:

batch_size = 100  # Upsert in batches (efficient & safe)
batch = []

for i, chunk in enumerate(tqdm(chunks)):
    vector_id = f"chunk-{i}"  # Unique ID for Pinecone

    vector = {
        "id": vector_id,
        "values": chunk["embedding"],  # The 768-dimensional vector from E5
        "metadata": {
            "text": chunk["chunk"],     # Original text chunk
            "source": chunk["source"],  # File or page name
            "url": chunk["url"]         # Optional if you have it
        }
    }

    batch.append(vector)

    # ⬆️ Upload in batches
    if len(batch) == batch_size or i == len(chunks) - 1:
        index.upsert(vectors=batch)
        batch = []  # Reset for next batch


  9%|▉         | 699/7733 [00:09<01:30, 77.45it/s]


KeyboardInterrupt: 

In [None]:
# Check index statistics
stats = index.describe_index_stats()
print(stats)


{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 7600}},
 'total_vector_count': 7600,
 'vector_type': 'dense'}


In [None]:
# Example: fetch first 3 vectors
ids_to_fetch = [f"chunk-{i}" for i in range(3)]

fetched = index.fetch(ids=ids_to_fetch)
print(fetched)


FetchResponse(namespace='', vectors={'chunk-1': Vector(id='chunk-1', values=[0.0690192357, -0.0522028357, -0.0558619685, -0.0108925952, 0.00571820186, 0.0129024163, 0.0935651287, 0.00927436724, 0.00907278527, -0.0425153635, 0.0623959862, 0.0183351319, 0.0384162441, 0.0992040858, 0.0649987, -0.0397860706, 0.0496518835, 0.00248506013, -0.0583570376, 0.0524932332, -0.00204072683, 0.0224172343, -0.00616179919, 0.0306997336, 0.0132681383, -0.035977453, 0.0447034128, -0.00200463249, 0.0126982527, -0.0428372547, 0.0303309374, -0.0539201312, -0.0118305786, -0.0378958, 2.17446109e-06, 0.00253964891, -0.0290798191, -0.00328154466, -0.020537002, -0.041252844, 0.063085027, 0.0440394953, -0.00146796741, 0.0590871908, 0.00810413808, -0.0564720519, 0.0383002944, 0.0452551618, -0.0238774922, 0.0251167528, 0.00609636214, -0.00948486291, -0.0408278294, 0.00341522531, -0.0738738105, 0.0588431656, -0.0119770756, 0.000610164891, 0.0543838255, 0.0346503854, 0.0208726507, -0.00741212582, -0.00522405049, 0.01

In [None]:

# Initialize the embedding function
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# Connect LangChain to the Pinecone index
vector_store = LangchainPinecone(
    index_name="chatbot-index",
    embedding_function=embeddings,
    pinecone_api_key=PINECONE_API_KEY
)

retriever = vector_store.as_retriever(search_kwargs={"k": 3})  # k: number of docs returned
