In [5]:
# Install requirements
!pip install "langchain>=0.2" "langchain-astradb>=0.4" \
    "langchain-openai>=0.1" "datasets>=3.0" "pypdf>=5.0" \
    "python-dotenv>=1.0"
!pip install unstructured
!pip install python-dotenv



In [6]:
from datasets import load_dataset

import os
import requests
from bs4 import BeautifulSoup
from getpass import getpass

from astrapy.db import AstraDB
import openai

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import AstraDB as AstraVectorStore
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

from langchain_astradb import AstraDBVectorStore
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings


In [7]:
import os
from dotenv import load_dotenv
from google.colab import drive

drive.mount('/content/drive')


env_file = "/content/drive/MyDrive/config/astra_config.env"

load_dotenv(env_file)

ASTRA_DB_APPLICATION_TOKEN = os.environ.get("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_API_ENDPOINT = os.environ.get("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_KEYSPACE = os.environ.get("ASTRA_DB_KEYSPACE")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")



Mounted at /content/drive


In [8]:
# initiate astra db client
astra_db = AstraDB(
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

In [9]:
# create a collection
coll_name = "k8s_coll"
collection = astra_db.create_collection(coll_name, dimension=1536)

In [10]:
# scrape the documentation website
base_url = "https://kubernetes.io/docs/home/"
response = requests.get(base_url)
if response.status_code != 200:
    raise Exception("Failed to retrieve the base documentation page.")

soup = BeautifulSoup(response.text, "html.parser")

In [11]:
# This is a simple approach: find all <a> tags and keep those that lead to docs pages.
# You may need to refine this logic depending on the site's structure.
links = []
for a_tag in soup.find_all("a", href=True):
    href = a_tag['href']
    # Normalize and filter links
    # If href is relative, construct absolute URL
    if href.startswith("/"):
        full_url = base_url.rstrip("/") + href
    elif href.startswith(base_url):
        full_url = href
    else:
        # Skip external links
        continue

    # Basic filtering: skip fragments and duplicates
    if "#" in full_url:
        full_url = full_url.split("#")[0]

    if full_url not in links and full_url.startswith(base_url):
        links.append(full_url)

    print(full_url)
#links = ['http://docs.cast.ai']
print(f"Found {len(links)} potential links from the main page.")
print(links)

https://kubernetes.io/docs/home/
https://kubernetes.io/docs/home/docs/
https://kubernetes.io/docs/home/blog/
https://kubernetes.io/docs/home/training/
https://kubernetes.io/docs/home/partners/
https://kubernetes.io/docs/home/community/
https://kubernetes.io/docs/home/case-studies/
https://kubernetes.io/docs/home/releases
https://kubernetes.io/docs/home/
https://kubernetes.io/docs/home/bn/docs/home/
https://kubernetes.io/docs/home/zh-cn/docs/home/
https://kubernetes.io/docs/home/fr/docs/home/
https://kubernetes.io/docs/home/de/docs/home/
https://kubernetes.io/docs/home/hi/docs/home/
https://kubernetes.io/docs/home/id/docs/home/
https://kubernetes.io/docs/home/it/docs/home/
https://kubernetes.io/docs/home/ja/docs/home/
https://kubernetes.io/docs/home/ko/docs/home/
https://kubernetes.io/docs/home/pl/docs/home/
https://kubernetes.io/docs/home/pt-br/docs/home/
https://kubernetes.io/docs/home/ru/docs/home/
https://kubernetes.io/docs/home/es/docs/home/
https://kubernetes.io/docs/home/uk/docs/

In [12]:
# ====== Load Documents from Extracted URLs ======
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
from langchain.document_loaders import UnstructuredURLLoader

loader = UnstructuredURLLoader(urls=links)
docs = loader.load()

print(f"Number of documents loaded: {len(docs)}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Number of documents loaded: 794


In [13]:
# split documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs_split = text_splitter.split_documents(docs)
print(f"Number of chunks: {len(docs_split)}")

Number of chunks: 796


In [17]:
embedding = OpenAIEmbeddings()

vectorstore = AstraVectorStore.from_documents(
    documents=docs_split,
    embedding=embedding,
    token=ASTRA_DB_APPLICATION_TOKEN,
    astra_client=astra_db,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    keyspace=ASTRA_DB_KEYSPACE,
    collection_name=coll_name,
)

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":3})

llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)







In [18]:
# define a safe query function
def ask_question(query: str):
    response = qa_chain({"query": query})
    if not response["source_documents"]:
        return "answer not found"
    if not response["result"].strip():
        return "answer not found"
    return response["result"]

In [20]:
# example usage
user_query = input("What would you like to ask about CAST AI?")
answer = qa_chain.invoke({"query": user_query})

print(answer['result'])


What would you like to ask about CAST AI?how to configure node affinity
To configure node affinity in Kubernetes, you can use the following steps:

1. Define a node selector for your pod by adding the `nodeSelector` field to your pod specification. This field specifies a map of key-value pairs that must match the labels on a node for the pod to be scheduled on that node.

2. Use the `kubectl label nodes` command to add labels to your nodes. For example, you can label a node with `kubectl label nodes <node-name> <label-key>=<label-value>`.

3. Update your pod specification to include the node selector that matches the labels you set on your nodes. For example:
   ```yaml
   spec:
     nodeSelector:
       <label-key>: <label-value>
   ```

4. Apply the updated pod specification using `kubectl apply -f <pod-spec-file.yaml>`.

By following these steps, you can configure node affinity for your pods in Kubernetes.
