# **K12 Bot Experiment 01**

In [None]:
# Install the necessary libraries.
# Every line uses `--upgrade` with no version pin, so each run installs
# whatever release is newest at that moment.


%pip install --upgrade --quiet sentence_transformers
%pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-google-genai langchain-chroma bs4 boto3
# NOTE(review): `pinecone-client` is requested here and `pinecone[grpc]` in the last line — confirm both are really needed.
%pip install --upgrade --quiet langchain-aws pinecone-client
%pip install --upgrade --quiet langgraph langsmith langchain_anthropic
# NOTE(review): sentence-transformers was already requested by the first install line (spelled with an underscore there).
%pip install --upgrade --quiet sentence-transformers langchain_groq
%pip install --upgrade --quiet "pinecone[grpc]"

## **Load All Secret Keys:**

In [2]:
from google.colab import userdata
import os
import time

# Copy each Colab secret into an environment variable.
# Mapping: environment-variable name -> Colab secret name (note that the
# Gemini secret is exported under GOOGLE_API_KEY).
for env_name, secret_name in {
    'GOOGLE_API_KEY': 'GEMINI_API_KEY',
    'GROQ_API_KEY': 'GROQ_API_KEY',
    'HF_TOKEN': 'HF_TOKEN',
    'TAVILY_API_KEY': 'TAVILY_API_KEY',
    'PINECONE_API_KEY': 'PINECONE_API_KEY',
}.items():
  os.environ[env_name] = userdata.get(secret_name)

# Fixed value, not a secret.
os.environ['region'] = 'us-east-1'

## **Load Embeddings:**

In [None]:
# Build the embedding model that VectorStore uses to embed document text.
# HuggingFaceEmbeddings is imported from langchain_community (installed in the
# setup cell) because the `langchain.embeddings` path is only a deprecated
# re-export and is not guaranteed to exist in newer langchain releases.

from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

In [None]:
# Sanity check: length of one embedding vector. The Pinecone index in this
# notebook is created with dimension=384, so this is expected to print 384.
len(embeddings.embed_query("Hello world"))

384

## **Create Pinecone Serverless Index:**

In [3]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

In [6]:
# Client is created without arguments; PINECONE_API_KEY was placed in
# os.environ by the secrets cell earlier in the notebook.
pc = Pinecone()

# Check whether the "k12test" index already exists.
pc.has_index(name="k12test")

True

In [None]:
# Create Index:
# The index is created only when it does not exist yet, so re-running this
# cell (or Restart & Run All) does not fail on an already-existing index.

pc = Pinecone()

if not pc.has_index(name="k12test"):
  pc.create_index(
    name="k12test",
    dimension=384,  # length of the all-MiniLM-L6-v2 embedding vectors
    metric="cosine",
    spec=ServerlessSpec(
      cloud="aws",
      region="us-east-1"
    ),
    deletion_protection="disabled" # enabled means index never deleted, disabled means index can be deleted.
  )

## **Data Ingestion:**

In [None]:
%pip install --upgrade --quiet unstructured[pdf] pypdfium2 pymongo

In [None]:
from langchain_community.document_loaders import PDFPlumberLoader, PyPDFium2Loader
from pathlib import Path
from pymongo import MongoClient
from pinecone.grpc import PineconeGRPC as Pinecone
from IPython.display import display, Markdown

### **Document Loaders:**

In [None]:
# Class Document Loader:
class DocumentLoader:
  """Loads PDF files through PDFPlumberLoader."""

  def __init__(self):
    """Nothing to configure."""

  def load_pdfs(self, file_path):
    """Load the PDF at `file_path`.

    Returns:
      A tuple `(documents, count)` where `documents` is whatever
      `PDFPlumberLoader(file_path).load()` returns and `count` is its length
      (the ingestion pipeline treats this count as the number of pages).

    Raises:
      Any exception raised by the loader is propagated unchanged.
    """
    pages = PDFPlumberLoader(file_path).load()
    return pages, len(pages)

### **Data Preprocessing:**

In [None]:
# Class Data Preprocessing:
import re

class DataPreprocessing:
  """Cleans page text and flattens loaded documents into plain dict records."""

  def __init__(self):
    """Stateless; nothing to set up."""

  def clean_texts(self, text):
    """Return `text` with newlines/tabs and repeated whitespace collapsed.

    Every run of whitespace becomes a single space and the result is
    stripped at both ends.
    """
    without_breaks = re.sub(r'[\n\t\r]+', ' ', text)
    return re.sub(r'\s+', ' ', without_breaks).strip()

  def preprocessed(self, documents, board:str, language:str, cls:str, subject:str, subject_part:int, author:str="Admin", tags:list=None):
    """Build one record per document.

    Each record carries the catalogue attributes passed in (board, language,
    class, subject, subject part, author, tags), the `source`, `file_path`,
    `page` and `total_pages` entries of the document's metadata (defaulting
    to '' or 0 when absent), and the cleaned page text under "content".

    Returns:
      A list of dicts, in the same order as `documents`.
    """
    def build_record(doc):
      # Clean first, then read metadata (same order as the field list below)
      page_text = self.clean_texts(text=doc.page_content)
      meta = doc.metadata
      return {
          "source": meta.get('source', ''),
          "file_path": meta.get('file_path', ''),
          "board": board,
          "language": language,
          "class": cls,
          "subject": subject,
          "subject_part": subject_part,
          "author": author,
          "tags": tags,
          "page": meta.get('page', 0),
          "total_pages": meta.get('total_pages', 0),
          "content": page_text,
      }

    return [build_record(doc) for doc in documents]


### **Store into MongoDB:**

In [None]:
# Store Data into MongoDB:

class DBOperation:
  """Writes preprocessed records into the `k12chatbot` MongoDB database."""

  def __init__(self):
    self.DB = "k12chatbot"
    # Connection string is read from the Colab secret store
    self.MONGODB_URI = userdata.get('MONGODB_URI')

  def store_data(self, collection_name, data):
    """Insert `data` into the collection `collection_name`.

    Args:
      collection_name: Name of the target collection inside `self.DB`.
      data: Documents passed as-is to `insert_many`.

    Raises:
      Any exception raised while connecting or inserting is propagated
      unchanged; the client is closed in every case.
    """
    client = MongoClient(self.MONGODB_URI)
    try:
      client[self.DB][collection_name].insert_many(data)
    finally:
      # Close even when the insert fails so the connection is not leaked
      client.close()

### **Vector Store:**

In [None]:
# Store Embeddings into Pinecone DB:

class VectorStore:
  """Embeds preprocessed records and upserts them into the `k12test` Pinecone index."""

  # Record fields copied into each vector's metadata, in this order.
  _METADATA_KEYS = (
      "source", "file_path", "board", "language", "class", "subject",
      "subject_part", "author", "tags", "page", "total_pages", "content",
  )

  # Record fields that make up the vector id (followed by the record's position).
  _ID_KEYS = ("board", "language", "class", "subject", "subject_part")

  def __init__(self):
    self.PINECONE_INDEX_NAME = "k12test"
    self.PC = Pinecone()

  @staticmethod
  def _metadata_value(value):
    """Return `value`, or the placeholder string "None" when it is missing or empty.

    Numbers are always kept as-is, so page 0 (or a count of 0) is stored as
    the number 0 instead of being replaced by the placeholder.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
      return value
    return value if value else "None"

  def store_embeddings(self, docs, namespace, embeddings=embeddings, batch_size=100):
    """Embed each record's content and upsert the vectors into `namespace`.

    Args:
      docs: Records as produced by `DataPreprocessing.preprocessed` (dicts
        containing every key in `_METADATA_KEYS`).
      namespace: Pinecone namespace that receives the vectors.
      embeddings: Object exposing `embed_query(text)`; defaults to the
        notebook-level `embeddings` that existed when this class was defined.
      batch_size: Maximum number of vectors sent per upsert request. Sending
        the records in batches keeps each request small; upserts are keyed
        by id, so re-running after a partial failure overwrites the vectors
        that were already written.

    Raises:
      ValueError: If `batch_size` is smaller than 1.
      Any exception from the embedding model or Pinecone is propagated.
    """
    if batch_size < 1:
      raise ValueError("batch_size must be a positive integer")

    PINECONE_INDEX = self.PC.Index(self.PINECONE_INDEX_NAME)
    batch = []

    for i, doc in enumerate(docs):
      metadata = {key: self._metadata_value(doc[key]) for key in self._METADATA_KEYS}

      # str() on every part so a non-string attribute cannot break the join
      id_ = "_".join([str(doc[key]) for key in self._ID_KEYS] + [str(i)])

      batch.append({
          'id': id_,
          'values': embeddings.embed_query(doc["content"]),
          'metadata': metadata
      })

      if len(batch) == batch_size:
        PINECONE_INDEX.upsert(vectors=batch, namespace=namespace)
        batch = []

    # Flush the final, partially filled batch
    if batch:
      PINECONE_INDEX.upsert(vectors=batch, namespace=namespace)

### **Main: Data Ingestion:**

In [None]:
class DataIngestion:
  """Runs the full pipeline: load PDF, preprocess, store in MongoDB, store in Pinecone."""

  def __init__(self):
    """Nothing to configure."""

  def data_ingestion(self, file_path, board, language, cls, subject, subject_part, author, tags):
    """Ingest one PDF and print progress plus the total execution time.

    `cls` is used three times: as a record attribute, as the MongoDB
    collection name and as the Pinecone namespace. Exceptions from any
    stage are propagated unchanged; later stages are then not run.
    """
    started_at = time.time()

    # Document loader (the returned count is not needed here)
    documents, _ = DocumentLoader().load_pdfs(file_path=file_path)
    print("Document Loading Complete")

    # Data preprocessing
    records = DataPreprocessing().preprocessed(
        documents=documents, board=board, language=language, cls=cls,
        subject=subject, subject_part=subject_part, author=author, tags=tags)
    print("Data Preprocessing Complete")

    # MongoDB: one collection per class
    DBOperation().store_data(collection_name=cls, data=records)
    print("Data Stored into MongoDB")

    # Vector database: one namespace per class
    VectorStore().store_embeddings(docs=records, namespace=cls)
    print("Data Stored into Vector Database")

    execution_time = time.time() - started_at
    print(f"Execution time: {execution_time:.2f} seconds")

#### **Start the Process:**

In [None]:
# Pipeline object reused by the ingestion call in the next cell.
data_ingestion = DataIngestion()

In [None]:
# Ingest k10_science.pdf together with its catalogue attributes.
# `cls` also names the MongoDB collection and the Pinecone namespace.
data_ingestion.data_ingestion(
    file_path="/content/K10/k10_science.pdf",
    board="CBSE",
    language="English",
    cls="K10",
    subject="Science",
    subject_part=1,
    author="Admin",
    tags=None,
)

# k10_science pending.