### Dependencies

- langchain_google_genai
- langchain
- langchain_core
- time
- dotenv
- pprint
- datasets
- typing_extensions
- typing
- IPython
- ragas
- langgraph
- tiktoken
- re
- PyPDF2
- pylcs
- pandas
- textwrap
- markdown

In [None]:
# INSTALL DEPENDENCIES

!pip install langchain_google_genai langchain langchain_core python-dotenv datasets typing_extensions IPython ragas langgraph tiktoken PyPDF2 pylcs pandas langchain_groq langchain_mongodb markdown

In [15]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_groq import ChatGroq
from langchain.document_loaders import PyPDFLoader
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputKeyToolsParser
from langchain_core.runnables.graph import MermaidDrawMethod

from langgraph.graph import END, StateGraph

from dotenv import load_dotenv
from pprint import pprint
import os
from datasets import Dataset
from typing_extensions import TypedDict
from IPython.display import display, Image
from typing import List, TypedDict

from ragas import evaluate
from ragas.metrics import (
    answer_correctness,
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_similarity
)

import langgraph

### Helper functions for notebook
from helper_functions import num_tokens_from_string, replace_t_with_space, replace_double_lines_with_one_line, split_into_chapters,\
analyse_metric_results, escape_quotes, text_wrap,extract_book_quotes_as_documents

# Load variables from a local .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

True

### Setting GEMINI and GROQ API keys

In [None]:
# Read the API keys from the environment (populated by load_dotenv above).
# Either value is None when the corresponding variable is not defined.
gemini_api_key = os.getenv("GEMINI_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")

if gemini_api_key:
    # GEMINI_API_KEY is already in os.environ (that is where it was just read
    # from), so re-assigning it is unnecessary. The LangChain Google GenAI
    # integration documents GOOGLE_API_KEY as its environment variable, so
    # mirror the key there without clobbering a value the user set explicitly.
    os.environ.setdefault("GOOGLE_API_KEY", gemini_api_key)
else:
    # Assigning None to os.environ raises TypeError; warn instead so the cells
    # that do not need an API key (e.g. the placeholder pipeline) still run.
    print("Warning: GEMINI_API_KEY is not set; Gemini model calls will fail.")

## Data preprocessing

### Define the input path

In [17]:
# Source PDF for the pipeline. The path is relative, so it is resolved against
# the notebook's current working directory.
pdf_path = "Computer_Network_Chapter_3.pdf"

### Extract text from the PDF

In [32]:
import io

import PyPDF2

# Read the PDF bytes inside a context manager so the OS file handle is released
# as soon as the data is in memory (previously the handle was never closed).
with open(pdf_path, "rb") as temp:
    pdf_bytes = io.BytesIO(temp.read())

# Parse from the in-memory copy: `pdf_reader` and `documents` therefore remain
# usable in later cells even though the file itself is already closed.
pdf_reader = PyPDF2.PdfReader(pdf_bytes)
documents = pdf_reader.pages  # Get all pages from the PDF

# Concatenate text from all pages, separated by a single space.
# `or ""` keeps the join safe if a page yields no extractable text.
text = " ".join(doc.extract_text() or "" for doc in documents)

In [33]:
# Bare expression: the notebook renders the raw extracted text as cell output.
text



In [42]:
from my_helper_function import clean_presentation_text

In [None]:
# clean_presentation_text is imported from my_helper_function, whose source is
# not shown in this notebook.
# NOTE(review): presumably it strips slide/layout artifacts from the extracted
# text — confirm against the helper's implementation.
cleaned_text = clean_presentation_text(text)
# Print the cleaned result so it can be inspected in the cell output.
print(cleaned_text)

### Markdown source

In [None]:
# Path to a Markdown file named after the same chapter as the PDF.
# NOTE(review): no cell visible in this part of the notebook reads md_path.
md_path = "Computer_Network_Chapter_3.md"

import markdown
    

### Chunking

In [44]:
# --- 4. Chunking with LangChain ---
# Splitter settings are collected in one place; tune chunk_size and
# chunk_overlap to your RAG needs and the context window of the LLM.
splitter_options = dict(
    # Separators are tried in this order, from coarsest to finest.
    separators=["\n\n", "\n", ". ", ", ", " ", ""],
    is_separator_regex=False,
    # Lengths are measured in characters.
    length_function=len,
    chunk_size=1000,    # upper bound on characters per chunk
    chunk_overlap=150,  # characters shared by neighbouring chunks
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# LangChain usually wraps text in Document objects; here the cleaned text is
# split directly into plain strings instead.
chunks = text_splitter.split_text(cleaned_text)

In [None]:
# --- 5. Condensing Chunks with Structured Output ---

# Define the desired structured output format using Pydantic.
# Each field pairs a type with a natural-language description of its content.
# NOTE(review): the output parser built from this model derives its format
# instructions from this schema, so the Field descriptions (and possibly the
# class docstring) presumably end up in the prompt — edit them with care.
class ChunkCondensation(BaseModel):
    """Defines the desired structure for condensed information from a text chunk."""
    # Short prose overview of the chunk.
    summary: str = Field(description="A concise summary (2-3 sentences) of the main topic discussed in the chunk.")
    # Main terms mentioned in the chunk.
    keywords: List[str] = Field(description="A list of 3-5 main keywords or terms mentioned.")
    # Concepts, protocols, or definitions the chunk explains.
    key_concepts: List[str] = Field(description="A list of key concepts, protocols, or definitions explained in the chunk.")
    # You could add more fields, e.g., potential_questions: List[str]

# PydanticOutputParser is not among the notebook's top-level imports; import it
# here, otherwise the statement below fails with NameError.
from langchain_core.output_parsers import PydanticOutputParser

# Set up the PydanticOutputParser
# This parser will automatically generate formatting instructions for the LLM
# and parse the LLM's JSON output back into the Pydantic object.
output_parser = PydanticOutputParser(pydantic_object=ChunkCondensation)

# Define the Prompt Template
# "chunk" is supplied per call; "format_instructions" is filled in once, up
# front, with the instructions generated by the output_parser.
prompt_template = PromptTemplate(
    template="Analyze the following text chunk about the Data Link Layer. Extract the requested information.\n{format_instructions}\nChunk Text:\n{chunk}\n",
    input_variables=["chunk"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()}
)

# LLM setup (e.g. Gemini via LangChain). Requires an API key; substitute your
# real key for "YOUR_API_KEY" before uncommenting.
# llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key="YOUR_API_KEY", convert_system_message_to_human=True)


def _condense_chunk(chunk, chunk_number):
    """Return the condensed record (a dict) for one chunk.

    This is currently a stand-in so the notebook runs without an API key:
    it ignores the text in ``chunk`` and returns placeholder values that only
    depend on ``chunk_number`` (1-based position of the chunk).
    """
    # --- Real implementation (needs the `llm` above; uncomment to enable) ---
    # formatted_prompt = prompt_template.format_prompt(chunk=chunk)
    # response = llm.invoke(formatted_prompt.to_string())
    # try:
    #     # NOTE(review): a chat model's invoke() returns a message object, so
    #     # the parser should be given response.content — confirm.
    #     return output_parser.parse(response.content).dict()
    # except Exception as e:
    #     print(f"Error parsing LLM output for chunk {chunk_number}: {e}")
    #     # Fall back to a default structure, optionally keeping the raw chunk.
    #     return {
    #         "summary": f"Error processing chunk {chunk_number}",
    #         "keywords": [],
    #         "key_concepts": [],
    #         "raw_chunk": chunk,
    #     }

    # --- Placeholder (simulated LLM output); remove once the LLM is wired in ---
    keyword_stubs = [f"keyword{j}" for j in range(1, 4)]
    concept_stubs = [f"concept{j}" for j in range(1, 3)]
    return {
        "summary": f"This is a placeholder summary for chunk {chunk_number}. It would normally contain a concise overview of the chunk's content.",
        "keywords": keyword_stubs,
        "key_concepts": concept_stubs,
    }


# --- Process Chunks ---
# One structured record per chunk, in chunk order.
condensed_data_list = [
    _condense_chunk(chunk, chunk_number)
    for chunk_number, chunk in enumerate(chunks, start=1)
]


# json is not imported at the top of the notebook; without this import the
# json.dumps call below fails with NameError.
import json

# --- 6. Output Example ---
# Print the first few structured condensation results (fewer if there are
# fewer than three chunks; nothing beyond the header if there are none).
print("--- Example Cleaned and Condensed Structured Output (First 3 Chunks) ---")
for chunk_number, condensed in enumerate(condensed_data_list[:3], start=1):
    print(f"--- Condensed Data for Chunk {chunk_number} ---")
    # Print the structured data (dictionary format).
    # ensure_ascii=False keeps non-ASCII characters readable, matching the
    # JSON-file example further down in this cell.
    print(json.dumps(condensed, indent=2, ensure_ascii=False))
    print("-" * 20)

# You can now use the 'condensed_data_list' which contains structured dictionaries
# for each chunk, suitable for indexing in a vector database or other RAG components.
# For example, you might index the 'summary' or 'key_concepts' along with the original chunk.

# Example: Saving structured data to a JSON file
# with open("condensed_datalink_data.json", "w", encoding="utf-8") as f:
#     json.dump(condensed_data_list, f, indent=2, ensure_ascii=False)



In [31]:
# Gemini chat model used for the information-condensing step.
# temperature=0 requests the least random sampling; max_tokens and timeout are
# passed as None, i.e. no explicit limit is set from this notebook.
# NOTE(review): the model id is an experimental, date-stamped release —
# confirm it is still being served before relying on it.
information_condenser = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro-exp-03-25",
    temperature=0,
    max_tokens=None,
    timeout=None,
)

