In [4]:
import os
import getpass # For prompting if needed
# This import is characteristic of the newer google-genai SDK
from google import genai

print("Attempting to configure Gemini API Key with google.genai SDK...")

try:
    # Try to get the API key from the correct environment variable
    api_key = os.getenv("GEMINI_API_KEY") # Changed to GEMINI_API_KEY

    if not api_key:
        print("GEMINI_API_KEY environment variable not found.")
        print("Attempting to prompt for API key...")
        api_key = getpass.getpass('Enter your Gemini API Key: ')

    if api_key:
        # Initialize the client with the API key
        # This is the key change for the newer SDK
        client = genai.Client(api_key=api_key)
        print("Gemini API Client initialized successfully.")

        # --- Optional: List models to verify ---
        print("\nAvailable models (some might be specific to this client/region):")
        try:
            # Listing models might be slightly different or might not be needed for basic generation
            # Often, you just specify the model name directly in generate_content
            # This is a general way; specific model listing might vary
            # For this SDK, you often just try to use a model name directly.
            # Let's try a direct generation to test.
            pass # Skipping model listing for now, focusing on generation
        except Exception as e_list:
            print(f"Could not list models (this step is optional): {e_list}")


        # --- Example: Generate Content ---
        print("\nAttempting to generate content...")
        # Model names might be like "gemini-1.5-flash-001" or "gemini-1.5-pro-001"
        # The newer SDK might use slightly different model identifiers or access them via client.models
        # Let's assume a common model, adjust if needed based on documentation for google.genai
        model_name = "gemini-2.0-flash" # Or "gemini-1.5-flash-latest", "models/gemini-1.5-flash-latest"
                                      # Check Google AI Studio or docs for exact names compatible with google.genai

        # The way to call the model might also change.
        # For google.genai, it's often:
        # response = client.generate_content(model=f"models/{model_name}", contents="Hello Gemini!")
        # Or interacting with a specific model object from the client:
        
        # Let's try a common pattern seen with genai.Client
        # Note: The exact model invocation can vary. Refer to the latest `google-genai` docs if this fails.
        # For example, it might be client.get_generative_model(model_name='gemini-1.5-flash') and then model.generate_content()
        # Or directly:
        
        prompt = "What is the capital of California?"
        # The structure for `contents` can be just a string for simple text prompts.
        response = client.generate_content(
            model=f"models/{model_name}", # Or sometimes just model_name
            contents=prompt
        )
        
        # Accessing the text might also be slightly different.
        # Often it's response.text, but could be response.candidates[0].content.parts[0].text
        # Let's assume response.text for simplicity from common examples.
        
        if hasattr(response, 'text') and response.text:
            print("Response from Gemini:")
            print(response.text)
        elif response.candidates:
             # More robust way to get text if response.text isn't directly available
            try:
                text_output = response.candidates[0].content.parts[0].text
                print("Response from Gemini:")
                print(text_output)
            except (IndexError, AttributeError) as e_text:
                print(f"Could not extract text from response.candidates: {e_text}")
                print("Full response object:", response)
        else:
            print("Received a response, but could not extract text directly. Full response object:")
            print(response)

    else:
        print("No API key provided. Configuration failed.")

except AttributeError as e_attr:
    if "'google.genai' has no attribute 'configure'" in str(e_attr):
        print(f"AttributeError: {e_attr}")
        print("This confirms you're likely using the newer 'google.genai' SDK which doesn't use 'configure'.")
        print("The script attempts to use genai.Client(api_key=...) instead. If this error persists, check import statements and SDK documentation.")
    else:
        print(f"An AttributeError occurred: {e_attr}")
except ImportError as e_imp:
    print(f"ImportError: {e_imp}. Please ensure you have the correct Google AI SDK installed.")
    print("You might need to run: !pip install -q -U google-genai")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    if "API_KEY_INVALID" in str(e).upper() or "PERMISSION_DENIED" in str(e).upper():
        print("There might be an issue with your API key's validity or permissions. Please check it in Google AI Studio.")
    elif "found no model" in str(e).lower() or "could not find model" in str(e).lower():
        print(f"The model name '{model_name}' might be incorrect or not accessible with your key/region. Please check available models.")

Attempting to configure Gemini API Key with google.genai SDK...
Gemini API Client initialized successfully.

Available models (some might be specific to this client/region):

Attempting to generate content...
An AttributeError occurred: 'Client' object has no attribute 'generate_content'


In [2]:
import os

api_key_from_env = os.getenv("GEMINI_API_KEY")

if api_key_from_env:
    print("SUCCESS: Environment variable GOOGLE_API_KEY is found by the kernel.")
    # print(f"Key: {api_key_from_env[:5]}...{api_key_from_env[-5:]}") # Uncomment to see a snippet (be careful)
else:
    print("FAILURE: Environment variable GOOGLE_API_KEY is NOT found by the kernel.")
    print("\nConsider the troubleshooting steps below.")


SUCCESS: Environment variable GOOGLE_API_KEY is found by the kernel.


In [7]:
import os
import getpass
from google import genai # Using the newer google-genai SDK
from IPython.display import display, Markdown

# --- 1. Initialize the Gemini API Client ---
try:
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        print("GEMINI_API_KEY environment variable not found. Please enter it below.")
        api_key = getpass.getpass('Enter your Gemini API Key: ')

    if not api_key:
        raise ValueError("API Key not provided. Cannot proceed.")

    client = genai.Client(api_key=api_key)
    print("Gemini API Client initialized successfully.")

except Exception as e:
    print(f"Error initializing Gemini Client: {e}")
    client = None # Ensure client is None if initialization fails

# --- 2. Query the Model (if client was initialized) ---
if client:
    try:
        # --- DEFINE YOUR QUERY HERE ---
        your_prompt = "Explain the concept of quantum entanglement in simple terms."
        # You can change the prompt above to ask any question you like!

        # --- CHOOSE YOUR MODEL ---
        # Common models: "models/gemini-1.5-flash-latest", "models/gemini-1.5-pro-latest"
        # Using "latest" usually points to the most recent stable version of that model.
        # As of May 2025, gemini-1.5-flash is great for speed and many tasks,
        # while gemini-1.5-pro is more powerful for complex reasoning.
        model_to_use = "models/gemini-2.0-flash"

        print(f"\nSending prompt to {model_to_use}: '{your_prompt}'")

        # --- SEND THE QUERY (GENERATE CONTENT) ---
        response = client.models.generate_content(
            model=model_to_use,
            contents=your_prompt
            # You can add more parameters here if needed, e.g., generation_config
        )

        # --- DISPLAY THE RESPONSE ---
        print("\nResponse from Gemini:")
        
        # Extracting text from the response.
        # The structure can sometimes vary, so this tries a common way.
        generated_text = ""
        if response.candidates and response.candidates[0].content.parts:
            generated_text = response.candidates[0].content.parts[0].text
        elif hasattr(response, 'text') and response.text: # Fallback for simpler text responses
            generated_text = response.text
        else:
            generated_text = "Could not extract text from the response object in the expected format."
            print("Full response object for debugging:")
            print(response)

        # Using IPython.display.Markdown for potentially rich text output
        display(Markdown(generated_text))

    except Exception as e:
        print(f"An error occurred while querying Gemini: {e}")
        if "API_KEY_INVALID" in str(e).upper() or "PERMISSION_DENIED" in str(e).upper():
            print("There might be an issue with your API key's validity or permissions. Please check it in Google AI Studio.")
        elif "could not find model" in str(e).lower() or "found no model" in str(e).lower() or "404" in str(e):
            print(f"The model name '{model_to_use}' might be incorrect, not accessible with your API key/region, or the endpoint is wrong. Please verify the model name.")
        # Add more specific error handling as needed

Gemini API Client initialized successfully.

Sending prompt to models/gemini-2.0-flash: 'Explain the concept of quantum entanglement in simple terms.'

Response from Gemini:


Imagine you have two coins, but they're special "quantum coins." You put each coin in a separate box and send one box to your friend Alice and keep the other yourself.

Here's the weird part:

*   **Before you open your box, the coin isn't actually heads or tails yet.** It's in a blurry state of both possibilities. Think of it like a spinning coin in the air.

*   **These coins are "entangled."** This means that their fates are linked. The moment you open your box and see, say, "heads," you *instantly* know that Alice's coin will be "tails," even though she's far away and hasn't opened her box yet.

*   **Instant Connection, No Signal:** This happens *instantly*, faster than any signal could travel between you and Alice. That's the spooky part. It's as if the coins were communicating without any actual communication.

**So, in essence, quantum entanglement is:**

*   Two or more particles (like our coins) are linked together in a special way.
*   Their properties are correlated, meaning that measuring one instantly tells you something about the other, no matter how far apart they are.
*   It's not that the properties were predetermined from the start. It's the act of measurement on one that forces the other to take on a specific, correlated value *instantly*.

**Important things to remember:**

*   **It's not a way to send information faster than light.** You can't control what result you get when you open your box, so you can't send a message to Alice.
*   **It's a fundamental property of quantum mechanics.**  It's not just a theoretical idea; it's been proven in many experiments.

**Think of it like this:  It's as if you bought two gloves, one left and one right, and put each in a separate box. You send one box to Alice. Before she opens her box, you don't know which glove she has.  But the moment you open your box and see it's the left glove, you know instantly, without her telling you, that she has the right glove. That's entanglement.**

While the coin analogy is helpful, it's crucial to understand that quantum entanglement is a purely quantum phenomenon and goes beyond our everyday experiences.


In [9]:
import google.generativeai as genai # Assuming this is your import
from IPython.display import display, Markdown

# Your API key should already be configured from previous steps
# (e.g., via environment variable GEMINI_API_KEY)

# --- Define your system prompt and model ---
your_system_prompt = "You are a pirate captain with a short temper."
model_name_for_system_prompt = "gemini-1.5-flash-latest" # Or your preferred model

try:
    # --- 1. Create a GenerativeModel instance with the system instruction ---
    model_with_system_prompt = genai.GenerativeModel(
        model_name=model_name_for_system_prompt,
        system_instruction=your_system_prompt
    )
    print(f"GenerativeModel '{model_name_for_system_prompt}' initialized with system prompt.")

    # --- 2. Your user query ---
    user_query = "Can you explain what a neural network is?"

    print(f"\nSending user query: '{user_query}'")

    # --- 3. Generate content using this model instance ---
    response = model_with_system_prompt.generate_content(user_query)

    # --- 4. Display the response ---
    generated_text = response.candidates[0].content.parts[0].text if response.candidates else "No response text found."
    display(Markdown(generated_text))

except Exception as e:
    print(f"An error occurred: {e}")
    # Common errors: API key not configured, model name incorrect, quota issues.

GenerativeModel 'gemini-1.5-flash-latest' initialized with system prompt.

Sending user query: 'Can you explain what a neural network is?'


Avast ye, landlubber!  Neural network?  Yer talkin' about somethin' complicated, somethin' that'd make a kraken blush.  It's like... a bunch o' interconnected brain cells,  but made o' math instead o' goo.  Each little cell, or "neuron," takes in some information, does a bit o' number-crunching, then spits out an answer.  These answers get passed on to other neurons, and so on, until ye get a final result.

Think o' it like this:  ye got a treasure map.  Each neuron is a pirate lookin' at a piece o' the map. One pirate sees a skull, another sees a crossbones, another sees a buried X. They all shout their findings to the next pirate. Eventually, one savvy buccaneer figures out where the treasure is.  That's the final answer from the neural network!

Now, don't be lookin' at me with yer squinty eyes.  It's more complicated than that, but if ye need more detail, I'll just have to make ye walk the plank!  Argh!


In [10]:
import os
from PyPDF2 import PdfReader # Using PyPDF2
# If you installed and prefer pymupdf:
# import fitz # pymupdf is imported as fitz

def extract_text_from_txt(filepath):
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        print(f"Error reading TXT file {filepath}: {e}")
        return ""

def extract_text_from_md(filepath):
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            # For RAG, treating Markdown as plain text is often a good start.
            # More advanced parsing could extract structure, but adds complexity.
            return f.read()
    except Exception as e:
        print(f"Error reading MD file {filepath}: {e}")
        return ""

def extract_text_from_pdf_pypdf2(filepath):
    text = ""
    try:
        with open(filepath, "rb") as f:
            reader = PdfReader(f)
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n" # Add a newline between pages
    except Exception as e:
        print(f"Error reading PDF (PyPDF2) {filepath}: {e}")
    return text

# def extract_text_from_pdf_pymupdf(filepath):
#     text = ""
#     try:
#         with fitz.open(filepath) as doc:
#             for page in doc:
#                 text += page.get_text() + "\n"
#     except Exception as e:
#         print(f"Error reading PDF (pymupdf) {filepath}: {e}")
#     return text

# --- USER: Specify the directory containing your building code documents ---
data_directory = "Building/DATA/ALLMARKDOWN" # <<< CHANGE THIS to your folder name
# ---

# Create the directory if it doesn't exist (for users to add files)
os.makedirs(data_directory, exist_ok=True)
print(f"Please ensure your .md, .txt, and .pdf building code documents are in: {os.path.abspath(data_directory)}")

# For demonstration, if the directory is empty, create a few dummy files.
# In your actual use, REMOVE or COMMENT OUT this dummy file creation block.
if not any(f for f in os.listdir(data_directory) if f.lower().endswith(('.txt', '.md', '.pdf'))):
    print(f"\nDirectory '{data_directory}' appears empty or has no supported files. Creating dummy files for demonstration.")
    with open(os.path.join(data_directory, "chapter1_foundations.txt"), "w") as f:
        f.write("Section 1.1: All buildings must have a solid foundation. Section 1.2: Foundations must be appropriate for soil conditions.")
    with open(os.path.join(data_directory, "electrical_codes.md"), "w") as f:
        f.write("# Chapter 5: Electrical Systems\n\nAll wiring must comply with NEC standards. Minimum wire gauge for residential circuits is 14 AWG for 15-amp circuits.")
    # Note: We can't easily create a meaningful dummy PDF here. Please add your own PDFs.
    print("Dummy .txt and .md files created. Please add your own PDFs to the directory for full testing.")
else:
    print(f"\nFound files in '{data_directory}'. Proceeding with loading.")


raw_document_texts = [] # Will store the full text content of each document
document_metadata_list = []   # Will store metadata like filename for each document

print(f"\nScanning and loading documents from: {data_directory}")
for filename in os.listdir(data_directory):
    filepath = os.path.join(data_directory, filename)
    file_text = ""
    doc_type = None

    if filename.lower().endswith(".txt"):
        file_text = extract_text_from_txt(filepath)
        doc_type = "txt"
    elif filename.lower().endswith(".md"):
        file_text = extract_text_from_md(filepath)
        doc_type = "md"
    elif filename.lower().endswith(".pdf"):
        file_text = extract_text_from_pdf_pypdf2(filepath) # Using PyPDF2 by default
        # file_text = extract_text_from_pdf_pymupdf(filepath) # Uncomment if using pymupdf
        doc_type = "pdf"

    if file_text.strip(): # Only add if text was actually extracted
        raw_document_texts.append(file_text)
        document_metadata_list.append({"filename": filename, "type": doc_type})
        print(f"  Loaded: {filename} ({doc_type}) - {len(file_text)} characters")
    elif doc_type: # If it was a supported filetype but no text extracted
        print(f"  Warning: No text extracted from {filename} ({doc_type})")


print(f"\nSuccessfully loaded content from {len(raw_document_texts)} documents.")
if raw_document_texts:
    print("\nSnippet of the first loaded document's content:")
    print(f"Source: {document_metadata_list[0]['filename']}")
    print(raw_document_texts[0][:300] + "..." if len(raw_document_texts[0]) > 300 else raw_document_texts[0])

Please ensure your .md, .txt, and .pdf building code documents are in: /home/mack/notebooks/Building/DATA/ALLMARKDOWN

Found files in 'Building/DATA/ALLMARKDOWN'. Proceeding with loading.

Scanning and loading documents from: Building/DATA/ALLMARKDOWN
  Loaded: 2022 Title 24 Fire.pdf.md (md) - 3438672 characters
  Loaded: 2021 International Building Code® (International Code Council (ICC)) (Z-Library).md (md) - 3337039 characters
  Loaded: 2022 Title 24 Electrical.pdf.md (md) - 4447702 characters
  Loaded: 2022 Title 24 Green Building Standards.pdf.md (md) - 651605 characters
  Loaded: 2022 Title 24 Existing.pdf.md (md) - 1804850 characters
  Loaded: 2022 Title 24 Energy.pdf.md (md) - 1254540 characters
  Loaded: 2022 Title 24 Administrative.pdf.md (md) - 1335851 characters

Successfully loaded content from 7 documents.

Snippet of the first loaded document's content:
Source: 2022 Title 24 Fire.pdf.md
IMPORTANT NOTICE
Act now to keep your code up-to-date.
The purchase of this code incl

In [11]:
# You might want to install LangChain for more sophisticated splitters,
# but here's a simple character-based splitter.
# !pip install -q langchain-text-splitters (if you want to use LangChain's splitters)

def simple_char_text_splitter(text, chunk_size=1000, chunk_overlap=150):
    """Splits text into chunks of roughly 'chunk_size' with 'chunk_overlap'."""
    if not text:
        return []
    
    chunks = []
    start_index = 0
    while start_index < len(text):
        end_index = start_index + chunk_size
        chunks.append(text[start_index:end_index])
        start_index += (chunk_size - chunk_overlap)
        if start_index >= len(text) and chunks[-1] != text[start_index - (chunk_size - chunk_overlap):]: # Avoid infinite loop on tiny last part
             # Ensure last chunk is added if it's shorter than overlap
            if len(text) > (start_index - (chunk_size - chunk_overlap)):
                 last_chunk_start = start_index - (chunk_size - chunk_overlap)
                 if text[last_chunk_start:] not in chunks : # avoid duplicate last chunk
                    chunks.append(text[last_chunk_start:])
            break
            
    # A slightly better overlap handling for the very last chunk
    if chunks and len(text) > chunk_size :
        if len(chunks[-1]) < chunk_overlap and len(chunks) > 1 :
            # If the last chunk is too small (less than overlap) and there's a previous chunk,
            # it might be better to merge it or handle it differently.
            # For simplicity, this basic splitter might create very small trailing chunks.
            # print(f"Warning: Last chunk is small (len: {len(chunks[-1])}). Consider adjusting chunk_size/overlap or merging.")
            pass # For now, we keep it. More advanced splitters handle this better.
            
    return [chunk for chunk in chunks if chunk.strip()] # Remove empty chunks


all_text_chunks = []          # This list will hold all the text pieces for embedding
chunk_source_references = []  # Metadata for each chunk (e.g., original filename)

for i, doc_text in enumerate(raw_document_texts):
    doc_metadata = document_metadata_list[i]
    
    # Using the simple character splitter:
    chunks_from_doc = simple_char_text_splitter(doc_text, chunk_size=700, chunk_overlap=100)
    # You can experiment with chunk_size and chunk_overlap.
    # For building codes, smaller, more focused chunks might be better.
    
    for chunk_idx, chunk_content in enumerate(chunks_from_doc):
        all_text_chunks.append(chunk_content)
        chunk_source_references.append({
            "original_filename": doc_metadata["filename"],
            "original_doc_index": i, # Index in raw_document_texts
            "chunk_in_doc_id": chunk_idx # Index of this chunk within its original document
        })

print(f"\nTotal text chunks created: {len(all_text_chunks)}")

if all_text_chunks:
    print("\nExample of the first few chunks:")
    for i in range(min(3, len(all_text_chunks))):
        metadata = chunk_source_references[i]
        print(f"--- Chunk {i+1} (from: {metadata['original_filename']}, part {metadata['chunk_in_doc_id'] + 1}) ---")
        print(all_text_chunks[i][:200] + "..." if len(all_text_chunks[i]) > 200 else all_text_chunks[i])
        print("-" * 20)

# IMPORTANT: The variable 'documents' in the subsequent RAG steps (embedding, FAISS, retrieval)
# should now refer to 'all_text_chunks'.
# And 'doc_filenames' should be replaced by logic using 'chunk_source_references'
# to identify the source of retrieved chunks.


Total text chunks created: 27121

Example of the first few chunks:
--- Chunk 1 (from: 2022 Title 24 Fire.pdf.md, part 1) ---
IMPORTANT NOTICE
Act now to keep your code up-to-date.
The purchase of this code includes a
free subscription for all State-issued
supplements and errata. To receive
these important updates through
20...
--------------------
--- Chunk 2 (from: 2022 Title 24 Fire.pdf.md, part 2) ---
ations, Title 24, Part 9

First Printing: July 2022

ISBN: 978-1-957212-94-4 (loose-leaf edition)
ISBN: 978-1-957212-95-1 (PDF download)

COPYRIGHT © 2022
by

INTERNATIONAL CODE COUNCIL, INC.

ALL RIG...
--------------------
--- Chunk 3 (from: 2022 Title 24 Fire.pdf.md, part 3) ---
g, without limitation, electronic, optical or mechanical means (by way of example, and not
limitation, photocopying or recording by or in an information storage and/or retrieval system). For informati...
--------------------


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np # We'll need numpy later for FAISS

# Load a pre-trained sentence transformer model
embedding_model_name = 'all-MiniLM-L6-v2' # Good balance of speed and quality
try:
    local_embedding_model = SentenceTransformer(embedding_model_name)
    print(f"Embedding model '{embedding_model_name}' loaded successfully.")
except Exception as e:
    print(f"Error loading embedding model: {e}")
    local_embedding_model = None

if local_embedding_model and 'all_text_chunks' in globals() and all_text_chunks:
    print(f"Generating embeddings for {len(all_text_chunks)} text chunks. This might take a moment...")
    document_embeddings = local_embedding_model.encode(all_text_chunks, show_progress_bar=True)
    print(f"Generated embeddings for {len(document_embeddings)} chunks.")
    print(f"Shape of embeddings matrix: {document_embeddings.shape}") # (num_chunks, embedding_dimension)
elif not all_text_chunks:
    print("The list 'all_text_chunks' is empty. Please ensure your data loading and chunking step was successful.")
    document_embeddings = None
else:
    print("Cannot proceed without an embedding model or text chunks.")
    document_embeddings = None

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Embedding model 'all-MiniLM-L6-v2' loaded successfully.
Generating embeddings for 27121 text chunks. This might take a moment...


Batches:   1%|▍                                                       | 7/848 [00:41<1:20:25,  5.74s/it]