Extract Text from PDF

In [3]:
import fitz  # PyMuPDF
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import google.generativeai as genai
import chromadb
from IPython.display import Markdown

# Step 1: Extract text from the PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text()
    return text

pdf_text = extract_text_from_pdf('dataset/best 55 places.pdf')

# Step 2: Improved text splitting
def split_text(text, max_chunk_size=1000):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_size:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

text_chunks = split_text(pdf_text)

# Step 3: Data Cleaning and Validation
def clean_text(text):
    clean_text = text.replace("\n", " ").replace("\r", " ").strip()
    return clean_text

cleaned_chunks = [clean_text(chunk) for chunk in text_chunks]

# Step 4: Configure Google Gemini API
API_KEY = 'AIzaSyDleZ4xVF9dCT7aw95WBeDpfHwktn4LUQ0'  # Replace with your actual API key
genai.configure(api_key=API_KEY)

class GeminiEmbeddingFunction:
    def __call__(self, input):
        model = 'models/text-embedding-004'
        response = genai.embed_content(model=model, content=input, task_type="retrieval_document")
        if 'embedding' in response:
            return response['embedding']
        else:
            raise KeyError(f"'embedding' key not found in response: {response}")

# Step 5: Store embeddings in ChromaDB
def create_chroma_db(documents, name):
    chroma_client = chromadb.Client()

    # Check if the collection exists
    try:
        existing_collection = chroma_client.get_collection(name=name)
        chroma_client.delete_collection(name=name)
    except ValueError:
        pass  # Collection does not exist

    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for i, d in enumerate(documents):
        db.add(
            documents=[d],
            ids=[str(i)]
        )
    return db

db = create_chroma_db(cleaned_chunks, "egypt_places_chromadb")

# Step 6: Prompt Refinement
def test_prompt(prompt, function, *args):
    try:
        response = function(prompt.format(*args))
        return response
    except Exception as e:
        print(f"Error testing prompt: {e}")
        return None

# Example function to simulate embedding generation
def generate_embedding(text):
    model = 'models/text-embedding-004'
    response = genai.embed_content(model=model, content=text, task_type="retrieval_document")
    if 'embedding' in response:
        return response['embedding']
    else:
        raise KeyError(f"'embedding' key not found in response: {response}")

# Querying and response generation functions
def get_relevant_passage(query, db):
    result = db.query(query_texts=[query], n_results=1)
    passage = result['documents'][0][0]
    return passage

# Continuous Testing and Refinement
def continuous_testing_and_refinement():
    # Define initial prompts
    prompts = {
        "embedding": "Generate an embedding for the following text: {text}",
        "querying": "Find the most relevant information about {query}.",
        "response": "Provide a detailed response for the query: {query}."
    }

    # Test and refine prompts iteratively
    for prompt_type, prompt in prompts.items():
        print(f"Testing {prompt_type} prompt...")
        text = "Pyramids in Egypt" if prompt_type == "embedding" else None
        query = "Pyramids in Egypt" if prompt_type != "embedding" else None
        result = test_prompt(prompt, generate_embedding, text) if prompt_type == "embedding" else test_prompt(prompt, get_relevant_passage, query)
        
        if not result:
            # Refine prompt if necessary
            refined_prompt = f"Refined {prompt_type} prompt text here: {query if prompt_type != 'embedding' else text}"
            result = test_prompt(refined_prompt, generate_embedding, text) if prompt_type == "embedding" else test_prompt(refined_prompt, get_relevant_passage, query)
        
        print(f"Result for {prompt_type} prompt: {result}")

# Run continuous testing and refinement process
continuous_testing_and_refinement()

# Step 7: Query the Database
query = "Pyramids in Egypt"
passage = get_relevant_passage(query, db)
Markdown(passage)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Testing embedding prompt...
Error testing prompt: 'text'
Result for embedding prompt: [-0.0061577368, 0.0056650825, -0.05347021, -0.008960371, 0.07169982, -0.0071776924, 0.05142784, -0.016967047, -0.0455444, 0.06694503, -0.020205576, 0.021651212, 0.059595615, 0.008335945, -0.006867613, -0.0587051, 0.038819063, 0.024646914, -0.09048046, 0.031070478, 0.019181607, -0.029280245, 0.041384436, -0.02323045, -0.024497753, -0.0033280808, 0.019828748, 0.005876333, 0.00877752, 0.006178875, 0.052784104, 0.0854849, 0.044268824, -0.057539735, -0.004633809, 0.028474895, -0.039763518, 0.040807806, 0.04677592, -0.014065965, -0.08813321, 0.007647823, 0.026012171, 0.053310428, -0.016109254, -0.01486942, -0.033404805, 0.035584006, -0.03878334, 0.046662156, 0.03450962, 0.004873034, -0.041459512, 0.028901022, -0.03637928, -0.011474972, -0.055063322, 0.03504847, 0.0594944, 0.029326428, 0.017033763, -0.01815436, -0.021468846, 0.035543162, -0.0030778798, 0.0030481226, -0.017004529, -0.038860958, -0.06507257, 0

Title: Great Pyramid of Giza   Description: Counted among the most popular places to visit in Egypt, the Great Pyramid of Giza is the  last attraction among the Seven Wonders of the Ancient World that have survived the rigorous test of  time. The Great Pyramid of Giza happens to be the largest and the oldest of the 3 intriguing pyramids  present in the Giza pyramid complex.As per the beliefs of the Egyptologists, this pyramid is the tomb of  pharaoh Khufu, the Fourth Dynasty Egyptian.

In [5]:
import fitz  # PyMuPDF
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import google.generativeai as genai
import chromadb
from IPython.display import Markdown

# Step 1: Extract text from the PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text()
    return text

pdf_text = extract_text_from_pdf('dataset/best 55 places.pdf')

# Step 2: Improved text splitting
def split_text(text, max_chunk_size=1000):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_size:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

text_chunks = split_text(pdf_text)

# Step 3: Data Cleaning and Validation
def clean_text(text):
    clean_text = text.replace("\n", " ").replace("\r", " ").strip()
    return clean_text

cleaned_chunks = [clean_text(chunk) for chunk in text_chunks]

# Step 4: Configure Google Gemini API
API_KEY = 'AIzaSyDleZ4xVF9dCT7aw95WBeDpfHwktn4LUQ0'  # Replace with your actual API key
genai.configure(api_key=API_KEY)

class GeminiEmbeddingFunction:
    def __call__(self, input):
        model = 'models/text-embedding-004'
        response = genai.embed_content(model=model, content=input, task_type="retrieval_document")
        if 'embedding' in response:
            return response['embedding']
        else:
            raise KeyError(f"'embedding' key not found in response: {response}")

# Step 5: Store embeddings in ChromaDB
def create_chroma_db(documents, name):
    chroma_client = chromadb.Client()

    # Check if the collection exists
    try:
        existing_collection = chroma_client.get_collection(name=name)
        chroma_client.delete_collection(name=name)
    except ValueError:
        pass  # Collection does not exist

    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for i, d in enumerate(documents):
        db.add(
            documents=[d],
            ids=[str(i)]
        )
    return db

db = create_chroma_db(cleaned_chunks, "egypt_places_chromadb")

# Step 6: Prompt Refinement
def test_prompt(prompt, function, *args):
    try:
        response = function(*args)
        return response
    except Exception as e:
        print(f"Error testing prompt: {e}")
        return None

# Example function to simulate embedding generation
def generate_embedding(text):
    model = 'models/text-embedding-004'
    response = genai.embed_content(model=model, content=text, task_type="retrieval_document")
    if 'embedding' in response:
        return response['embedding']
    else:
        raise KeyError(f"'embedding' key not found in response: {response}")

# Querying and response generation functions
def get_relevant_passage(query, db):
    result = db.query(query_texts=[query], n_results=1)
    passage = result['documents'][0][0]
    return passage

# Continuous Testing and Refinement
def continuous_testing_and_refinement(db):
    # Define initial prompts
    prompts = {
        "embedding": "Generate an embedding for the following text: {text}",
        "querying": "Find the most relevant information about {query}.",
        "response": "Provide a detailed response for the query: {query}."
    }

    # Test and refine prompts iteratively
    for prompt_type, prompt in prompts.items():
        print(f"Testing {prompt_type} prompt...")
        if prompt_type == "embedding":
            text = "Pyramids in Egypt"
            result = test_prompt(prompt, generate_embedding, text)
        else:
            query = "Pyramids in Egypt"
            result = test_prompt(prompt, get_relevant_passage, query, db)
        
        if not result:
            # Refine prompt if necessary
            refined_prompt = f"Refined {prompt_type} prompt text here: {query if prompt_type != 'embedding' else text}"
            result = test_prompt(refined_prompt, generate_embedding, text) if prompt_type == "embedding" else test_prompt(refined_prompt, get_relevant_passage, query, db)
        
        print(f"Result for {prompt_type} prompt: {result}")

# Run continuous testing and refinement process
continuous_testing_and_refinement(db)

# Step 7: Query the Database
query = "Pyramids in Egypt"
passage = get_relevant_passage(query, db)
Markdown(passage)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Testing embedding prompt...
Result for embedding prompt: [-0.01182487, 0.009895242, -0.007693494, -0.011083179, 0.06520887, -0.025734141, 0.03585604, 0.0015554574, -0.04999143, 0.07794133, 0.01065801, -0.013922917, 0.035651714, 0.0033858751, -0.0015617418, -0.051200487, 0.028781235, 0.02742034, -0.07569698, 0.03393894, 0.028366916, -0.045507204, 0.038794268, -0.039416857, -0.004912816, 0.009857073, 0.02410457, 0.04433662, -0.018626029, 0.02816102, 0.030911269, 0.064613394, 0.04317599, -0.06730251, 0.011129414, 0.038489617, -0.051886182, 0.039568845, 0.027729996, -0.0038563872, -0.05514834, 0.028468207, 0.04654607, 0.047534473, -0.028338825, -0.008407215, -0.03331705, 0.042265795, -0.04474683, 0.0148545625, 0.007850894, 0.034362026, -0.05093596, 0.046272, -0.010954443, 0.0049474123, -0.049796462, 0.041822616, 0.025068033, 0.027072063, 0.019150028, -0.0055619045, -0.033218775, 0.06578378, -0.024998676, 0.014954582, 0.00029223927, -0.04318917, -0.01777533, 0.0876081, 0.009265191, 0.115173

Result for querying prompt: Title: Great Pyramid of Giza   Description: Counted among the most popular places to visit in Egypt, the Great Pyramid of Giza is the  last attraction among the Seven Wonders of the Ancient World that have survived the rigorous test of  time. The Great Pyramid of Giza happens to be the largest and the oldest of the 3 intriguing pyramids  present in the Giza pyramid complex.As per the beliefs of the Egyptologists, this pyramid is the tomb of  pharaoh Khufu, the Fourth Dynasty Egyptian.
Testing response prompt...
Result for response prompt: Title: Great Pyramid of Giza   Description: Counted among the most popular places to visit in Egypt, the Great Pyramid of Giza is the  last attraction among the Seven Wonders of the Ancient World that have survived the rigorous test of  time. The Great Pyramid of Giza happens to be the largest and the oldest of the 3 intriguing pyramids  present in the Giza pyramid complex.As per the beliefs of the Egyptologists, this pyram

Title: Great Pyramid of Giza   Description: Counted among the most popular places to visit in Egypt, the Great Pyramid of Giza is the  last attraction among the Seven Wonders of the Ancient World that have survived the rigorous test of  time. The Great Pyramid of Giza happens to be the largest and the oldest of the 3 intriguing pyramids  present in the Giza pyramid complex.As per the beliefs of the Egyptologists, this pyramid is the tomb of  pharaoh Khufu, the Fourth Dynasty Egyptian.

In [7]:
import fitz  # PyMuPDF
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import google.generativeai as genai
import chromadb
from IPython.display import Markdown

# Step 1: Extract text from the PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text()
    return text

pdf_text = extract_text_from_pdf('dataset/best 55 places.pdf')

# Step 2: Improved text splitting
def split_text(text, max_chunk_size=1000):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_size:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

text_chunks = split_text(pdf_text)

# Step 3: Data Cleaning and Validation
def clean_text(text):
    clean_text = text.replace("\n", " ").replace("\r", " ").strip()
    return clean_text

cleaned_chunks = [clean_text(chunk) for chunk in text_chunks]

# Step 4: Configure Google Gemini API
API_KEY = 'AIzaSyDleZ4xVF9dCT7aw95WBeDpfHwktn4LUQ0'  # Replace with your actual API key
genai.configure(api_key=API_KEY)

class GeminiEmbeddingFunction:
    def __call__(self, input):
        model = 'models/text-embedding-004'
        response = genai.embed_content(model=model, content=input, task_type="retrieval_document")
        if 'embedding' in response:
            return response['embedding']
        else:
            raise KeyError(f"'embedding' key not found in response: {response}")

# Step 5: Store embeddings in ChromaDB
def create_chroma_db(documents, name):
    chroma_client = chromadb.Client()

    # Check if the collection exists
    try:
        existing_collection = chroma_client.get_collection(name=name)
        chroma_client.delete_collection(name=name)
    except ValueError:
        pass  # Collection does not exist

    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for i, d in enumerate(documents):
        db.add(
            documents=[d],
            ids=[str(i)]
        )
    return db

db = create_chroma_db(cleaned_chunks, "egypt_places_chromadb")

# Step 6: Prompt Refinement
def test_prompt(function, *args):
    try:
        response = function(*args)
        return response
    except Exception as e:
        print(f"Error testing prompt with args {args}: {e}")
        return None

# Example function to simulate embedding generation
def generate_embedding(text):
    model = 'models/text-embedding-004'
    response = genai.embed_content(model=model, content=text, task_type="retrieval_document")
    if 'embedding' in response:
        return response['embedding']
    else:
        raise KeyError(f"'embedding' key not found in response: {response}")

# Querying and response generation functions
def get_relevant_passage(query, db):
    result = db.query(query_texts=[query], n_results=1)
    passage = result['documents'][0][0]
    return passage

# Continuous Testing and Refinement
def continuous_testing_and_refinement(db):
    # Define initial prompts
    prompts = {
        "embedding": "Generate an embedding for the following text: {text}",
        "querying": "Find the most relevant information about {query}.",
        "response": "Provide a detailed response for the query: {query}."
    }

    # Test and refine prompts iteratively
    for prompt_type, prompt in prompts.items():
        print(f"Testing {prompt_type} prompt...")
        if prompt_type == "embedding":
            text = "Pyramids in Egypt"
            result = test_prompt(generate_embedding, text)
        else:
            query = "Pyramids in Egypt"
            result = test_prompt(get_relevant_passage, query, db)
        
        if not result:
            # Refine prompt if necessary
            refined_prompt = f"Refined {prompt_type} prompt text here: {query if prompt_type != 'embedding' else text}"
            result = test_prompt(generate_embedding, text) if prompt_type == "embedding" else test_prompt(get_relevant_passage, query, db)
        
        print(f"Result for {prompt_type} prompt: {result}")

# Run continuous testing and refinement process
continuous_testing_and_refinement(db)

# Step 7: Query the Database
query = "Pyramids in Egypt"
passage = get_relevant_passage(query, db)
Markdown(passage)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Testing embedding prompt...
Result for embedding prompt: [-0.01182487, 0.009895242, -0.007693494, -0.011083179, 0.06520887, -0.025734141, 0.03585604, 0.0015554574, -0.04999143, 0.07794133, 0.01065801, -0.013922917, 0.035651714, 0.0033858751, -0.0015617418, -0.051200487, 0.028781235, 0.02742034, -0.07569698, 0.03393894, 0.028366916, -0.045507204, 0.038794268, -0.039416857, -0.004912816, 0.009857073, 0.02410457, 0.04433662, -0.018626029, 0.02816102, 0.030911269, 0.064613394, 0.04317599, -0.06730251, 0.011129414, 0.038489617, -0.051886182, 0.039568845, 0.027729996, -0.0038563872, -0.05514834, 0.028468207, 0.04654607, 0.047534473, -0.028338825, -0.008407215, -0.03331705, 0.042265795, -0.04474683, 0.0148545625, 0.007850894, 0.034362026, -0.05093596, 0.046272, -0.010954443, 0.0049474123, -0.049796462, 0.041822616, 0.025068033, 0.027072063, 0.019150028, -0.0055619045, -0.033218775, 0.06578378, -0.024998676, 0.014954582, 0.00029223927, -0.04318917, -0.01777533, 0.0876081, 0.009265191, 0.115173

Result for querying prompt: Title: Great Pyramid of Giza   Description: Counted among the most popular places to visit in Egypt, the Great Pyramid of Giza is the  last attraction among the Seven Wonders of the Ancient World that have survived the rigorous test of  time. The Great Pyramid of Giza happens to be the largest and the oldest of the 3 intriguing pyramids  present in the Giza pyramid complex.As per the beliefs of the Egyptologists, this pyramid is the tomb of  pharaoh Khufu, the Fourth Dynasty Egyptian.
Testing response prompt...
Result for response prompt: Title: Great Pyramid of Giza   Description: Counted among the most popular places to visit in Egypt, the Great Pyramid of Giza is the  last attraction among the Seven Wonders of the Ancient World that have survived the rigorous test of  time. The Great Pyramid of Giza happens to be the largest and the oldest of the 3 intriguing pyramids  present in the Giza pyramid complex.As per the beliefs of the Egyptologists, this pyram

Title: Great Pyramid of Giza   Description: Counted among the most popular places to visit in Egypt, the Great Pyramid of Giza is the  last attraction among the Seven Wonders of the Ancient World that have survived the rigorous test of  time. The Great Pyramid of Giza happens to be the largest and the oldest of the 3 intriguing pyramids  present in the Giza pyramid complex.As per the beliefs of the Egyptologists, this pyramid is the tomb of  pharaoh Khufu, the Fourth Dynasty Egyptian.

In [14]:
import fitz  # PyMuPDF
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import google.generativeai as genai
import chromadb
from IPython.display import Markdown

# Step 1: Extract text from the PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text()
    return text

pdf_text = extract_text_from_pdf('dataset/best 55 places.pdf')

# Step 2: Improved text splitting
def split_text(text, max_chunk_size=1000):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_size:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

text_chunks = split_text(pdf_text)

# Step 3: Data Cleaning and Validation
def clean_text(text):
    clean_text = text.replace("\n", " ").replace("\r", " ").strip()
    return clean_text

cleaned_chunks = [clean_text(chunk) for chunk in text_chunks]

# Step 4: Configure Google Gemini API
API_KEY = 'Your_Google_API_Key'  # Replace with your actual API key
genai.configure(api_key=API_KEY)

class GeminiEmbeddingFunction:
    def __call__(self, input):
        model = 'models/text-embedding-004'
        response = genai.embed_content(model=model, content=input, task_type="retrieval_document")
        if 'embedding' in response:
            return response['embedding']
        else:
            raise KeyError(f"'embedding' key not found in response: {response}")

# Step 5: Store embeddings in ChromaDB
def create_chroma_db(documents, name):
    chroma_client = chromadb.Client()

    # Check if the collection exists
    try:
        existing_collection = chroma_client.get_collection(name=name)
        chroma_client.delete_collection(name=name)
    except ValueError:
        pass  # Collection does not exist

    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for i, d in enumerate(documents):
        db.add(
            documents=[d],
            ids=[str(i)]
        )
    return db

db = create_chroma_db(cleaned_chunks, "egypt_places_chromadb")

# Step 6: Retrieve Relevant Passages
def get_relevant_passage(query, db):
    result = db.query(query_texts=[query], n_results=5)  # Retrieve top 5 relevant passages
    passages = [res[0] for res in result['documents'][0]]
    return passages

# Step 7: Generate Response using RAG
def generate_rag_response(query, db):
    relevant_passages = get_relevant_passage(query, db)
    combined_context = " ".join(relevant_passages)
    prompt = f"Based on the following information, answer the query: {query}\n\nContext: {combined_context}"
    
    response = genai.generate_text(prompt=prompt)
    
    # Debugging: Print the response to understand its structure
    print("Response from generate_text:", response)

    # Attempt to extract the generated text
    try:
        generated_text = response.generations[0].text
    except AttributeError:
        generated_text = "Unable to generate a response."
    
    return generated_text

# Example query
query = "Pyramids in Egypt"
response = generate_rag_response(query, db)
Markdown(response)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


FileNotFoundError: no such file: '/mnt/data/best 55 places.pdf'

In [1]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text()
    return text

pdf_text = extract_text_from_pdf('dataset/best 55 places.pdf')
