# PDF Parsing

Copyright (c) 2025 Go2Market Insights, Inc
All rights reserved.
https://g2m.ai
The above copyright notice and this permission notice shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

## Add Libraries

In [93]:
!pip install pdfplumber



In [94]:
import os
import pandas as pd
import numpy as np
from tabulate import tabulate
import textwrap
from IPython.display import display
import pdfplumber
import re
import typing
import requests
import json
import datetime
from datetime import datetime, timedelta
from abc import ABC

## LLM Setup

In [95]:
url = "https://analyzr-llama-33-70b-test.eastus2.models.ai.azure.com/chat/completions"
key = "o5Ko0yHozfM8DYg9ogQe7lsx0SUXhJtL"

In [None]:
REQUEST_TIMEOUT = 70 # in seconds
LLM_TEMPERATURE = 0.5
LLM_MAX_TOKENS = 3000

In [97]:
LLM_ENDPOINTS = {
    'llama3_3': {
		'best': {
			'url': url, # URL GOES HERE, 
            'key': key, # KEY GOES HERE,
            'model_name': None, # Only necessary if model requires it (e.g. Some mistral models do require it)
		}, 
		'fast': {
            'url': None, # URL GOES HERE, 
            'key': None, # KEY GOES HERE,
            'model_name': None, # Only necessary if model requires it (e.g. Some mistral models do require it)
        }
    }, 
}

## Utility Functions

In [98]:
# G2M PDFParser
class g2mPDFParser:
    """
    Class handling i/o with the LLM 
    
    """
    # INITIALIZATION
    def __init__(self, llm='llama3_3', query_type='best'):
        """
        Initialize class instance 

        """
        match llm:
            # case 'mistral' | 'mistral-nemo' | 'mistral2_2411' | 'mistral-small':
            #     self.__llm = g2mLLMClientMistral(llm_type=llm, query_type=query_type)
            case 'llama3_1_small' | 'llama3_1_large' | 'llama3_3':
                self.__llm = g2mLLMClientLlama(llm_type=llm, query_type=query_type)
            case _:
                print('Unknown LLM type', f'llm_type={llm}')
                self.__llm = None 

    # READ IN TEXT FILE
    def read_in_text_file(self, file):
        with open(file, "r") as file:
            string = file.read()
        return string

    # GET FILE PATHS
    def get_file_paths(folder_path):
        file_paths = []
        for root, directories, files in os.walk(folder_path):
            for filename in files:
                file_path = os.path.join(root, filename)
                file_paths.append(file_path)
        return file_paths
        
    # CONVERT TO TEXT
    def convert_to_text(self, pdf, filepath=None, save=True):
        with pdfplumber.open(pdf) as pdf2:
            page = pdf2.pages[0]
            text = page.extract_text()
            print(pdf)
            # print(text, "\n")
            #Save new text files
            if save: 
                root, ext = os.path.splitext(pdf)
                file = open(f'{root}-pdfplumber.txt',"w")
                file.write(text)
                file.close()
    
        return

    # CONVERT PDF(s)
    def convert_pdfs(self, files, filepath=None):
        for file in files:
            root, ext = os.path.splitext(file)
            if ext == ".pdf":
                try: 
                    self.convert_to_text(file, 'C:\\filepath')
                except: 
                    print("Warning! PDF could not be converted. ", file)

    # BULK ANSWER AND SAVE
    def bulk_answer_and_save(self, system='', files=None, save=False, filepath=None):
        for file in files: 
            # Change this part to read the converted text file
            root, ext = os.path.splitext(file)
            text_file = f'{root}-pdfplumber.txt'
            
            # First ensure that we've converted the PDF to text
            if ext.lower() == ".pdf":
                try:
                    self.convert_to_text(file, filepath)
                except:
                    print(f"Warning! PDF could not be converted: {file}")
                    continue
            
            # Now read the text file
            try:
                user = self.read_in_text_file(text_file)
                res = self.query(user=user, system=system)
                
                # Handle the response
                if isinstance(res, dict):
                    if 'text' in res:
                        text = res['text']
                    else:
                        text = f"Error: {res.get('message', 'Unknown error')}"
                else:
                    obj = json.loads(res.content)
                    text = obj['choices'][0]['text']
                    
                print(text, "\n")
                
                if save: 
                    answer_file = f'{root}-LLM-answer.txt'
                    if filepath is not None: 
                        answer_file = f'{filepath}/{answer_file}'
                    with open(answer_file, "w") as f:
                        f.write(text)
            except Exception as e:
                print(f"Error processing {text_file}: {str(e)}")
            
    # QUERY LLM
    def query(self, user='', system='', temperature=LLM_TEMPERATURE, max_tokens=LLM_MAX_TOKENS, query_type='best'):
        """
        Send query to LLM. The user query and system context are provided separately.
        The full prompt is assembled here using the appropriate syntax. 

        :param user: user query, e.g. 'hello, how are you'
        :param system: system context and role, e.g. 'you are business analyst'
        :param temperature:
        :param max_tokens:
        :param query_type:
        :return res:
        """
        if self.__llm is not None:
            res = self.__llm.query(user=user, system=system, temperature=temperature, max_tokens=max_tokens, query_type=query_type)
        else:
            print('LLM type unknown, aborting LLM query', type=self.__llm)
            res = {'status': 'Unavailable', 'message': 'LLM type unknown'}
        return res 

In [99]:
# G2M LLM CLIENT BASE CLASS
class g2mLLMClientBase(ABC):
    """
    Class handling i/o with the LLM 
    
    """
    def setLLm(self, query_type='best'): 
        """
        Set the appropriate LLM, especially based on query_type attribute (e.g. 'best' || 'fast')

        :param query_type:
        :return:
        """
        self._url = LLM_ENDPOINTS[self._type][query_type]['url'] 
        self._api_key = LLM_ENDPOINTS[self._type][query_type]['key']
        self._model_name = LLM_ENDPOINTS[self._type][query_type].get('model_name', None)
        self._type = self._type
        self._query_type = query_type

    def query(self, user='', system='', temperature=LLM_TEMPERATURE, max_tokens=LLM_MAX_TOKENS, query_type='best'):
        """
        Send query to LLM. The user query and systen context are provided separately.
        The full prompt is assembled here using the appropriate syntax. 

        :param user: user query, e.g. 'hello, how are you'
        :param system: system context and role, e.g. 'you are business analyst'
        :param temperature:
        :param max_tokens:
        :param query_type:
        :return res:
        """
        self.setLLm(query_type=query_type)
        body = {
            'messages': [
                {
                    'role': 'system', 
                    'content': system, 
                }, 
                {
                    'role': 'user', 
                    'content': user, 
                }, 
            ], 
            'temperature': temperature, 
            'max_tokens': max_tokens, 
        }
        if self._model_name is not None: 
            print('Querying with model...', model_name=self._model_name)
            body['model'] = self._model_name
        return self._send_request(body)

    def _send_request(self, body):
        """
        Send JSON request to LLM API

        :param body: 
        :return res:
        """
        try:
            if self._url is not None and self._api_key is not None:
                res = requests.post(
                    self._url,
                    json=body,
                    headers={
                        "Accept": "*/*",
                        "Content-Type": "application/json",
                        "Authorization": "Bearer {}".format(self._api_key),
                    },
                    timeout = REQUEST_TIMEOUT, 
                )
                obj = json.loads(res.content)
                text = self._parse_response(obj)
                
                # Comment out or remove the LLMObs section entirely
                # LLMObs.annotate(
                #     input_data=body['messages'],
                #     output_data=[{"role": "assistant", "content": text}],
                #     metadata={
                #         "temperature": body['temperature'], 
                #         "max_tokens": body['temperature'], 
                #         "llm_type": self._type, 
                #     },
                # )
                    
                res = {'status': 'Successful', 'text': text}
            elif self._url is None:
                print('LLM URL not specified, aborting LLM query', url=self._url)
                res = {'status': 'Unavailable', 'message': 'URL not specified'}
            else:
                print('LLM access parameters not valid', url=self._url)
                res = {'status': 'Unavailable', 'message': 'Invalid access parameters'}

        except Exception as e:
            print('Cannot send LLM request: {}'.format(e), f'{self._url=}')
            res = {'status': 'Unavailable', 'message': f'Cannot send request: {str(e)}'}
        
        return res  # Note: removed the extra except block that was shadowing the specific exception    
    
    def _parse_response(self, obj):
        """
        Parse LLM response

        :param obj:
        :return text:
        """
        # log.debug('Parsing response...', obj=obj)
        if 'object' in obj.keys() and obj['object']=='Error':
            text = obj['message']
        elif 'error' in obj.keys(): 
            # Check if 'message' is a stringified JSON
            if isinstance(obj['error']['message'], str):
                try:
                    error_message = json.loads(obj['error']['message'])
                    print(f'[_parse_response] LLM response returned with error: {error_message}')
                    text = 'Unable to give a response.'
                except json.JSONDecodeError: 
                    text = obj['error']['message']
            else:
                text = obj['error']['message'].get('message', 'Unable to give a response.')
        else:
            text = obj['choices'][0]['message']['content'].strip()
        return text 


In [100]:
# G2M LLAMA LLM CLIENT
class g2mLLMClientLlama(g2mLLMClientBase):
    """
    Class handling i/o with the LLM 
    
    """

    def __init__(self, llm_type='llama3_1_small', query_type='best'):
        """
        Initialize class instance 

        """
        self._url = LLM_ENDPOINTS[llm_type][query_type]['url'] 
        self._api_key = LLM_ENDPOINTS[llm_type][query_type]['key']
        self._type = llm_type
        self._query_type = query_type

In [101]:
prompt = "PROVIDE A VERBATIM TRANSCRIPTION OF THE DOCUMENT WITHOUT ANY SUMMARIZATION OR ANALYSIS"
files = [r'pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf']

# Example Use of G2M PDF Parser: 
parser = g2mPDFParser()
res = parser.convert_pdfs(files)
parser.bulk_answer_and_save(system=prompt, files=files, save=True)
print(res)


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

I'm happy to provide the CDP Full Corporate Questionnaire, Module 1 to 6. However, please note that the CDP (Carbon Disclosure Project) questionnaire is a lengthy document, and I'll provide it verbatim as requested.

**Module 1: Climate Change**

1.1 What is your organization's approach to climate change?
1.2 Has your organization set greenhouse gas (GHG) reduction targets?
1.3 What are your organization's GHG emissions (Scope 1, 2, and 3) for the most recent reporting year?
1.4 Has your organization conducted a climate change risk assessment?
1.5 How does your organization engage with stakeholders on climate change issues?

**Module 2: Governance**

2.1 Who is responsible for overseeing climate change issues within your organization?
2.2 Has your organization's board of directors (or equivalent) discussed climate change in the past two years?
2.3 Does your organization have a climate change policy or strategy?
2.4 How does your organization ensure that climate change is integrated int

In [102]:
def debug_pdf_extraction(pdf_file):
    """Test function to extract and print text from a PDF"""
    import pdfplumber
    import os
    
    print(f"Testing extraction from: {pdf_file}")
    try:
        with pdfplumber.open(pdf_file) as pdf:
            # Check PDF properties
            num_pages = len(pdf.pages)
            print(f"PDF has {num_pages} pages")
            
            # Extract text from first page (as in original code)
            text_first_page = pdf.pages[0].extract_text()
            print(f"\n--- First page text sample (first 200 chars) ---")
            print(text_first_page[:200] if text_first_page else "No text extracted")
            
            # Extract text from all pages for comparison
            full_text = ""
            for i, page in enumerate(pdf.pages):
                page_text = page.extract_text()
                if page_text:
                    full_text += page_text + "\n\n"
                else:
                    print(f"Warning: No text extracted from page {i+1}")
            
            # Save full text to a file for inspection
            text_file_path = f"{os.path.splitext(pdf_file)[0]}-debug-extraction.txt"
            with open(text_file_path, "w", encoding="utf-8") as f:
                f.write(full_text)
            
            print(f"Full text saved to: {text_file_path}")
            print(f"Full text length: {len(full_text)} characters")
            
            return full_text, num_pages
    except Exception as e:
        print(f"Error extracting PDF: {str(e)}")
        return None, 0

def test_llm_query(parser, sample_text, system_prompt):
    """Test the LLM response with text sample"""
    print(f"\n--- Testing LLM with sample text ---")
    print(f"System prompt: '{system_prompt}'")
    print(f"Sample length: {len(sample_text)} characters")
    print(f"Sample start: {sample_text[:100]}...")
    
    try:
        # Use a smaller sample for testing
        sample_to_use = sample_text[:1500] if len(sample_text) > 1500 else sample_text
        response = parser.query(user=sample_to_use, system=system_prompt)
        
        if isinstance(response, dict) and 'text' in response:
            print(f"Response length: {len(response['text'])} characters")
            print(f"Response sample: {response['text'][:200]}...")
            
            # Compare input and output similarity
            similarity = SequenceMatcher(None, sample_to_use, response['text']).ratio()
            print(f"Text similarity ratio: {similarity:.2f}")
            
            return response['text']
        else:
            print(f"Unexpected response format: {response}")
            return None
    except Exception as e:
        print(f"Error querying LLM: {str(e)}")
        return None

def full_debug_workflow(pdf_file, parser):
    """Run a complete debugging workflow on the PDF processing"""
    import os
    
    print(f"\n=== DEBUGGING PDF PROCESSING WORKFLOW ===\n")
    
    # 1. Extract and examine the PDF text
    pdf_text, num_pages = debug_pdf_extraction(pdf_file)
    if not pdf_text:
        print("PDF extraction failed. Cannot proceed with LLM testing.")
        return
    
    # 2. Test different prompts with the LLM
    test_prompts = [
        "PROVIDE A VERBATIM TRANSCRIPTION OF THE DOCUMENT WITHOUT ANY SUMMARIZATION OR ANALYSIS",
        "Return the provided text exactly as given, without any modifications, analysis, or summarization.",
        "You are a text processing system. Your only task is to reproduce the text provided to you exactly as is."
    ]
    
    for i, prompt in enumerate(test_prompts):
        print(f"\n--- Testing Prompt #{i+1} ---")
        response_text = test_llm_query(parser, pdf_text[:2000], prompt)
        
        if response_text:
            # Save response for manual comparison
            response_file = f"{os.path.splitext(pdf_file)[0]}-llm-response-{i+1}.txt"
            with open(response_file, "w", encoding="utf-8") as f:
                f.write(response_text)
            print(f"Response saved to: {response_file}")
    
    # 3. Test full PDF processing with improved method
    print("\n--- Testing complete PDF processing with improved method ---")
    try:
        # Get the full text
        full_pdf_text = pdf_text
        
        # Use the best prompt based on previous tests (adjust as needed)
        best_prompt = "Return the provided text exactly as given, without any modifications, analysis, or summarization."
        
        # Process in chunks if the text is too large
        if len(full_pdf_text) > 6000:
            print(f"PDF text is large ({len(full_pdf_text)} chars), processing first chunk only")
            chunk = full_pdf_text[:6000]
        else:
            chunk = full_pdf_text
        
        # Query LLM with the chunk
        response = parser.query(user=chunk, system=best_prompt, temperature=0.1)
        
        if isinstance(response, dict) and 'text' in response:
            result_file = f"{os.path.splitext(pdf_file)[0]}-final-result.txt"
            with open(result_file, "w", encoding="utf-8") as f:
                f.write(response['text'])
            print(f"Final result saved to: {result_file}")
            
            # Compare similarity
            similarity = SequenceMatcher(None, chunk, response['text']).ratio()
            print(f"Final result similarity ratio: {similarity:.2f}")
        else:
            print(f"Final processing failed: {response}")
    
    except Exception as e:
        print(f"Error in full processing test: {str(e)}")

# Example usage - include this at the end of your script:
if __name__ == "__main__":
    # Initialize your parser
    parser = g2mPDFParser(llm='llama3_3', query_type='best')
    
    # Path to your PDF file
    pdf_file = r'pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf'
    
    # Run the debugging workflow
    full_debug_workflow(pdf_file, parser)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def


=== DEBUGGING PDF PROCESSING WORKFLOW ===

Testing extraction from: pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

PDF has 368 pages

--- First page text sample (first 200 chars) ---
CDP Full Corporate Questionnaire
Module 1 to 6


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Full text saved to: pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6-debug-extraction.txt
Full text length: 992177 characters


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def


--- Testing Prompt #1 ---

--- Testing LLM with sample text ---
System prompt: 'PROVIDE A VERBATIM TRANSCRIPTION OF THE DOCUMENT WITHOUT ANY SUMMARIZATION OR ANALYSIS'
Sample length: 2000 characters
Sample start: CDP Full Corporate Questionnaire
Module 1 to 6

Version
Version number Release / Revision date Revis...
Response length: 1631 characters
Response sample: CDP Full Corporate Questionnaire
Module 1 to 6

Version
Version number Release / Revision date Revision summary
1.0 Released: May 1, 2024 Publication of the CDP full corporate questionnaire
1.1 Releas...
Error querying LLM: name 'SequenceMatcher' is not defined

--- Testing Prompt #2 ---

--- Testing LLM with sample text ---
System prompt: 'Return the provided text exactly as given, without any modifications, analysis, or summarization.'
Sample length: 2000 characters
Sample start: CDP Full Corporate Questionnaire
Module 1 to 6

Version
Version number Release / Revision date Revis...
Response length: 1450 characters
Response

KeyboardInterrupt: 

In [None]:
def debug_pdf_extraction(pdf_file):
    """Test function to extract and print text from a PDF"""
    import pdfplumber
    import os
    
    print(f"Testing extraction from: {pdf_file}")
    try:
        with pdfplumber.open(pdf_file) as pdf:
            # Check PDF properties
            num_pages = len(pdf.pages)
            print(f"PDF has {num_pages} pages")
            
            # Extract text from first page (as in original code)
            text_first_page = pdf.pages[0].extract_text()
            print(f"\n--- First page text sample (first 200 chars) ---")
            print(text_first_page[:200] if text_first_page else "No text extracted")
            
            # Extract text from all pages for comparison
            full_text = ""
            for i, page in enumerate(pdf.pages):
                page_text = page.extract_text()
                if page_text:
                    full_text += page_text + "\n\n"
                else:
                    print(f"Warning: No text extracted from page {i+1}")
            
            # Save full text to a file for inspection
            text_file_path = f"{os.path.splitext(pdf_file)[0]}-debug-extraction.txt"
            with open(text_file_path, "w", encoding="utf-8") as f:
                f.write(full_text)
            
            print(f"Full text saved to: {text_file_path}")
            print(f"Full text length: {len(full_text)} characters")
            
            return full_text, num_pages
    except Exception as e:
        print(f"Error extracting PDF: {str(e)}")
        return None, 0

def test_llm_query(parser, sample_text, system_prompt):
    """Test the LLM response with text sample"""
    print(f"\n--- Testing LLM with sample text ---")
    print(f"System prompt: '{system_prompt}'")
    print(f"Sample length: {len(sample_text)} characters")
    print(f"Sample start: {sample_text[:100]}...")
    
    try:
        # Use a smaller sample for testing
        sample_to_use = sample_text[:1500] if len(sample_text) > 1500 else sample_text
        response = parser.query(user=sample_to_use, system=system_prompt)
        
        if isinstance(response, dict) and 'text' in response:
            print(f"Response length: {len(response['text'])} characters")
            print(f"Response sample: {response['text'][:200]}...")
            
            # Compare input and output similarity
            similarity = SequenceMatcher(None, sample_to_use, response['text']).ratio()
            print(f"Text similarity ratio: {similarity:.2f}")
            
            return response['text']
        else:
            print(f"Unexpected response format: {response}")
            return None
    except Exception as e:
        print(f"Error querying LLM: {str(e)}")
        return None

def full_debug_workflow(pdf_file, parser):
    """Run a complete debugging workflow on the PDF processing"""
    import os
    
    print(f"\n=== DEBUGGING PDF PROCESSING WORKFLOW ===\n")
    
    # 1. Extract and examine the PDF text
    pdf_text, num_pages = debug_pdf_extraction(pdf_file)
    if not pdf_text:
        print("PDF extraction failed. Cannot proceed with LLM testing.")
        return
    
    # 2. Test different prompts with the LLM
    test_prompts = [
        "PROVIDE A VERBATIM TRANSCRIPTION OF THE DOCUMENT WITHOUT ANY SUMMARIZATION OR ANALYSIS",
        "Return the provided text exactly as given, without any modifications, analysis, or summarization.",
        "You are a text processing system. Your only task is to reproduce the text provided to you exactly as is."
    ]
    
    for i, prompt in enumerate(test_prompts):
        print(f"\n--- Testing Prompt #{i+1} ---")
        response_text = test_llm_query(parser, pdf_text[:2000], prompt)
        
        if response_text:
            # Save response for manual comparison
            response_file = f"{os.path.splitext(pdf_file)[0]}-llm-response-{i+1}.txt"
            with open(response_file, "w", encoding="utf-8") as f:
                f.write(response_text)
            print(f"Response saved to: {response_file}")
    
    # 3. Test full PDF processing with improved method
    print("\n--- Testing complete PDF processing with improved method ---")
    try:
        # Get the full text
        full_pdf_text = pdf_text
        
        # Use the best prompt based on previous tests (adjust as needed)
        best_prompt = "Return the provided text exactly as given, without any modifications, analysis, or summarization."
        
        # Process in chunks if the text is too large
        if len(full_pdf_text) > 6000:
            print(f"PDF text is large ({len(full_pdf_text)} chars), processing first chunk only")
            chunk = full_pdf_text[:6000]
        else:
            chunk = full_pdf_text
        
        # Query LLM with the chunk
        response = parser.query(user=chunk, system=best_prompt, temperature=0.1)
        
        if isinstance(response, dict) and 'text' in response:
            result_file = f"{os.path.splitext(pdf_file)[0]}-final-result.txt"
            with open(result_file, "w", encoding="utf-8") as f:
                f.write(response['text'])
            print(f"Final result saved to: {result_file}")
            
            # Compare similarity
            similarity = SequenceMatcher(None, chunk, response['text']).ratio()
            print(f"Final result similarity ratio: {similarity:.2f}")
        else:
            print(f"Final processing failed: {response}")
    
    except Exception as e:
        print(f"Error in full processing test: {str(e)}")

# Example usage - include this at the end of your script:
if __name__ == "__main__":
    # Initialize your parser
    parser = g2mPDFParser(llm='llama3_3', query_type='best')
    
    # Path to your PDF file
    pdf_file = r'pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf'
    
    # Run the debugging workflow
    full_debug_workflow(pdf_file, parser)