# PDF Parsing

Copyright (c) 2025 Go2Market Insights, Inc
All rights reserved.
https://g2m.ai
The above copyright notice and this permission notice shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

## Add Libraries

In [49]:
!pip install pdfplumber



In [50]:
import os
import pandas as pd
import numpy as np
from tabulate import tabulate
import textwrap
from IPython.display import display
import pdfplumber
import re
import typing
import requests
import json
import datetime
from datetime import datetime, timedelta
from abc import ABC

## LLM Setup

In [51]:
# Endpoint and credentials are read from the environment so that no secret is
# stored in the notebook. Set G2M_LLM_API_KEY (and optionally G2M_LLM_URL)
# before running this cell. When the key is unset, `key` is None.
# NOTE(review): the API key that used to be hard-coded here has been exposed in
# source and should be treated as compromised and rotated.
url = os.environ.get(
    "G2M_LLM_URL",
    "https://analyzr-llama-33-70b-test.eastus2.models.ai.azure.com/chat/completions",
)
key = os.environ.get("G2M_LLM_API_KEY")

In [52]:
# Defaults shared by the LLM client classes defined further down.
REQUEST_TIMEOUT: int = 70  # seconds; passed as `timeout` to requests.post
LLM_TEMPERATURE: float = 0.5  # default `temperature` of the query() methods
LLM_MAX_TOKENS: int = 1_500  # default `max_tokens` of the query() methods

In [53]:
# Endpoint registry, indexed first by LLM type and then by query type.
# Each leaf holds the 'url', the 'key' and an optional 'model_name'.
LLM_ENDPOINTS = {
    'llama3_1_small': {
        # Configured from the module-level `url` and `key`
        'best': {
            'url': url,
            'key': key,
            # Only necessary if model requires it (e.g. Some mistral models do require it)
            'model_name': None,
        },
        # Placeholder: fill in 'url' and 'key' to enable this query type
        'fast': {
            'url': None,
            'key': None,
            # Only necessary if model requires it (e.g. Some mistral models do require it)
            'model_name': None,
        },
    },
}

## Utility Functions

In [63]:
# G2M PDFParser
class g2mPDFParser:
    """
    Class handling i/o with the LLM 
    
    """
    # INITIALIZATION
    def __init__(self, llm='llama3_1_small', query_type='best'):
        """
        Initialize class instance 

        """
        match llm:
            # case 'mistral' | 'mistral-nemo' | 'mistral2_2411' | 'mistral-small':
            #     self.__llm = g2mLLMClientMistral(llm_type=llm, query_type=query_type)
            case 'llama3_1_small' | 'llama3_1_large' | 'llama3_3':
                self.__llm = g2mLLMClientLlama(llm_type=llm, query_type=query_type)
            case _:
                print('Unknown LLM type', f'llm_type={llm}')
                self.__llm = None 

    # READ IN TEXT FILE
    def read_in_text_file(self, file):
        with open(file, "r") as file:
            string = file.read()
        return string

    # GET FILE PATHS
    def get_file_paths(folder_path):
        file_paths = []
        for root, directories, files in os.walk(folder_path):
            for filename in files:
                file_path = os.path.join(root, filename)
                file_paths.append(file_path)
        return file_paths
        
    # CONVERT TO TEXT
    def convert_to_text(self, pdf, filepath=None, save=True):
        with pdfplumber.open(pdf) as pdf2:
            page = pdf2.pages[0]
            text = page.extract_text()
            print(pdf)
            # print(text, "\n")
            #Save new text files
            if save: 
                root, ext = os.path.splitext(pdf)
                file = open(f'{root}-pdfplumber.txt',"w")
                file.write(text)
                file.close()
    
        return

    # CONVERT PDF(s)
    def convert_pdfs(self, files, filepath=None):
        for file in files:
            root, ext = os.path.splitext(file)
            if ext == ".pdf":
                try: 
                    self.convert_to_text(file, 'C:\\filepath')
                except: 
                    print("Warning! PDF could not be converted. ", file)

    # BULK ANSWER AND SAVE
    def bulk_answer_and_save(self, system='', files=None, save=False, filepath=None):
        for file in files: 
            user = self.read_in_text_file(file)
            res = self.query(user=user, system=system)
            # Check if res is a dictionary and handle accordingly
            if isinstance(res, dict):
                if 'text' in res:
                    text = res['text']
                else:
                    text = f"Error: {res.get('message', 'Unknown error')}"
            else:
                # This is the old code path, kept for backward compatibility
                obj = json.loads(res.content)
                text = obj['choices'][0]['text']
                
            print(text, "\n")
            if save: 
                root, ext = os.path.splitext(file)
                if filepath is not None: 
                    file_path = f'{filepath}/{root}-LLM-answer.txt'
                else:
                    file_path = f'{root}-LLM-answer.txt'
                with open(file_path, "w") as f:
                    f.write(text)
        return
            
    # QUERY LLM
    def query(self, user='', system='', temperature=LLM_TEMPERATURE, max_tokens=LLM_MAX_TOKENS, query_type='best'):
        """
        Send query to LLM. The user query and system context are provided separately.
        The full prompt is assembled here using the appropriate syntax. 

        :param user: user query, e.g. 'hello, how are you'
        :param system: system context and role, e.g. 'you are business analyst'
        :param temperature:
        :param max_tokens:
        :param query_type:
        :return res:
        """
        if self.__llm is not None:
            res = self.__llm.query(user=user, system=system, temperature=temperature, max_tokens=max_tokens, query_type=query_type)
        else:
            print('LLM type unknown, aborting LLM query', type=self.__llm)
            res = {'status': 'Unavailable', 'message': 'LLM type unknown'}
        return res 

In [67]:
# G2M LLM CLIENT BASE CLASS
class g2mLLMClientBase(ABC):
    """
    Base class handling i/o with the LLM

    Subclasses must set `self._type` to a key of LLM_ENDPOINTS; the url, key
    and model name are then looked up by `setLLm` on every query.
    """
    def setLLm(self, query_type='best'):
        """
        Set the appropriate LLM, especially based on query_type attribute (e.g. 'best' || 'fast')

        :param query_type:
        :return:
        """
        endpoint = LLM_ENDPOINTS[self._type][query_type]
        self._url = endpoint['url']
        self._api_key = endpoint['key']
        self._model_name = endpoint.get('model_name', None)
        self._query_type = query_type

    def query(self, user='', system='', temperature=LLM_TEMPERATURE, max_tokens=LLM_MAX_TOKENS, query_type='best'):
        """
        Send query to LLM. The user query and system context are provided separately.
        The request body is assembled here as a list of chat messages.

        :param user: user query, e.g. 'hello, how are you'
        :param system: system context and role, e.g. 'you are business analyst'
        :param temperature: value of the 'temperature' field of the request
        :param max_tokens: value of the 'max_tokens' field of the request
        :param query_type: endpoint selector (e.g. 'best' || 'fast')
        :return res: dictionary returned by _send_request
        """
        self.setLLm(query_type=query_type)
        body = {
            'messages': [
                {'role': 'system', 'content': system},
                {'role': 'user', 'content': user},
            ],
            'temperature': temperature,
            'max_tokens': max_tokens,
        }
        if self._model_name is not None:
            # print() only accepts sep/end/file/flush keywords, so the detail
            # is passed as a positional string
            print('Querying with model...', f'model_name={self._model_name}')
            body['model'] = self._model_name
        return self._send_request(body)

    def _send_request(self, body):
        """
        Send JSON request to LLM API

        :param body: request payload, sent as JSON
        :return res: {'status': 'Successful', 'text': ...} when a response was
            received and parsed (the text may itself be an error message
            reported by the API), otherwise {'status': 'Unavailable', 'message': ...}
        """
        if self._url is None:
            print('LLM URL not specified, aborting LLM query', f'url={self._url}')
            return {'status': 'Unavailable', 'message': 'URL not specified'}
        if self._api_key is None:
            print('LLM access parameters not valid', f'url={self._url}')
            return {'status': 'Unavailable', 'message': 'Invalid access parameters'}

        try:
            response = requests.post(
                self._url,
                json=body,
                headers={
                    "Accept": "*/*",
                    "Content-Type": "application/json",
                    "Authorization": "Bearer {}".format(self._api_key),
                },
                timeout=REQUEST_TIMEOUT,
            )
            text = self._parse_response(json.loads(response.content))
        except Exception as e:
            # Broad on purpose: network failures, non-JSON bodies and unexpected
            # response shapes are all reported to the caller as 'Unavailable'
            print('Cannot send LLM request: {}'.format(e), f'{self._url=}')
            return {'status': 'Unavailable', 'message': f'Cannot send request: {str(e)}'}

        return {'status': 'Successful', 'text': text}

    def _parse_response(self, obj):
        """
        Parse LLM response

        :param obj: decoded JSON response
        :return text: the answer content, or an error message when the
            response describes an error
        """
        if obj.get('object') == 'Error':
            return obj['message']

        if 'error' in obj:
            error = obj['error']
            # The error may be a dictionary holding a 'message' or a bare value
            message = error.get('message') if isinstance(error, dict) else error
            if isinstance(message, str):
                # Check if 'message' is a stringified JSON
                try:
                    error_message = json.loads(message)
                except json.JSONDecodeError:
                    return message
                print(f'[_parse_response] LLM response returned with error: {error_message}')
                return 'Unable to give a response.'
            if isinstance(message, dict):
                return message.get('message', 'Unable to give a response.')
            return 'Unable to give a response.'

        return obj['choices'][0]['message']['content'].strip()


In [68]:
# G2M LLAMA LLM CLIENT
class g2mLLMClientLlama(g2mLLMClientBase):
    """
    Class handling i/o with a Llama LLM endpoint

    """

    def __init__(self, llm_type='llama3_1_small', query_type='best'):
        """
        Initialize class instance

        :param llm_type: key of LLM_ENDPOINTS identifying the LLM
        :param query_type: key of LLM_ENDPOINTS[llm_type] (e.g. 'best' || 'fast')
        """
        self._type = llm_type
        # Delegate the endpoint lookup to the base class so that url, key,
        # model name and query type are all set from a single place
        self.setLLm(query_type=query_type)

In [None]:
prompt = "you analyze documents"
# os.path.join keeps the example portable across operating systems
files = [os.path.join('pdf_files', 'CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf')]

# Example Use of G2M PDF Parser:
parser = g2mPDFParser()

# Step 1: extract the text; each PDF yields '<name>-pdfplumber.txt' next to it
parser.convert_pdfs(files)

# Step 2: query the LLM with the extracted text files, not with the PDFs,
# because bulk_answer_and_save reads its inputs as plain text.
# PDFs whose conversion failed have no text file and are skipped.
text_files = [f'{os.path.splitext(file)[0]}-pdfplumber.txt' for file in files]
text_files = [text_file for text_file in text_files if os.path.exists(text_file)]
parser.bulk_answer_and_save(system=prompt, files=text_files, save=True)


The CDP (formerly known as the Carbon Disclosure Project) Full Corporate Questionnaire is a comprehensive survey used to assess a company's environmental performance and sustainability practices. The questionnaire is divided into several modules, with Modules 1 to 6 covering the following topics:

**Module 1: Climate Change**

* Questions related to climate change governance, risk management, and strategy
* Disclosure of greenhouse gas (GHG) emissions, emission reduction targets, and progress towards achieving them
* Information on climate-related risks and opportunities, as well as the company's approach to managing them

**Module 2: Risks and Opportunities**

* Questions related to the identification, assessment, and management of climate-related risks and opportunities
* Disclosure of potential climate-related impacts on the company's operations, supply chain, and revenue streams
* Information on the company's approach to integrating climate-related considerations into its business 