In [7]:
import re
import os
import json
import requests
import pdfplumber
import glob
import hashlib
from abc import ABC
from difflib import SequenceMatcher

# Endpoint access parameters. The bearer token is read from the environment
# instead of being hard-coded, so the secret is never committed with the
# source. Set LLM_API_KEY (and optionally LLM_API_URL) before running.
# NOTE(review): the token that used to be embedded here should be treated as
# leaked and rotated.
url = os.environ.get(
    "LLM_API_URL",
    "https://analyzr-llama-33-70b-test.eastus2.models.ai.azure.com/chat/completions",
)
# None when LLM_API_KEY is unset; the client's _send_request then returns an
# 'Unavailable' result instead of sending an unauthenticated request.
key = os.environ.get("LLM_API_KEY")

# Request and generation defaults shared by every LLM query
REQUEST_TIMEOUT = 70  # in seconds
LLM_TEMPERATURE = 0.5
LLM_MAX_TOKENS = 3000

# Endpoint table indexed as LLM_ENDPOINTS[llm_type][query_type]
LLM_ENDPOINTS = {
    'llama3_3': {
        'best': {
            'url': url,
            'key': key,
            'model_name': None,  # Only necessary if model requires it
        },
        'fast': {
            'url': None,  # URL GOES HERE
            'key': None,  # KEY GOES HERE
            'model_name': None,  # Only necessary if model requires it
        },
    },
}

# Cache class for storing LLM responses
class LLMResponseCache:
    """Disk-backed cache for LLM responses, used to avoid repeated queries.

    Entries are held in an in-memory dict (``self.cache``) that is mirrored to
    a single JSON file, ``llm_response_cache.json``, inside ``cache_dir``.
    The file is rewritten on every ``put`` and ``clear``.
    """

    def __init__(self, cache_dir=".cache"):
        """Create ``cache_dir`` if needed and load any existing cache file.

        :param cache_dir: directory holding the cache file
        """
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)
        self.cache = self._load_cache()

    def _generate_key(self, system, user, temperature, max_tokens):
        """Return the hex digest identifying one (system, user, temperature, max_tokens) query.

        The fields are JSON-encoded as a list so that field boundaries are
        unambiguous; a plain delimiter-joined string would let
        ("a|b", "c") and ("a", "b|c") share a key. Entries written under a
        different key scheme are simply never matched.
        """
        key_string = json.dumps([system, user, temperature, max_tokens], default=str)
        # MD5 is used only as a compact fingerprint here, not for security
        return hashlib.md5(key_string.encode("utf-8")).hexdigest()

    def _get_cache_file_path(self):
        """Return the path of the JSON cache file."""
        return os.path.join(self.cache_dir, "llm_response_cache.json")

    def _load_cache(self):
        """Load the cache dict from disk.

        :return: the stored dict, or an empty dict when the file is missing,
            unreadable, or does not contain a JSON object
        """
        cache_file = self._get_cache_file_path()
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            return {}
        except Exception as e:
            print(f"Error loading cache: {str(e)}")
            return {}
        if not isinstance(data, dict):
            # A non-object payload would break the dict lookups in get()/put()
            print("Error loading cache: cache file does not contain a JSON object")
            return {}
        return data

    def _save_cache(self):
        """Write the cache dict to disk; errors are reported, not raised."""
        cache_file = self._get_cache_file_path()
        tmp_file = f"{cache_file}.tmp"
        try:
            # Write to a sibling file first and swap it in, so an interrupted
            # write cannot leave a truncated cache file behind.
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(self.cache, f)
            os.replace(tmp_file, cache_file)
        except Exception as e:
            print(f"Error saving cache: {str(e)}")

    def get(self, system, user, temperature, max_tokens):
        """Return the cached response for these inputs, or None if absent."""
        key = self._generate_key(system, user, temperature, max_tokens)
        return self.cache.get(key)

    def put(self, system, user, temperature, max_tokens, response):
        """Store ``response`` for these inputs and persist the cache."""
        key = self._generate_key(system, user, temperature, max_tokens)
        self.cache[key] = response
        self._save_cache()

    def clear(self):
        """Remove every entry and persist the empty cache."""
        self.cache = {}
        self._save_cache()

    def get_cache_stats(self):
        """Return ``total_entries`` and ``cache_size_bytes`` for the cache.

        ``cache_size_bytes`` is 0 when the cache file cannot be measured.
        """
        try:
            size_bytes = os.path.getsize(self._get_cache_file_path())
        except OSError:
            size_bytes = 0
        return {
            "total_entries": len(self.cache),
            "cache_size_bytes": size_bytes,
        }

# LLM client classes
class g2mLLMClientBase(ABC):
    """Base class handling I/O with the LLM.

    ``setLLm`` reads ``self._type`` to pick an entry of ``LLM_ENDPOINTS``, so
    subclasses must set ``self._type`` before ``query`` is called.
    """

    def __init__(self):
        """Initialize with cache"""
        self.cache = LLMResponseCache()

    def setLLm(self, query_type='best'):
        """
        Set the appropriate LLM, especially based on query_type attribute (e.g. 'best' || 'fast')

        :param query_type: key of ``LLM_ENDPOINTS[self._type]`` to select
        :return: None
        """
        endpoint = LLM_ENDPOINTS[self._type][query_type]
        self._url = endpoint['url']
        self._api_key = endpoint['key']
        self._model_name = endpoint.get('model_name', None)
        self._query_type = query_type

    def query(self, user='', system='', temperature=LLM_TEMPERATURE, max_tokens=LLM_MAX_TOKENS, query_type='best', use_cache=True):
        """
        Send query to LLM. The user query and system context are provided separately.
        The request body is assembled here as a system message followed by a user message.

        :param user: user query, e.g. 'hello, how are you'
        :param system: system context and role, e.g. 'you are business analyst'
        :param temperature: sampling temperature forwarded to the API
        :param max_tokens: token limit forwarded to the API
        :param query_type: endpoint variant to use, e.g. 'best' or 'fast'
        :param use_cache: whether to read from and write to the response cache
        :return res: dict with a 'status' key, plus 'text' and/or 'message'
        """
        # Try to get from cache first if enabled.
        # NOTE: the cache lookup uses system/user/temperature/max_tokens only;
        # query_type is not part of it.
        if use_cache:
            cached_response = self.cache.get(system, user, temperature, max_tokens)
            if cached_response:
                print("Using cached LLM response")
                return cached_response

        # No cache hit, make a new request
        self.setLLm(query_type=query_type)
        body = {
            'messages': [
                {'role': 'system', 'content': system},
                {'role': 'user', 'content': user},
            ],
            'temperature': temperature,
            'max_tokens': max_tokens,
        }
        if self._model_name is not None:
            print('Querying with model...', f'model_name={self._model_name}')
            body['model'] = self._model_name

        response = self._send_request(body)

        # Only successful responses are cached, so failures are retried next time
        if use_cache and response.get('status') == 'Successful':
            self.cache.put(system, user, temperature, max_tokens, response)

        return response

    def _send_request(self, body):
        """
        Send JSON request to LLM API

        :param body: request payload, sent as JSON
        :return res: {'status': 'Successful', 'text': ...} on success;
            {'status': 'Error', 'text': ..., 'message': ...} when the API
            answered with an error payload; {'status': 'Unavailable',
            'message': ...} when the request could not be made or parsed
        """
        try:
            if self._url is not None and self._api_key is not None:
                http_response = requests.post(
                    self._url,
                    json=body,
                    headers={
                        "Accept": "*/*",
                        "Content-Type": "application/json",
                        "Authorization": "Bearer {}".format(self._api_key),
                    },
                    timeout=REQUEST_TIMEOUT,
                )
                obj = json.loads(http_response.content)
                text = self._parse_response(obj)

                if self._is_error_payload(obj):
                    # Keep 'text' so callers that only read it still work, but
                    # do not report success: query() would otherwise cache the
                    # error text and replay it for every later identical query.
                    res = {'status': 'Error', 'text': text, 'message': text}
                else:
                    res = {'status': 'Successful', 'text': text}
            elif self._url is None:
                print('LLM URL not specified, aborting LLM query', f'url={self._url}')
                res = {'status': 'Unavailable', 'message': 'URL not specified'}
            else:
                print('LLM access parameters not valid', f'url={self._url}')
                res = {'status': 'Unavailable', 'message': 'Invalid access parameters'}

        except Exception as e:
            print('Cannot send LLM request: {}'.format(e), f'url={self._url}')
            res = {'status': 'Unavailable', 'message': f'Cannot send request: {str(e)}'}

        return res

    @staticmethod
    def _is_error_payload(obj):
        """Return True when ``obj`` matches one of the error shapes handled by ``_parse_response``."""
        return isinstance(obj, dict) and (obj.get('object') == 'Error' or 'error' in obj)

    def _parse_response(self, obj):
        """
        Parse LLM response

        :param obj: decoded JSON body of the API response
        :return text: the first choice's message content, or an error message
        """
        if 'object' in obj and obj['object'] == 'Error':
            text = obj['message']
        elif 'error' in obj:
            error_message = obj['error']['message']
            if isinstance(error_message, str):
                # The message may itself be stringified JSON; in that case the
                # details are logged and a generic text is returned instead.
                try:
                    decoded = json.loads(error_message)
                    print(f'[_parse_response] LLM response returned with error: {decoded}')
                    text = 'Unable to give a response.'
                except json.JSONDecodeError:
                    text = error_message
            else:
                text = error_message.get('message', 'Unable to give a response.')
        else:
            text = obj['choices'][0]['message']['content'].strip()
        return text

    def clear_cache(self):
        """Clear the response cache"""
        self.cache.clear()
        print("Cache cleared")

class g2mLLMClientLlama(g2mLLMClientBase):
    """Class handling I/O with Llama LLM"""

    def __init__(self, llm_type='llama3_3', query_type='best'):
        """
        Initialize class instance

        :param llm_type: top-level key of ``LLM_ENDPOINTS``
        :param query_type: endpoint variant under that key, e.g. 'best' or 'fast'
        """
        super().__init__()  # Initialize the base class with cache
        endpoint = LLM_ENDPOINTS[llm_type][query_type]
        self._url = endpoint['url']
        self._api_key = endpoint['key']
        # Set here as well so the attribute exists before the first setLLm() call
        self._model_name = endpoint.get('model_name')
        self._type = llm_type
        self._query_type = query_type

class g2mPDFParser:
    """Class handling I/O with the LLM"""
    
    def __init__(self, llm='llama3_3', query_type='best'):
        """Initialize class instance"""
        match llm:
            case 'llama3_1_small' | 'llama3_1_large' | 'llama3_3':
                self.__llm = g2mLLMClientLlama(llm_type=llm, query_type=query_type)
            case _:
                print('Unknown LLM type', f'llm_type={llm}')
                self.__llm = None 
        
        # Default cache setting
        self.use_cache = True

    def read_in_text_file(self, file):
        """Read text from a file"""
        with open(file, "r", encoding="utf-8") as f:
            string = f.read()
        return string

    @staticmethod
    def get_file_paths(folder_path):
        """Get all file paths in a folder"""
        file_paths = []
        for root, directories, files in os.walk(folder_path):
            for filename in files:
                file_path = os.path.join(root, filename)
                file_paths.append(file_path)
        return file_paths
        
    def convert_to_text(self, pdf, filepath=None, save=True):
        """Convert PDF to text"""
        with pdfplumber.open(pdf) as pdf2:
            # Extract text from all pages, not just the first one
            all_text = ""
            for page in pdf2.pages:
                text = page.extract_text()
                if text:
                    all_text += text + "\n\n"
            
            print(f"Extracted {len(all_text)} characters from {pdf}")
            
            # Save text to file
            if save: 
                root, ext = os.path.splitext(pdf)
                with open(f'{root}-pdfplumber.txt', "w", encoding="utf-8") as file:
                    file.write(all_text)
            
        return all_text

    def convert_pdfs(self, files, filepath=None):
        """Convert multiple PDFs to text"""
        results = []
        for file in files:
            root, ext = os.path.splitext(file)
            if ext.lower() == ".pdf":
                try: 
                    text = self.convert_to_text(file, filepath)
                    results.append({"file": file, "status": "success", "text": text})
                except Exception as e: 
                    print(f"Warning! PDF could not be converted: {file}. Error: {str(e)}")
                    results.append({"file": file, "status": "error", "message": str(e)})
        return results

    def bulk_answer_and_save(self, system='', files=None, save=False, filepath=None, max_sections=None):
        """
        Process multiple files with LLM
        
        :param max_sections: Maximum number of sections to process (for debugging)
        """
        if files is None:
            files = []
            
        results = []
        for file in files: 
            # Handle the file extension
            root, ext = os.path.splitext(file)
            
            try:
                if ext.lower() == ".pdf":
                    # Convert PDF to text if needed
                    text_file = f'{root}-pdfplumber.txt'
                    if not os.path.exists(text_file):
                        self.convert_to_text(file, filepath)
                elif ext.lower() == ".txt":
                    # Already a text file
                    text_file = file
                else:
                    print(f"Unsupported file type: {ext}")
                    continue
                
                # Read the text content
                user = self.read_in_text_file(text_file)
                
                # Process sections and query LLM
                sections = self.split_text_into_sections(user)
                all_questions = []
                
                # Apply section limit if specified
                if max_sections is not None and max_sections > 0:
                    print(f"Debug mode: Processing only first {max_sections} of {len(sections)} sections")
                    sections = sections[:max_sections]
                
                for i, section in enumerate(sections):
                    section_text, section_header = section
                    print(f"Processing section {i+1}/{len(sections)}: {section_header}")
                    
                    # Query LLM for this section
                    result = self.query(user=section_text, system=system, use_cache=self.use_cache)
                    
                    # Handle response
                    if isinstance(result, dict) and 'text' in result:
                        section_questions = result['text']
                    else:
                        try:
                            obj = json.loads(result.content)
                            section_questions = obj['choices'][0]['text']
                        except:
                            section_questions = f"Error processing section {section_header}"
                    
                    # Add to results
                    all_questions.append({
                        "section": section_header,
                        "questions": section_questions
                    })
                    
                # Save results if requested
                if save:
                    answer_file = f'{root}-questions.json'
                    if filepath is not None:
                        answer_file = os.path.join(filepath, os.path.basename(answer_file))
                    
                    with open(answer_file, "w", encoding="utf-8") as f:
                        json.dump(all_questions, f, indent=2)
                    
                    print(f"Saved questions to {answer_file}")
                
                results.append({
                    "file": file,
                    "status": "success",
                    "questions": all_questions
                })
                    
            except Exception as e:
                print(f"Error processing {file}: {str(e)}")
                results.append({
                    "file": file,
                    "status": "error",
                    "message": str(e)
                })
        
        return results

    def query(self, user='', system='', temperature=LLM_TEMPERATURE, max_tokens=LLM_MAX_TOKENS, query_type='best', use_cache=None):
        """
        Query the LLM with optional cache control
        
        :param use_cache: Override instance cache setting
        """
        # Determine whether to use cache
        should_use_cache = self.use_cache if use_cache is None else use_cache
        
        if self.__llm is not None:
            res = self.__llm.query(
                user=user, 
                system=system, 
                temperature=temperature, 
                max_tokens=max_tokens, 
                query_type=query_type,
                use_cache=should_use_cache
            )
        else:
            print('LLM type unknown, aborting LLM query', f'type={self.__llm}')
            res = {'status': 'Unavailable', 'message': 'LLM type unknown'}
        return res
    
    def clear_cache(self):
        """Clear the response cache"""
        if self.__llm is not None:
            self.__llm.clear_cache()
    
    def set_cache_enabled(self, enabled=True):
        """Set whether caching is enabled"""
        self.use_cache = enabled
        print(f"Caching {'enabled' if enabled else 'disabled'}")
    
    def split_text_into_sections(self, text):
        """
        Split text into sections based on section headers like (1.2) or (1.2.3)
        Returns a list of tuples: [(section_text, section_header), ...]
        """
        # Pattern to match section headers like (1.2) or (1.2.3)
        pattern = r'(\(\d+\.\d+(?:\.\d+)?\))'
        
        # Find all matches
        matches = list(re.finditer(pattern, text))
        
        sections = []
        
        # Process each match
        for i in range(len(matches)):
            # Get the current section header
            header = matches[i].group(1)
            
            # Get the start of this section
            start = matches[i].start()
            
            # Get the end of this section (start of next section or end of text)
            if i < len(matches) - 1:
                end = matches[i + 1].start()
            else:
                end = len(text)
            
            # Extract the section text
            section_text = text[start:end].strip()
            
            # Add to our list of sections
            sections.append((section_text, header))
        
        # If there's text before the first section, include it as a prologue
        if matches and matches[0].start() > 0:
            prologue = text[:matches[0].start()].strip()
            if prologue:
                sections.insert(0, (prologue, "Prologue"))
        
        # If no sections were found, return the entire text as one section
        if not sections:
            sections = [(text, "Full Document")]
        
        return sections

class SectionProcessor:
    """Process text files by section and extract questions using LLM"""

    def __init__(self, llm_type='llama3_3', query_type='best'):
        """
        Initialize with LLM client

        :param llm_type: LLM type passed to ``g2mPDFParser``
        :param query_type: endpoint variant passed to ``g2mPDFParser``
        """
        self.parser = g2mPDFParser(llm=llm_type, query_type=query_type)
        # Sent verbatim as the system message for every section
        self.system_prompt = """
        You are an expert at extracting questions from text content. 
        
        Your task is to analyze the given section of text and identify any explicit or implicit questions it contains.
        
        For each question you identify:
        1. Extract or formulate the complete question
        2. Ensure the question makes sense on its own without needing additional context
        3. Preserve the original meaning and intent of the question
        
        Return ONLY a numbered list of questions, with one question per line.
        Do not include any explanations, summaries, or additional text.
        
        Important guidelines:
        - Focus on extracting questions that are seeking specific information
        - If the text contains incomplete questions, formulate them into complete questions
        - Avoid creating questions that weren't implied in the original text
        - Maintain the technical terminology and specificity of the original content
        """

    def process_file(self, file_path, output_dir=None, max_sections=None, use_cache=True):
        """
        Process a single text file and extract questions by section

        :param file_path: path of a .pdf file (converted first) or a text file
        :param output_dir: directory for the result JSON; defaults to the input file's directory
        :param max_sections: Maximum number of sections to process (for debugging)
        :param use_cache: Whether to use the LLM response cache
        :return: path of the written ``<name>-questions.json``, or None on failure
        """
        # Set cache configuration
        self.parser.set_cache_enabled(use_cache)

        # Determine output directory. A bare filename has an empty dirname,
        # which os.makedirs rejects, so fall back to the current directory.
        if output_dir is None:
            output_dir = os.path.dirname(file_path) or "."

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Generate output filename
        base_name = os.path.basename(file_path)
        name_without_ext = os.path.splitext(base_name)[0]
        output_file = os.path.join(output_dir, f"{name_without_ext}-questions.json")

        # Read the file
        if file_path.lower().endswith('.pdf'):
            # Convert PDF to text first. Conversion errors are reported like
            # text-read errors so one bad PDF does not abort a directory run.
            try:
                text = self.parser.convert_to_text(file_path, save=True)
            except Exception as e:
                print(f"Error reading file {file_path}: {str(e)}")
                return None
            if not text:
                print(f"Error: Could not extract text from PDF {file_path}")
                return None
        else:
            # Read text file directly
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            except Exception as e:
                print(f"Error reading file {file_path}: {str(e)}")
                return None

        # Split into sections
        sections = self.parser.split_text_into_sections(text)
        print(f"Found {len(sections)} sections in {file_path}")

        # Apply section limit if specified
        if max_sections is not None and max_sections > 0:
            print(f"Debug mode: Processing only first {max_sections} of {len(sections)} sections")
            sections = sections[:max_sections]

        # Process each section. `sections` is already truncated, so its length
        # is the correct progress total even when max_sections exceeds it.
        section_count = len(sections)
        results = []
        for position, (section_text, section_header) in enumerate(sections, start=1):
            print(f"Processing section {position}/{section_count}: {section_header}")

            # Skip very short sections
            if len(section_text.strip()) < 50:
                print(f"Skipping section {section_header} - too short")
                continue

            # Extract questions using LLM
            try:
                response = self.parser.query(
                    user=section_text,
                    system=self.system_prompt,
                    temperature=0.3  # Lower temperature for more consistent outputs
                )

                if isinstance(response, dict) and 'text' in response:
                    # Turn the numbered list returned by the LLM into clean strings
                    questions = self.clean_questions_list(response['text'])

                    results.append({
                        "section_id": section_header,
                        # Only a short preview of the section is stored
                        "section_text": section_text[:100] + "..." if len(section_text) > 100 else section_text,
                        "questions": questions
                    })
                else:
                    print(f"Error: Unexpected response format for section {section_header}")
                    results.append({
                        "section_id": section_header,
                        "error": "Unexpected response format",
                        "raw_response": str(response)
                    })
            except Exception as e:
                print(f"Error processing section {section_header}: {str(e)}")
                results.append({
                    "section_id": section_header,
                    "error": str(e)
                })

        # Save results to JSON file
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump({
                    "file": file_path,
                    "total_sections": len(sections),
                    "processed_sections": len(results),
                    "results": results
                }, f, indent=2)

            print(f"Results saved to {output_file}")
            return output_file
        except Exception as e:
            print(f"Error saving results: {str(e)}")
            return None

    def process_directory(self, directory_path, output_dir=None, max_sections=None, use_cache=True):
        """
        Process all text and PDF files in a directory

        :param directory_path: directory searched (non-recursively) for *.txt and *.pdf
        :param output_dir: result directory; defaults to ``<directory_path>/questions_output``
        :param max_sections: Maximum number of sections to process per file (for debugging)
        :param use_cache: Whether to use the LLM response cache
        :return: path of the written ``processing_summary.json``
        """
        if output_dir is None:
            output_dir = os.path.join(directory_path, "questions_output")

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Get all text and PDF files (text files first)
        files = []
        for extension in ('.txt', '.pdf'):
            files.extend(glob.glob(os.path.join(directory_path, f"*{extension}")))

        results = []
        for file in files:
            print(f"Processing file: {file}")
            output_file = self.process_file(
                file,
                output_dir,
                max_sections=max_sections,
                use_cache=use_cache
            )
            if output_file:
                results.append({
                    "input_file": file,
                    "output_file": output_file,
                    "status": "success"
                })
            else:
                results.append({
                    "input_file": file,
                    "status": "error"
                })

        # Save summary
        summary_file = os.path.join(output_dir, "processing_summary.json")
        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump({
                "directory": directory_path,
                "files_processed": len(files),
                "successful": sum(1 for r in results if r["status"] == "success"),
                "failed": sum(1 for r in results if r["status"] == "error"),
                "details": results
            }, f, indent=2)

        return summary_file

    def clean_questions_list(self, text):
        """
        Clean up the list of questions from LLM output

        :param text: raw LLM output, expected to hold one question per line
        :return: list of non-empty lines with leading numbering removed
        """
        questions = []
        for line in text.strip().split('\n'):
            # Skip empty lines
            if not line.strip():
                continue

            # Remove numbering and leading/trailing whitespace
            # Match patterns like "1.", "1)", "Question 1:", etc.
            cleaned = re.sub(r'^\s*(\d+[\.\):]|Question\s+\d+:?)\s*', '', line).strip()

            if cleaned:
                questions.append(cleaned)

        return questions

    def clear_cache(self):
        """Clear the LLM response cache"""
        self.parser.clear_cache()
        print("Cache cleared")

    def get_cache_stats(self):
        """
        Get statistics about the cache

        :return: the LLM client's cache statistics, or an 'error' dict when the
            parser has no LLM client
        """
        # The parser stores its client in a name-mangled private attribute
        llm_client = getattr(self.parser, '_g2mPDFParser__llm', None)
        if llm_client is not None:
            return llm_client.cache.get_cache_stats()
        return {"error": "LLM client not properly initialized"}


# Example usage
if __name__ == "__main__":
    # Initialize the processor
    processor = SectionProcessor(llm_type='llama3_3', query_type='best')

    # Create a sample text file for testing.
    # The path is built with os.path.join: the previous literal contained
    # "\C", an invalid escape sequence that Python warns about.
    # NOTE: the file is opened in "w" mode, so an existing file is overwritten.
    input_file = os.path.join(
        "pdf_files",
        "CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6-debug-extraction.txt",
    )
    # open(..., "w") does not create missing parent directories
    os.makedirs(os.path.dirname(input_file), exist_ok=True)
    with open(input_file, "w", encoding="utf-8") as f:
        f.write("""
        Introduction to the document
        
        (1.1) First Section
        This is content of the first section which discusses important concepts.
        What are the key areas to focus on for this topic?
        The document highlights several approaches to consider when implementing these ideas.
        
        (1.2) Second Section
        In this section, we examine the relationship between various factors.
        How do these factors influence outcomes?
        It's important to understand the implications for practical applications.
        
        (2.1) Another Major Section
        This section introduces new methodologies and frameworks.
        Which framework is most appropriate for different scenarios?
        Consider how these methodologies can be adapted to specific contexts.
        
        (2.1.1) Subsection with more detail
        Here we dive deeper into specific aspects of the framework.
        What are the limitations of this approach?
        Several case studies demonstrate successful implementation.
        """)

    # Process the sample file with caching enabled and limiting to 2 sections
    print("Processing sample file (first 2 sections only)...")
    output_file = processor.process_file(input_file, max_sections=2, use_cache=True)

    # Print cache stats
    print("Cache statistics:", processor.get_cache_stats())

    # Process the same file again, should use cached responses for the first 2 sections
    print("\nProcessing sample file again (all sections)...")
    output_file = processor.process_file(input_file, use_cache=True)

    # Print updated cache stats
    print("Cache statistics after second run:", processor.get_cache_stats())

    # Clean up sample file
    # os.remove(input_file)

    print(f"\nProcessing complete. Results saved to {output_file}")

    # Example of how to clear the cache if needed
    # processor.clear_cache()

Processing sample file (first 2 sections only)...
Caching enabled
Found 5 sections in pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6-debug-extraction.txt
Debug mode: Processing only first 2 of 5 sections
Processing section 1/2: Prologue
Skipping section Prologue - too short
Processing section 2/2: (1.1)
Using cached LLM response
Results saved to pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6-debug-extraction-questions.json
Cache statistics: {'total_entries': 4, 'cache_size_bytes': 566}

Processing sample file again (all sections)...
Caching enabled
Found 5 sections in pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6-debug-extraction.txt
Processing section 1/5: Prologue
Skipping section Prologue - too short
Processing section 2/5: (1.1)
Using cached LLM response
Processing section 3/5: (1.2)
Using cached LLM response
Processing section 4/5: (2.1)
Using cached LLM response
Processing section 5/5: (2.1.1)
Using cached LLM response
Results s

  input_file = "pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6-debug-extraction.txt"
