# In [1]:  (notebook cell marker)
import os
import json
import pandas as pd
import pdfplumber
import re
from datetime import datetime
import requests
from abc import ABC
import time

# Configuration constants
# Passed as `timeout=` to requests.post for every LLM call.
REQUEST_TIMEOUT = 70  # in seconds
# Default `temperature` argument of g2mLLMClientBase.query.
LLM_TEMPERATURE = 0.4  # Lowered for more structured/consistent output
# Default `max_tokens` argument of g2mLLMClientBase.query.
LLM_MAX_TOKENS = 3000
# Endpoint registry, indexed as LLM_ENDPOINTS[llm_type][query_type].
# Each leaf provides the 'url', 'key' and optional 'model_name' read by the
# client classes. While 'url' or 'key' is None the client does not send any
# HTTP request and reports status 'Unavailable' instead.
LLM_ENDPOINTS = {
    'llama3_3': {
        'best': {
            'url': None,  # URL GOES HERE
            'key': None,  # KEY GOES HERE
            'model_name': None,  # Only necessary if model requires it
        },
        'fast': {
            'url': None,  # URL GOES HERE
            'key': None,  # KEY GOES HERE
            'model_name': None,  # Only necessary if model requires it
        }
    },
}

# Base LLM client class
class g2mLLMClientBase(ABC):
    """Base class handling I/O with the LLM.

    This class never assigns ``self._type`` itself; it must already hold a key
    of ``LLM_ENDPOINTS`` before ``setLLm`` or ``query`` is called.
    """

    def setLLm(self, query_type='best'):
        """Load the endpoint settings for ``query_type`` onto the instance.

        Args:
            query_type: Key under ``LLM_ENDPOINTS[self._type]``.

        Raises:
            KeyError: If the type / query type (or its 'url' / 'key' entry)
                is missing from ``LLM_ENDPOINTS``.
        """
        endpoint = LLM_ENDPOINTS[self._type][query_type]
        self._url = endpoint['url']
        self._api_key = endpoint['key']
        self._model_name = endpoint.get('model_name', None)
        self._query_type = query_type

    def query(self, user='', system='', temperature=LLM_TEMPERATURE, max_tokens=LLM_MAX_TOKENS, query_type='best'):
        """Send one system + user message pair to the configured endpoint.

        Args:
            user: Content of the 'user' message.
            system: Content of the 'system' message.
            temperature: Value sent as 'temperature' in the request body.
            max_tokens: Value sent as 'max_tokens' in the request body.
            query_type: Endpoint variant to use; re-selected on every call.

        Returns:
            The dict produced by ``_send_request``.
        """
        self.setLLm(query_type=query_type)
        body = {
            'messages': [
                {'role': 'system', 'content': system},
                {'role': 'user', 'content': user},
            ],
            'temperature': temperature,
            'max_tokens': max_tokens,
        }
        # The 'model' field is only sent when the endpoint config names one.
        if self._model_name is not None:
            print('Querying with model...', f'model_name={self._model_name}')
            body['model'] = self._model_name
        return self._send_request(body)

    def _send_request(self, body):
        """POST ``body`` as JSON and wrap the outcome in a status dict.

        Returns:
            ``{'status': 'Successful', 'text': ...}`` when a response was
            received and parsed, otherwise
            ``{'status': 'Unavailable', 'message': ...}``. Exceptions raised
            while sending or parsing are caught and reported in 'message'.
        """
        # Configuration problems are reported without touching the network.
        if self._url is None:
            print('LLM URL not specified, aborting LLM query', f'url={self._url}')
            return {'status': 'Unavailable', 'message': 'URL not specified'}
        if self._api_key is None:
            print('LLM access parameters not valid', f'url={self._url}')
            return {'status': 'Unavailable', 'message': 'Invalid access parameters'}

        try:
            response = requests.post(
                self._url,
                json=body,
                headers={
                    "Accept": "*/*",
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {self._api_key}",
                },
                timeout=REQUEST_TIMEOUT,
            )
            payload = json.loads(response.content)
            text = self._parse_response(payload)
        except Exception as e:
            # Best-effort boundary: every failure becomes an 'Unavailable' result.
            print(f'Cannot send LLM request: {e}', f'url={self._url}')
            return {'status': 'Unavailable', 'message': f'Cannot send request: {str(e)}'}

        return {'status': 'Successful', 'text': text}

    def _parse_response(self, obj):
        """Pull the reply text (or an error message) out of a decoded response.

        Handles three payload shapes: ``{'object': 'Error', 'message': ...}``,
        ``{'error': ...}`` and ``{'choices': [{'message': {'content': ...}}]}``.

        Args:
            obj: The response body decoded from JSON (a dict).

        Returns:
            The text to hand back to the caller.
        """
        fallback = 'Unable to give a response.'

        if obj.get('object') == 'Error':
            return obj['message']

        if 'error' in obj:
            error = obj['error']
            # 'error' may be a bare string rather than a {'message': ...} dict;
            # indexing it with ['message'] would raise TypeError.
            message = error['message'] if isinstance(error, dict) else error
            if isinstance(message, str):
                try:
                    decoded = json.loads(message)
                except json.JSONDecodeError:
                    # Plain-text error message: return it as is.
                    return message
                # The message was itself JSON: log it, return the generic text.
                print(f'[_parse_response] LLM response returned with error: {decoded}')
                return fallback
            if isinstance(message, dict):
                return message.get('message', fallback)
            # Any other payload type (None, list, ...) has no usable text.
            return fallback

        return obj['choices'][0]['message']['content'].strip()

# Llama LLM client
class g2mLLMClientLlama(g2mLLMClientBase):
    """Class handling I/O with the Llama LLM"""

    def __init__(self, llm_type='llama3_3', query_type='best'):
        """Select the endpoint family and load its connection settings.

        Args:
            llm_type: Top-level key of ``LLM_ENDPOINTS``.
            query_type: Endpoint variant under that key.

        Raises:
            KeyError: If ``llm_type`` / ``query_type`` is not configured.
        """
        # _type must be set first: setLLm reads it to find the endpoint.
        self._type = llm_type
        # Reuse the shared loader so _url, _api_key, _model_name and
        # _query_type are all initialised in one place.
        self.setLLm(query_type=query_type)

# Section extractor class
class QuestionnaireSectionExtractor:
    """Class to extract sections and tables from questionnaire PDFs"""
    
    def __init__(self, output_dir=None, max_sections=None):
        """Initialize the extractor with an optional output directory and section limit"""
        self.output_dir = output_dir
        self.max_sections = max_sections  # New parameter for section limit
    
    def extract_from_pdf(self, pdf_file):
        """Extract sections and tables from the PDF.

        Works in two passes over the document opened with ``pdfplumber``:

        1. Scan each page's extracted text for numbered headings such as
           ``(1.1) Title`` or ``1.2.3 Title`` and record one section per
           heading, up to ``self.max_sections`` if that is set.
        2. Extract tables from every page covered by a recorded section and
           attach each one to a section via ``_find_section_for_table``.

        The sections are finally passed through ``_enhance_sections``.

        Args:
            pdf_file: Path of the PDF, passed to ``pdfplumber.open``.

        Returns:
            ``{"document": {"filename", "extraction_date", "sections"}}`` on
            success, or ``{"error": <message>}`` when any exception occurs.
            Exceptions are printed with a traceback and do not propagate.
        """
        print(f"Extracting sections and tables from: {pdf_file}")
        
        # Store extracted data
        sections = []
        # Title and id of the heading currently being accumulated; both stay
        # None until the first heading is found.
        current_section = None
        current_section_id = None
        # Filled per page below but not read again inside this method.
        page_heights = {}
        
        try:
            with pdfplumber.open(pdf_file) as pdf:
                # First pass: identify sections and their boundaries
                for page_num, page in enumerate(pdf.pages):
                    page_number = page_num + 1
                    page_heights[page_number] = page.height
                    print(f"Processing page {page_number} of {len(pdf.pages)}")
                    
                    # Extract text
                    text = page.extract_text() or ""
                    
                    # Find section headers using regex
                    # Pattern for section identifiers like (1.1) or 1.1 
                    # NOTE(review): the pattern does not depend on the page but
                    # is rebuilt on every iteration.
                    # NOTE(review): any "digits.digits text" run matches, so a
                    # decimal in body text (e.g. "2.5 million") would also be
                    # taken as a heading — confirm this is acceptable.
                    section_pattern = re.compile(r'(?:\()?(\d+\.\d+(?:\.\d+)?)(?:\))?\s+([^\n\.]+)')
                    
                    for match in section_pattern.finditer(text):
                        section_id = match.group(1)
                        section_title = match.group(2).strip()
                        
                        # Skip very short or likely non-section titles
                        if len(section_title) < 3 or section_title.isdigit():
                            continue
                        
                        # If we have a current section, finalize it before starting a new one
                        if current_section is not None:
                            sections.append({
                                "section_id": current_section_id,
                                "section_title": current_section,
                                "start_page": current_section_page,
                                # When the next heading is on a later page the
                                # section is closed on the page before it.
                                "end_page": page_number - 1 if page_number > current_section_page else page_number,
                                "start_position": current_section_position,
                                # Character offset of the next heading within
                                # the text of the page it was found on.
                                "end_position": match.start(),
                                "tables": []  # Will be filled in second pass
                            })
                            
                            # Check if we've reached the section limit
                            if self.max_sections and len(sections) >= self.max_sections:
                                print(f"Reached section limit of {self.max_sections}. Stopping extraction.")
                                # The heading that triggered the limit is not
                                # started as a new section.
                                break
                        
                        # Start new section
                        current_section = section_title
                        current_section_id = section_id
                        current_section_page = page_number
                        # NOTE(review): this is a character offset into the
                        # page text, while _find_section_for_table compares it
                        # with table bounds taken from a bbox — the two look
                        # like different units; confirm.
                        current_section_position = match.start()
                    
                    # Check if we've reached the section limit after processing this page
                    if self.max_sections and len(sections) >= self.max_sections:
                        break
                
                # Add the last section if there is one and we haven't reached the limit
                if current_section is not None and (self.max_sections is None or len(sections) < self.max_sections):
                    sections.append({
                        "section_id": current_section_id,
                        "section_title": current_section,
                        "start_page": current_section_page,
                        "end_page": len(pdf.pages),
                        "start_position": current_section_position,
                        # Open-ended: the final section runs to the end of the
                        # document.
                        # NOTE(review): json.dump writes float('inf') as the
                        # non-standard token Infinity by default — confirm
                        # consumers of the saved file accept that.
                        "end_position": float('inf'),
                        "tables": []
                    })
                
                # Second pass: extract tables and assign to sections
                # Only process pages that contain our extracted sections
                relevant_pages = set()
                for section in sections:
                    for page_num in range(section["start_page"], section["end_page"] + 1):
                        relevant_pages.add(page_num)
                
                for page_num in sorted(relevant_pages):
                    page = pdf.pages[page_num - 1]  # Adjust for 0-based indexing
                    
                    # Extract tables with standard settings
                    tables = page.extract_tables()
                    
                    # Process tables on this page
                    for table_idx, table in enumerate(tables):
                        if not table or len(table) == 0:
                            continue
                        
                        # Get table bounds
                        table_bounds = self._get_table_bounds(page, table)
                        
                        # Find which section this table belongs to
                        section_idx = self._find_section_for_table(sections, page_num, table_bounds)
                        
                        if section_idx is not None:
                            # Clean and process table
                            processed_table = self._process_table(table)
                            
                            # Add to section's tables
                            sections[section_idx]["tables"].append({
                                "page": page_num,
                                "table_index": table_idx,
                                "bounds": table_bounds,
                                "data": processed_table
                            })
                    
                    # Try alternative table extraction for complex tables if needed
                    # (only when the default settings found nothing on this page)
                    if not tables:
                        alt_tables = page.extract_tables({
                            "vertical_strategy": "text", 
                            "horizontal_strategy": "text",
                            "intersection_tolerance": 5
                        })
                        
                        for table_idx, table in enumerate(alt_tables):
                            if not table or len(table) == 0:
                                continue
                            
                            # Get table bounds (approximate for alternative tables)
                            table_bounds = self._get_table_bounds(page, table, is_alt=True)
                            
                            # Find which section this table belongs to
                            section_idx = self._find_section_for_table(sections, page_num, table_bounds)
                            
                            if section_idx is not None:
                                # Clean and process table
                                processed_table = self._process_table(table)
                                
                                # Add to section's tables
                                # The "alt_" prefix marks tables found by the
                                # text-based fallback strategy.
                                sections[section_idx]["tables"].append({
                                    "page": page_num,
                                    "table_index": f"alt_{table_idx}",
                                    "bounds": table_bounds,
                                    "data": processed_table
                                })
            
            # Post-process sections to identify table types and relationships
            enhanced_sections = self._enhance_sections(sections)
            
            # Create structured output
            result = {
                "document": {
                    "filename": os.path.basename(pdf_file),
                    "extraction_date": datetime.now().isoformat(),
                    "sections": enhanced_sections
                }
            }
            
            return result
            
        except Exception as e:
            # Top-level boundary: report the failure in the return value
            # instead of raising.
            print(f"Error extracting sections and tables: {str(e)}")
            import traceback
            traceback.print_exc()
            return {"error": str(e)}
    
    def _get_table_bounds(self, page, table, is_alt=False):
        """Get the bounds of a table on the page"""
        if is_alt:
            # For alternative extraction, we don't have precise bounds
            # Make an estimate based on the page dimensions
            return {
                "x0": 0,
                "top": 0,
                "x1": page.width,
                "bottom": page.height
            }
        
        # Try to find the table on the page
        tables = page.find_tables()
        
        for t in tables:
            # Check if this is our table by comparing content
            # Convert both to string representations for comparison
            table_str = str([[str(cell) for cell in row] for row in table])
            found_table_str = str([[str(cell) for cell in row] for row in t.extract()])
            
            # If content matches approximately
            if len(table_str) > 0 and len(found_table_str) > 0 and (
                table_str[:100] in found_table_str or found_table_str[:100] in table_str):
                
                return {
                    "x0": t.bbox[0],
                    "top": t.bbox[1],
                    "x1": t.bbox[2],
                    "bottom": t.bbox[3]
                }
        
        # If not found, make an estimate
        return {
            "x0": 0,
            "top": 0,
            "x1": page.width,
            "bottom": page.height
        }
    
    def _find_section_for_table(self, sections, page, table_bounds):
        """Find which section a table belongs to based on page and position"""
        for idx, section in enumerate(sections):
            # If table is on a page between section start and end
            if section["start_page"] <= page <= section["end_page"]:
                # If it's on the start page, check if it's after the section start
                if page == section["start_page"] and table_bounds["top"] < section["start_position"]:
                    continue
                
                # If it's on the end page (and not the last section), check if it's before the section end
                if (page == section["end_page"] and idx < len(sections) - 1 and 
                    table_bounds["bottom"] > section["end_position"]):
                    continue
                
                return idx
        
        # If no matching section found and we have sections, assign to the last section
        if sections:
            return len(sections) - 1
        
        return None
    
    def _process_table(self, table):
        """Clean and process a table"""
        # Convert to pandas DataFrame
        df = pd.DataFrame(table)
        
        # Handle empty DataFrame
        if df.empty:
            return []
        
        # Drop completely empty rows and columns
        df = df.dropna(how='all').dropna(axis=1, how='all')
        
        # Handle column names
        if df.columns.duplicated().any():
            # Make column names unique
            df.columns = [f"{col}_{i}" if i > 0 else col 
                         for i, col in enumerate(pd.Series(df.columns).groupby(df.columns).cumcount())]
        
        # Clean cell values
        for col in df.columns:
            df[col] = df[col].map(lambda x: str(x).strip() if x is not None else "", na_action='ignore')
        
        # Fill NaN values
        df = df.fillna("")
        
        # Convert to list of dictionaries
        return df.to_dict('records')
    
    def _enhance_sections(self, sections):
        """Enhance sections with table classification and relationships"""
        enhanced_sections = []
        
        for section in sections:
            # Skip sections with no tables
            if not section["tables"]:
                enhanced_sections.append(section)
                continue
            
            # Get all tables in the section
            tables = section["tables"]
            
            # Classify tables
            for table in tables:
                # Convert table data back to DataFrame for analysis
                df = pd.DataFrame(table["data"])
                
                # Skip empty tables
                if df.empty:
                    table["type"] = "empty"
                    continue
                
                # Analyze table content
                table["type"] = self._classify_table(df)
                
                # For question-answer tables, try to extract structure
                if table["type"] in ["question_answer", "form_field"]:
                    structure = self._extract_table_structure(df, table["type"])
                    table["structure"] = structure
            
            # Find relationships between tables
            related_tables = self._find_table_relationships(tables)
            
            # Update the section
            enhanced_section = section.copy()
            enhanced_section["tables"] = tables
            enhanced_section["table_relationships"] = related_tables
            
            enhanced_sections.append(enhanced_section)
        
        return enhanced_sections
    
    def _classify_table(self, df):
        """Classify table based on content and structure"""
        # Convert all values to strings for analysis
        str_values = [[str(cell).strip() for cell in row] for row in df.values]
        flat_values = [cell for row in str_values for cell in row if cell]
        
        # Check for common patterns
        
        # Check if this looks like a question details table
        question_details_terms = ["Question details", "Change from last year", "Rationale", "Response options"]
        if any(term in ' '.join(flat_values) for term in question_details_terms):
            return "question_details"
        
        # Check if this looks like a response table
        response_terms = ["Response", "Select from", "Text field", "Numeric field"]
        if any(term in ' '.join(flat_values) for term in response_terms):
            return "response_table"
        
        # Check if this is a horizontal vs vertical table
        # Horizontal tables typically have numbered columns (1, 2, 3...)
        first_row = str_values[0] if str_values else []
        has_numeric_headers = any(cell.isdigit() for cell in first_row)
        
        if has_numeric_headers:
            return "horizontal_table"
        
        # Check if this looks like a form field (key-value pairs)
        if df.shape[1] == 2:
            # Check if first column has consistent keys
            first_col = [str(val).strip() for val in df.iloc[:, 0]]
            if all(val and not val.isdigit() for val in first_col if val):
                return "form_field"
        
        # Check if this is a guidance table
        guidance_terms = ["General", "guidance", "Requested content", "Note:", "Example:"]
        if any(term in ' '.join(flat_values) for term in guidance_terms):
            return "guidance"
        
        # Default to general table
        return "general_table"
    
    def _extract_table_structure(self, df, table_type):
        """Extract structured data from table based on its type"""
        structure = {}
        
        if table_type == "form_field" and df.shape[1] >= 2:
            # Extract key-value pairs
            fields = []
            for _, row in df.iterrows():
                key = str(row.iloc[0]).strip()
                value = str(row.iloc[1]).strip() if df.shape[1] > 1 else ""
                
                if key:
                    # Check if value contains options
                    options = []
                    if "Select from:" in value:
                        options_text = value.split("Select from:")[1].strip()
                        # Extract options (typically bullet points)
                        options = [opt.strip(' •\n') for opt in re.split(r'[•□]\s*|\n+', options_text) 
                                   if opt.strip()]
                    
                    fields.append({
                        "field_name": key,
                        "field_value": value,
                        "options": options
                    })
            
            structure["fields"] = fields
            
        elif table_type == "question_answer" or table_type == "horizontal_table":
            # Analyze horizontal structure
            # Check if first row might be headers
            if df.shape[0] > 0:
                headers = df.iloc[0].tolist()
                headers = [str(h).strip() for h in headers]
                
                # Skip if headers are empty or all numeric
                if not all(h.isdigit() or not h for h in headers):
                    structure["headers"] = headers
                    
                    # If there are meaningful headers, extract columns
                    columns = []
                    for col_idx, header in enumerate(headers):
                        if col_idx >= df.shape[1]:
                            continue
                            
                        # Get column values (skip header row)
                        values = df.iloc[1:, col_idx].tolist()
                        values = [str(v).strip() for v in values]
                        
                        columns.append({
                            "header": header,
                            "values": values
                        })
                    
                    structure["columns"] = columns
        
        return structure
    
    def _find_table_relationships(self, tables):
        """Find relationships between tables in a section"""
        relationships = []
        
        # Group tables by page
        tables_by_page = {}
        for idx, table in enumerate(tables):
            page = table["page"]
            if page not in tables_by_page:
                tables_by_page[page] = []
            tables_by_page[page].append((idx, table))
        
        # For each page, find tables that might be related
        for page, page_tables in tables_by_page.items():
            # Sort by vertical position
            page_tables.sort(key=lambda t: t[1]["bounds"]["top"])
            
            # Check for adjacent tables that might be related
            for i in range(len(page_tables) - 1):
                idx1, table1 = page_tables[i]
                idx2, table2 = page_tables[i + 1]
                
                # Check if tables are adjacent
                if (abs(table2["bounds"]["top"] - table1["bounds"]["bottom"]) < 50 or
                    table1["type"] == "question_details" and table2["type"] == "response_table"):
                    
                    relationships.append({
                        "table1_index": idx1,
                        "table2_index": idx2,
                        "relationship_type": "adjacent"
                    })
                
                # Check for question-answer relationship
                if (table1["type"] == "question_details" and 
                    table2["type"] in ["response_table", "form_field", "horizontal_table"]):
                    
                    relationships.append({
                        "table1_index": idx1,
                        "table2_index": idx2,
                        "relationship_type": "question_answer"
                    })
        
        return relationships
    
    def convert_to_json(self, pdf_file, save_output=True):
        """Process a PDF file and convert to structured JSON"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_filename = os.path.splitext(os.path.basename(pdf_file))[0]
        
        # Create output directory if needed
        if self.output_dir and not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        
        # Extract sections and tables
        result = self.extract_from_pdf(pdf_file)
        
        # Save JSON file
        if save_output and self.output_dir:
            json_file = os.path.join(self.output_dir, f"{base_filename}_sections_{timestamp}.json")
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)
            print(f"\nJSON output saved to: {json_file}")
        
        # Print summary
        num_sections = len(result.get("document", {}).get("sections", []))
        total_tables = sum(len(section.get("tables", [])) for section in result.get("document", {}).get("sections", []))
        print(f"\nExtraction complete!")
        print(f"Found {num_sections} sections with {total_tables} tables")
        
        return result

# AI-enhanced questionnaire parser
class AIEnhancedQuestionnaireParser:
    """Combine section extraction with LLM analysis of questionnaire content.

    The parser delegates PDF parsing to a ``QuestionnaireSectionExtractor``,
    sends each extracted section to an LLM client (``g2mLLMClientLlama``) and
    attaches the parsed answer to the section under the ``"ai_analysis"`` key.
    """

    def __init__(self, output_dir=None, llm_type='llama3_3', query_type='best', max_sections=None):
        """Initialize with optional output directory, LLM configuration, and section limit.

        Args:
            output_dir: Directory for JSON output; falls back to
                ``"ai_enhanced_extractions"`` when falsy. Created if missing.
            llm_type: Forwarded to the LLM client constructor.
            query_type: Endpoint variant; forwarded to the LLM client
                constructor and to every ``query()`` call.
            max_sections: Forwarded to the section extractor and recorded in
                the output metadata.
        """
        self.output_dir = output_dir or "ai_enhanced_extractions"

        # Create output directory if it doesn't exist
        os.makedirs(self.output_dir, exist_ok=True)

        self.max_sections = max_sections
        # Remembered so each query can request the same endpoint variant; the
        # client's query() otherwise falls back to its own default.
        self.query_type = query_type

        # Initialize the section extractor and LLM client
        self.section_extractor = QuestionnaireSectionExtractor(
            output_dir=self.output_dir,
            max_sections=max_sections,
        )
        self.llm_client = g2mLLMClientLlama(llm_type=llm_type, query_type=query_type)

    def process_pdf(self, pdf_file, save_intermediate=True, save_final=True):
        """Process a PDF file with combined section extraction and AI analysis.

        Args:
            pdf_file: Path to the PDF to process.
            save_intermediate: Save the raw extraction result as JSON.
            save_final: Save the AI-enhanced result as JSON.

        Returns:
            ``{"document": {"filename", "extraction_date", "max_sections",
            "sections"}}`` where ``sections`` is the AI-enhanced section list.
        """
        max_sections_info = f" (limited to {self.max_sections} sections)" if self.max_sections else ""
        file_suffix = max_sections_info.replace(' ', '_')
        print(f"Processing {pdf_file} with AI enhancement{max_sections_info}...")

        # Extract sections and tables using the section extractor
        extraction_result = self.section_extractor.extract_from_pdf(pdf_file)

        # Save intermediate extraction if requested
        if save_intermediate:
            intermediate_file = self._save_json(extraction_result, pdf_file, "intermediate", file_suffix)
            print(f"Intermediate extraction saved to: {intermediate_file}")

        # Enhanced sections with AI analysis
        ai_enhanced_sections = self._process_sections_with_ai(extraction_result)

        # Create the final result
        final_result = {
            "document": {
                "filename": os.path.basename(pdf_file),
                "extraction_date": datetime.now().isoformat(),
                "max_sections": self.max_sections,  # Include max sections in metadata
                "sections": ai_enhanced_sections,
            }
        }

        # Save final result if requested
        if save_final:
            output_file = self._save_json(final_result, pdf_file, "ai_enhanced", file_suffix)
            print(f"AI-enhanced extraction saved to: {output_file}")

        return final_result

    def _save_json(self, payload, pdf_file, label, suffix):
        """Write ``payload`` as JSON into ``self.output_dir`` and return the path.

        The file is named ``<pdf stem>_<label><suffix>_<YYYYmmdd_HHMMSS>.json``;
        the timestamp is taken at the moment of writing.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_filename = os.path.splitext(os.path.basename(pdf_file))[0]
        path = os.path.join(self.output_dir, f"{base_filename}_{label}{suffix}_{timestamp}.json")
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        return path

    def _process_sections_with_ai(self, extraction_result):
        """Process each section with AI to extract structured questions and answers.

        Args:
            extraction_result: Mapping with the sections under
                ``["document"]["sections"]``; missing keys yield no sections.

        Returns:
            One entry per input section, in order. A section whose query
            reported status ``'Successful'`` is shallow-copied and gains an
            ``"ai_analysis"`` key; otherwise the original section is returned
            unchanged.
        """
        ai_enhanced_sections = []
        sections = extraction_result.get("document", {}).get("sections", [])
        total = len(sections)

        print(f"Processing {total} sections with AI...")

        # The system prompt does not depend on the section, so build it once.
        system_prompt = self._create_system_prompt()

        for section_idx, section in enumerate(sections):
            print(f"Processing section {section_idx + 1}/{total}: {section.get('section_title', 'Untitled Section')}")

            # Create a structured representation of the section for the AI
            section_data = self._prepare_section_data(section)
            user_prompt = self._create_user_prompt(section_data)

            # Query the LLM. query_type is passed explicitly: without it the
            # client would use its default endpoint variant for every call,
            # ignoring the one chosen at construction time.
            response = self.llm_client.query(
                system=system_prompt,
                user=user_prompt,
                temperature=0.3,  # Lower temperature for more structured output
                max_tokens=3000,
                query_type=self.query_type,
            )

            # Process the AI response
            if response and response.get('status') == 'Successful':
                # "or ''" also covers a present-but-None 'text' value.
                ai_analysis = self._parse_ai_response(response.get('text') or '')

                # Create an enhanced section with the AI analysis
                enhanced_section = section.copy()
                enhanced_section['ai_analysis'] = ai_analysis
                ai_enhanced_sections.append(enhanced_section)
            else:
                print(f"  ⚠️ AI processing failed for section {section_idx + 1}")
                ai_enhanced_sections.append(section)  # Add original section without enhancement

            # Pause briefly between requests to avoid rate limits; nothing
            # follows the last section, so no pause is needed there.
            if section_idx < total - 1:
                time.sleep(1)

        return ai_enhanced_sections

    def _prepare_section_data(self, section):
        """Prepare a structured representation of a section for the AI.

        Returns a dict with ``section_id``, ``section_title``, ``tables`` (each
        reduced to ``type``, ``page``, ``data`` and, when present,
        ``structure``) and ``table_relationships``. Missing keys fall back to
        empty/neutral defaults.
        """
        tables_data = []

        for table in section.get("tables", []):
            table_info = {
                "type": table.get("type", "unknown"),
                "page": table.get("page", 0),
                "data": table.get("data", []),
            }

            # Include special structure information if available
            if "structure" in table:
                table_info["structure"] = table["structure"]

            tables_data.append(table_info)

        return {
            "section_id": section.get("section_id", ""),
            "section_title": section.get("section_title", ""),
            "tables": tables_data,
            "table_relationships": section.get("table_relationships", []),
        }

    def _create_system_prompt(self):
        """Create the system prompt describing the task and the JSON schema to return."""
        return """You are an AI assistant specialized in analyzing questionnaire sections. Your task is to extract structured information about questions and answers from questionnaire section data.

For each section, you will:
1. Identify all questions in the section
2. Determine the type of each question (multiple choice, text input, numeric input, etc.)
3. Extract available answer options for multiple choice questions
4. Identify any guidance or instructions related to each question
5. Determine relationships between questions if they exist

Please return your analysis in a structured JSON format with the following schema:
{
  "section_summary": "Brief description of what this section is about",
  "questions": [
    {
      "question_id": "Identifier for the question (e.g., Q1)",
      "question_text": "The full text of the question",
      "question_type": "Type of question (multiple_choice, text, numeric, etc.)",
      "answer_options": ["Option 1", "Option 2", ...] (for multiple choice questions),
      "guidance": "Any guidance or instructions provided for answering the question",
      "related_questions": ["Q2", "Q3"] (if the question has relationships to other questions)
    },
    ...
  ]
}

Return only the JSON without additional commentary. Ensure the JSON is valid and properly formatted."""

    @staticmethod
    def _format_table(index, table):
        """Render one prepared table as plain text for the user prompt.

        ``index`` is the zero-based table position. Rows are rendered as
        ``" | "``-separated lines; when the first row is a dict the table is
        treated as a list of dicts and a header plus dashed rule is emitted.
        """
        parts = [f"Table {index + 1} (Type: {table.get('type', 'unknown')}):\n"]

        data = table.get("data", [])
        if data:
            if isinstance(data[0], dict):
                # Collect columns in first-seen order so the layout is stable
                # between runs (a set would give an arbitrary order).
                columns = list(dict.fromkeys(key for row in data for key in row))
                headers = [str(key) for key in columns]
                parts.append(" | ".join(headers) + "\n")
                parts.append("-" * (sum(len(h) for h in headers) + 3 * (len(headers) - 1)) + "\n")

                # Look values up with the ORIGINAL keys; only the displayed
                # text is stringified, so non-string keys keep their values.
                for row in data:
                    parts.append(" | ".join(str(row.get(key, "")) for key in columns) + "\n")
            else:
                # Assume list of lists
                for row in data:
                    parts.append(" | ".join(str(cell) for cell in row) + "\n")

        # Add table structure if available
        if "structure" in table:
            parts.append("\nStructure information:\n")
            parts.append(json.dumps(table["structure"], indent=2))

        return "".join(parts)

    def _create_user_prompt(self, section_data):
        """Create the user prompt for the AI using the section data.

        Args:
            section_data: Dict shaped like the output of
                ``_prepare_section_data``.

        Returns:
            The prompt text: section id/title, every table rendered as text,
            the table relationships (if any) and the extraction instructions.
        """
        tables = section_data.get("tables", [])
        tables_formatted = [self._format_table(i, table) for i, table in enumerate(tables)]

        # Create relationships description (indices are shown one-based)
        relationships_formatted = [
            f"Table {rel.get('table1_index', 0)+1} is related to Table {rel.get('table2_index', 0)+1} "
            f"as '{rel.get('relationship_type', 'unknown')}'"
            for rel in section_data.get("table_relationships", [])
        ]

        # These pieces are assembled outside the f-string below: backslashes
        # inside f-string replacement fields are a SyntaxError before 3.12.
        tables_block = "".join(
            f"\n--- TABLE {i+1} ---\n{table}" for i, table in enumerate(tables_formatted)
        )
        relationships_block = ""
        if relationships_formatted:
            relationships_block = "\nRelationships between tables:\n" + "\n".join(relationships_formatted)

        # Construct the final prompt
        return (
            "Please analyze the following questionnaire section:\n"
            "\n"
            f"Section ID: {section_data.get('section_id', 'Unknown')}\n"
            f"Section Title: {section_data.get('section_title', 'Untitled Section')}\n"
            "\n"
            f"This section contains {len(tables)} tables:\n"
            "\n"
            f"{tables_block}\n"
            "\n"
            f"{relationships_block}\n"
            "\n"
            "Based on this information, extract structured data about the questions and answers in this section. \n"
            "Focus on identifying:\n"
            "1. The main questions being asked\n"
            "2. The type of each question (multiple choice, free text, numeric, etc.)\n"
            "3. Available answer options for multiple choice questions\n"
            "4. Any guidance provided for answering questions\n"
            "5. Relationships between questions\n"
            "\n"
            "Return the results as a valid JSON object following the schema specified in the system instructions."
        )

    def _parse_ai_response(self, ai_response):
        """Parse the AI response into a structured format.

        The text between the first ``{`` and the last ``}`` is parsed as JSON
        (this skips any surrounding commentary); when no such span exists the
        whole response is tried instead.

        Returns:
            The parsed JSON object when it is a dict. Otherwise a fallback
            dict with ``section_summary`` set to a failure message,
            ``raw_response`` holding up to the first 500 characters of the
            response, and an empty ``questions`` list.
        """
        if not isinstance(ai_response, str):
            ai_response = "" if ai_response is None else str(ai_response)

        # Try to extract JSON from the response
        json_start = ai_response.find('{')
        json_end = ai_response.rfind('}') + 1
        has_object = json_start >= 0 and json_end > json_start
        candidate = ai_response[json_start:json_end] if has_object else ai_response

        try:
            result = json.loads(candidate)
        except json.JSONDecodeError:
            result = None

        # Callers read the analysis with .get(...), so anything that is not a
        # JSON object (list, number, null, ...) is treated as a parse failure.
        if isinstance(result, dict):
            return result

        print(f"Error parsing AI response as JSON. Response starts with: {ai_response[:100]}...")
        # Return a simplified version of the response if JSON parsing fails
        return {
            "section_summary": "AI analysis parsing failed",
            "raw_response": ai_response[:500] + ("..." if len(ai_response) > 500 else ""),
            "questions": [],
        }

# Example usage
if __name__ == "__main__":
    # Input document and the folder that receives the generated JSON files
    pdf_file = "pdf_files/CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf"
    output_dir = "ai_enhanced_extractions"
    os.makedirs(output_dir, exist_ok=True)

    # Upper bound on the number of sections to process; None means "all"
    max_sections = 5

    # Build the AI-enhanced parser and run it over the PDF
    parser = AIEnhancedQuestionnaireParser(output_dir=output_dir, max_sections=max_sections)
    result = parser.process_pdf(pdf_file)

    # Tally the processed sections and the questions the AI reported for them
    processed_sections = result.get("document", {}).get("sections", [])
    section_count = len(processed_sections)
    question_count = sum(
        len(section.get("ai_analysis", {}).get("questions", []))
        for section in processed_sections
    )

    print("\nAI-enhanced processing complete!")
    print(f"Processed {section_count} sections containing approximately {question_count} questions")
    print(f"Section limit was set to: {max_sections or 'No limit'}")

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processing pdf_files/CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf with AI enhancement (limited to 5 sections)...
Extracting sections and tables from: pdf_files/CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processing page 1 of 368
Processing page 2 of 368
Processing page 3 of 368
Reached section limit of 5. Stopping extraction.


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Intermediate extraction saved to: ai_enhanced_extractions\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6_intermediate_(limited_to_5_sections)_20250424_134212.json
Processing 5 sections with AI...
Processing section 1/5: Released: May 1, 2024 Publication of the CDP full corporate
LLM URL not specified, aborting LLM query url=None
  ⚠️ AI processing failed for section 1
Processing section 2/5: Released: May 16, 2024 Addition of IFRS S1 20 tag and removal of
LLM URL not specified, aborting LLM query url=None
  ⚠️ AI processing failed for section 2
Processing section 3/5: Page 2 of 368 @cdp | www
LLM URL not specified, aborting LLM query url=None
  ⚠️ AI processing failed for section 3
Processing section 4/5: In which language are you submitting your response?
LLM URL not specified, aborting LLM query url=None
  ⚠️ AI processing failed for section 4
Processing section 5/5: Select the currency used for all financial information disclosed throughout your response
LLM URL not specified