In [6]:
import os
import json
import pandas as pd
import pdfplumber
import re
from datetime import datetime

class QuestionnairePDFExtractor:
    """Class to extract questions and answers from questionnaire PDFs"""
    
    def __init__(self, output_dir=None):
        """Initialize the extractor with an optional output directory"""
        self.output_dir = output_dir
    
    def extract_questions_from_pdf(self, pdf_file):
        """Extract questions and answers from PDF"""
        print(f"Extracting questionnaire content from: {pdf_file}")
        questions_data = []
        
        try:
            with pdfplumber.open(pdf_file) as pdf:
                for page_num, page in enumerate(pdf.pages):
                    print(f"Processing page {page_num+1} of {len(pdf.pages)}")
                    
                    # Extract text for question detection
                    text = page.extract_text()
                    
                    # Look for question patterns
                    question_pattern = re.compile(r'^\(?\d+\.?\d*\)?\s+(.+?)$', re.MULTILINE)
                    form_field_pattern = re.compile(r'(.+?):\s*$', re.MULTILINE)
                    checkbox_pattern = re.compile(r'[☐☑✓]|•\s')
                    
                    # Extract tables
                    tables = page.extract_tables()
                    
                    # Find questions in text
                    questions_in_text = []
                    for match in question_pattern.finditer(text):
                        question_text = match.group(1).strip()
                        if '?' in question_text:  # Likely a question
                            questions_in_text.append({
                                'page': page_num + 1,
                                'question_text': question_text,
                                'position': match.start(),
                                'type': 'numbered_question'
                            })
                    
                    # Find form fields
                    for match in form_field_pattern.finditer(text):
                        field_text = match.group(1).strip()
                        if len(field_text) > 5:  # Avoid small fragments
                            questions_in_text.append({
                                'page': page_num + 1,
                                'question_text': field_text,
                                'position': match.start(),
                                'type': 'form_field'
                            })
                    
                    # Process extracted tables to find question-answer pairs
                    for table_num, table in enumerate(tables):
                        df = pd.DataFrame(table)
                        
                        # Clean DataFrame
                        for col in df.columns:
                            df[col] = df[col].map(lambda x: str(x).strip() if x is not None else "", na_action='ignore')
                        
                        # Fill NaN values
                        df = df.fillna("")
                        
                        # Try to identify if this is a question-answer table
                        question_columns = []
                        answer_columns = []
                        
                        # Check each column to see if it contains questions or answers
                        for col in df.columns:
                            col_values = df[col].astype(str).tolist()
                            # Check if this looks like a question column
                            if any('?' in val for val in col_values) or any(re.match(r'^\d+\.', val) for val in col_values):
                                question_columns.append(col)
                            # Check if this looks like an answer column
                            elif any(checkbox_pattern.search(val) for val in col_values) or any(val.startswith('□') for val in col_values):
                                answer_columns.append(col)
                        
                        # If we have identified question-answer structure
                        if question_columns or answer_columns:
                            # Convert to question-answer format
                            for i, row in df.iterrows():
                                # Skip empty rows
                                if all(str(cell).strip() == "" for cell in row):
                                    continue
                                
                                # Extract question and answer
                                question_text = ""
                                if question_columns:
                                    question_text = " ".join([str(row[col]) for col in question_columns if str(row[col]).strip()])
                                else:
                                    # Try to get question from text nearby
                                    # This is approximate - might need position analysis for better accuracy
                                    question_text = f"Table {table_num+1} Row {i+1}"
                                
                                answer_data = {}
                                for col in answer_columns:
                                    answer_data[str(col)] = str(row[col])
                                
                                if not question_columns and not answer_columns:
                                    # This might be a data table, include all columns
                                    for col in df.columns:
                                        answer_data[str(col)] = str(row[col])
                                
                                # Only add if we have some content
                                if question_text.strip() or any(ans.strip() for ans in answer_data.values()):
                                    questions_data.append({
                                        'page': page_num + 1,
                                        'question_text': question_text,
                                        'answer_data': answer_data,
                                        'type': 'table_question',
                                        'table_num': table_num + 1
                                    })
                                    
                    # Add questions found in text (not in tables)
                    for question in questions_in_text:
                        # Check if we already have this from a table
                        if not any(q.get('question_text') == question['question_text'] for q in questions_data):
                            questions_data.append(question)
            
            # Sort all questions by page and position/table_num
            questions_data.sort(key=lambda x: (x['page'], x.get('position', 0) if 'position' in x else x.get('table_num', 0)))
            
            return questions_data
        
        except Exception as e:
            print(f"Error extracting questionnaire content: {str(e)}")
            return []
    
    def detect_question_type(self, question_data):
        """Detect the type of question based on answer format"""
        if 'answer_data' not in question_data:
            return 'text_question'
            
        answer_data = question_data['answer_data']
        
        # Check for checkboxes or multiple choice
        checkbox_pattern = re.compile(r'[☐☑✓]|•\s')
        has_checkbox = any(checkbox_pattern.search(str(ans)) for ans in answer_data.values())
        
        if has_checkbox:
            return 'multiple_choice'
        
        # Check for dropdown/select
        if len(answer_data) == 1 and any(str(ans).startswith('Select') for ans in answer_data.values()):
            return 'dropdown'
            
        # Check for text field
        if len(answer_data) == 1 and any('characters' in str(ans).lower() for ans in answer_data.values()):
            return 'text_field'
            
        return 'general_question'
    
    def process_and_merge_related_questions(self, questions_data):
        """Merge related questions and organize into a clearer structure"""
        merged_questions = []
        current_section = None
        i = 0
        
        while i < len(questions_data):
            question = questions_data[i]
            
            # Check if this is a section header
            if 'question_text' in question and re.match(r'^[A-Z0-9\.\s]+$', question['question_text']) and len(question['question_text']) < 50:
                current_section = question['question_text']
                i += 1
                continue
                
            # Check for multi-part questions (questions with options)
            if i < len(questions_data) - 1:
                next_question = questions_data[i + 1]
                
                # If the next question is on the same page and has options
                if (next_question['page'] == question['page'] and 
                    'type' in next_question and next_question['type'] == 'table_question' and
                    'table_num' in next_question):
                    
                    # Check if they look related
                    if 'answer_data' not in question or not question['answer_data']:
                        # Merge them
                        new_question = question.copy()
                        new_question['options'] = next_question.get('answer_data', {})
                        new_question['type'] = self.detect_question_type(next_question)
                        
                        merged_questions.append(new_question)
                        i += 2  # Skip both questions
                        continue
            
            # Add question type if not already set
            if 'type' not in question or question['type'] in ['form_field', 'numbered_question']:
                question['type'] = self.detect_question_type(question)
                
            # Add section info
            if current_section:
                question['section'] = current_section
                
            merged_questions.append(question)
            i += 1
            
        return merged_questions
    
    def convert_to_json(self, pdf_file, save_output=True):
        """Process a PDF questionnaire and convert to structured JSON"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_filename = os.path.splitext(os.path.basename(pdf_file))[0]
        
        # Create output directory if needed
        if self.output_dir and not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        
        # Extract questions and answers
        questions_data = self.extract_questions_from_pdf(pdf_file)
        
        # Process and merge related questions
        processed_questions = self.process_and_merge_related_questions(questions_data)
        
        # Create JSON structure
        json_data = {
            "document": {
                "filename": os.path.basename(pdf_file),
                "extraction_date": datetime.now().isoformat(),
                "questions": processed_questions
            }
        }
        
        # Save JSON file
        if save_output and self.output_dir:
            json_file = os.path.join(self.output_dir, f"{base_filename}_questionnaire_{timestamp}.json")
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(json_data, f, indent=2, ensure_ascii=False)
            print(f"\nJSON output saved to: {json_file}")
        
        return json_data

# Example usage
if __name__ == "__main__":
    # Path to your PDF
    pdf_file = r'pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf'
    
    # Create output directory
    output_dir = "pdf_extractions"
    os.makedirs(output_dir, exist_ok=True)
    
    # Initialize extractor
    extractor = QuestionnairePDFExtractor(output_dir=output_dir)
    
    # Convert PDF to JSON
    result = extractor.convert_to_json(pdf_file)
    
    # Print summary
    print(f"\nExtraction complete!")
    print(f"Found {len(result['document']['questions'])} questions and response items")

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Extracting questionnaire content from: pdf_files\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processing page 1 of 368
Processing page 2 of 368
Processing page 3 of 368
Processing page 4 of 368
Processing page 5 of 368
Processing page 6 of 368
Processing page 7 of 368
Processing page 8 of 368
Processing page 9 of 368
Processing page 10 of 368
Processing page 11 of 368
Processing page 12 of 368
Processing page 13 of 368
Processing page 14 of 368
Processing page 15 of 368
Processing page 16 of 368
Processing page 17 of 368
Processing page 18 of 368
Processing page 19 of 368
Processing page 20 of 368
Processing page 21 of 368
Processing page 22 of 368
Processing page 23 of 368
Processing page 24 of 368
Processing page 25 of 368
Processing page 26 of 368
Processing page 27 of 368
Processing page 28 of 368
Processing page 29 of 368
Processing page 30 of 368
Processing page 31 of 368
Processing page 32 of 368
Processing page 33 of 368
Processing page 34 of 368
Processing page 35 of 368
Processing page 36 of 368
Processing page 37 of 368
Processing page 38 of 368
Processing page 39 of

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processing page 368 of 368


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def


JSON output saved to: pdf_extractions\CDP_2024_Corporate_Questionnaire_Guidance_Modules_1-6_questionnaire_20250424_121631.json

Extraction complete!
Found 4493 questions and response items
