In [None]:
from utils.parser import PDFExtractor, list_pdfs_in_folder

# Example usage


# Folder scanned for input PDFs; every path below is derived from it.
folder_path="data/documents"
doc_list = list_pdfs_in_folder(folder_path)


for doc in doc_list:
    # Build the path from folder_path instead of repeating the folder name,
    # so the listing and the extraction always look at the same directory.
    pdf_path = "{}/{}".format(folder_path, doc)
    extractor = PDFExtractor(pdf_path, output_folder="outputs", config_path="utils/text_cleanup_config.json")
    # NOTE: the name is rebound on every pass, so only the result for the
    # last document in doc_list is still available after the loop.
    extracted_data = extractor.extract_all()

In [67]:
# Keep the "processed_text" entry of extracted_data under a short name; the
# cells below pass it to the restructuring helpers.
# NOTE(review): presumably a dict mapping section titles to text - confirm
# against PDFExtractor.extract_all().
test=extracted_data["processed_text"]

In [None]:
def restructure_dict(input_dict):
    """
    Group numbered titles under nested dictionaries keyed by section number.

    Each key is split at its first space and the leading part is treated as
    the numbering. When that part contains a dot and otherwise consists of
    digits only (e.g. "3.2.1."), the entry is filed under one nested
    dictionary per leading number, so "3.2.1. Title" ends up at
    result["3"]["2"]["3.2.1. Title"]. A single number such as
    "1. Introduction" has no leading numbers and therefore stays at the top
    level, as does every key that is not numbered this way.

    Containers are created with an "intro" slot. The slot receives the text
    of a key that is exactly the container's number (e.g. "2"), whichever of
    the two is seen first, and is removed again when it stays empty.

    Args:
        input_dict (dict): Maps title strings to their content.

    Returns:
        dict: A new dictionary with the nesting described above.
    """
    # Containers created by this call. Cleanup is restricted to them so that
    # dictionaries supplied by the caller as values are never modified.
    created = []

    def add_to_hierarchy(hierarchy, levels, key, value):
        """Store key/value in the container addressed by all but the last level."""
        if len(levels) == 1:
            hierarchy[key] = value
            return

        current_level = levels[0]
        if current_level not in hierarchy:
            container = {"intro": ""}
        elif isinstance(hierarchy[current_level], dict):
            container = None  # reuse the existing container
        else:
            # A plain value already sits where the container belongs (a key
            # that is just the number); keep it as the container's intro
            # instead of failing on item assignment.
            container = {"intro": hierarchy[current_level]}
        if container is not None:
            hierarchy[current_level] = container
            created.append(container)

        add_to_hierarchy(hierarchy[current_level], levels[1:], key, value)

    result = {}
    for key, value in input_dict.items():
        numbering = key.split(' ', 1)[0]

        if '.' in numbering and numbering.replace('.', '').isdigit():
            levels = numbering.strip('.').split('.')
            add_to_hierarchy(result, levels, key, value)
        elif any(result.get(key) is container for container in created):
            # The key names a container that already holds children; keep
            # them and record this value as the container's intro.
            result[key]["intro"] = value
        else:
            result[key] = value

    # Drop placeholders that never received any text, at every depth.
    for container in created:
        if not container["intro"]:
            del container["intro"]
    return result


# Try the restructuring on the extracted text. The return value is not bound
# to a name here.
restructure_dict(test)


In [None]:
def create_hierarchy(input_dict):
    """
    Organizes a dictionary into a hierarchy based on numbered keys.

    Only keys that consist of a section number and a trailing dot ("6.",
    "6.1.", "6.2.1.") are treated as numbered; each number becomes one nesting
    level, so "6.1." is stored at result["6"]["1"]. Every other key, including
    titles such as "1. Introduction" and keys that are blank after stripping,
    is copied to the top level unchanged.

    When a section has both its own value and sub-sections (e.g. "6." and
    "6.1."), the section becomes a dictionary and its own value is kept under
    the "introduction" key, regardless of which key comes first.

    Parameters:
        input_dict (dict): A dictionary where keys are titles (some containing numbering).

    Returns:
        dict: A new dictionary with a hierarchical structure based on numbered keys.
    """
    import re

    def add_to_hierarchy(hierarchy, key_parts, value):
        """Walk or create one dict per leading part, then store value at the last part."""
        current = hierarchy
        for part in key_parts[:-1]:
            child = current.get(part)
            if not isinstance(child, dict):
                # Either nothing is stored yet, or the parent section's own
                # text is. Preserve that text instead of failing on it.
                child = {"introduction": child} if part in current else {}
                current[part] = child
            current = child

        leaf = key_parts[-1]
        existing = current.get(leaf)
        if isinstance(existing, dict) and not isinstance(value, dict):
            # Sub-sections were seen first; do not overwrite them.
            existing["introduction"] = value
        else:
            current[leaf] = value

    # Regular expression to match numbered titles (e.g., "6.", "6.1.", "6.2.")
    numbered_pattern = re.compile(r"^(\d+(\.\d+)*)\.$")

    hierarchy = {}
    non_numbered = {}
    empty_titles = {}

    for key, value in input_dict.items():
        if key.strip() == "":  # Handle empty titles
            empty_titles[key] = value
        elif numbered_pattern.match(key):  # Match numbered titles
            hierarchy_key = tuple(key.rstrip('.').split('.'))
            add_to_hierarchy(hierarchy, hierarchy_key, value)
        else:  # Non-numbered titles
            non_numbered[key] = value

    # Merge non-numbered and empty titles into the hierarchy
    hierarchy.update(non_numbered)
    hierarchy.update(empty_titles)

    return hierarchy


# Sample input: one unnumbered title, three top-level sections (one of them
# with empty content) and sub-sections down to three numbering levels.
test_dict = {
    'Abstract': 'text',
    '1. Introduction': 'text',
    '2. General quality issues': 'text',
    '2.1. OnlineJobAdvertisements,VacanciesandCapacity': 'text',
    '2.2. Usingonlinejobadvertisementsforeconomicresearch': 'text',
    '3. Methods': '',
    '3.1. Basic settings': 'text',
    '3.2. Unitlevel methods': 'text',
    '3.2.1. Modelling misclassification': 'text',
    '3.2.2. Nonignorable selection:': 'text',
    '3.2.3. Single source capturerecapture': 'text'
}
# NOTE(review): this calls restructure_dict, not the create_hierarchy
# function defined just above - confirm which one was meant to be exercised.
restructured_dict = restructure_dict(test_dict)
import pprint
pprint.pprint(restructured_dict)


In [None]:
import re
from collections import defaultdict

def organize_by_levels(input_dict):
    """
    Organizes titles and content into hierarchical levels based on numbering,
    moves unnumbered titles directly into the main dictionary, restructures
    level 1 keys, and nests level 3 titles under their level 2 parent.

    Processing steps:
      1. Titles starting with "X.", "X.Y." or "X.Y.Z." are grouped by depth.
         Titles numbered four or more levels deep match no group and are
         omitted from the result.
      2. Each level 1 title becomes a dictionary holding its level 2 titles,
         plus an "introduction" entry with its own content when that content
         is not the empty string. A level 1 title with empty content and no
         level 2 titles does not appear in the result.
      3. Each group of level 3 titles replaces the content of its level 2
         parent with a dictionary of those titles plus an "introduction"
         entry holding the parent's original content.

    Groups whose parent title is missing stay in the result under the keys
    "level 2 for X" / "level 3 for X.Y".

    Args:
        input_dict (dict): A dictionary with titles and their associated content.

    Returns:
        dict: A nested dictionary organized by levels, with level 1 keys as new parents.
    """
    organized = defaultdict(dict)  # Dictionary to hold organized levels
    unnumbered = {}

    # Matches "X.", "X.X.", "X.X.X." at the start of a title
    numbering_pattern = re.compile(r'^(\d+(\.\d+)*\.)')

    def numbering_of(title):
        """Return the numbering prefix without its trailing dot, or None."""
        match = numbering_pattern.match(title)
        return match.group(1).rstrip('.') if match else None

    def level_2_container(top_numbering):
        """Return the dict holding the level 2 titles of section top_numbering."""
        bucket_key = f"level 2 for {top_numbering}"
        if bucket_key in organized:
            return organized[bucket_key]
        # Step 2 may already have moved the bucket under the level 1 title.
        for title, value in organized.items():
            if isinstance(value, dict) and numbering_of(title) == top_numbering:
                return value
        return None

    for title, content in input_dict.items():
        numbering = numbering_of(title)
        if numbering is None:
            unnumbered[title] = content
            continue

        level = numbering.count('.') + 1
        parent_numbering = numbering.rpartition('.')[0]
        if level == 1:
            organized["level 1"][title] = content
        elif level in (2, 3):
            organized[f"level {level} for {parent_numbering}"][title] = content

    # Move unnumbered titles directly into the main dictionary
    organized.update(unnumbered)

    # Process level 1: turn each title into the parent of its level 2 titles
    level_1 = organized.get("level 1", {})
    keys_to_remove = []
    for key, value in level_1.items():
        numbering = numbering_of(key)
        if numbering is None:
            continue
        level_2_key = f"level 2 for {numbering}"
        if level_2_key in organized:
            organized[key] = organized.pop(level_2_key)
        if value != '':
            organized.setdefault(key, {})["introduction"] = value
        keys_to_remove.append(key)

    # Remove processed keys from level 1, then the group itself once empty
    for key in keys_to_remove:
        del organized["level 1"][key]
    if not organized["level 1"]:
        del organized["level 1"]

    # Process level 3 for X.Y
    level_3_prefix = "level 3 for "
    for level_3_key in [key for key in organized if key.startswith(level_3_prefix)]:
        level_3_content = organized[level_3_key]
        if not isinstance(level_3_content, dict):
            continue  # an unnumbered title that merely starts with the prefix

        parent_numbering = level_3_key[len(level_3_prefix):]
        parent_dict = level_2_container(parent_numbering.split('.')[0])
        if parent_dict is None:
            continue

        # Compare whole numbers so that "3.2" never selects "3.20. ...".
        title_key = next(
            (title for title in parent_dict if numbering_of(title) == parent_numbering),
            None
        )
        if title_key is None:
            continue

        parent_dict[title_key] = {
            **level_3_content,
            "introduction": parent_dict[title_key]
        }
        del organized[level_3_key]

    return dict(organized)

# Example usage: one unnumbered title, three top-level sections (section 3
# has empty content) and sub-sections down to three numbering levels.
input_dict = {
    'Abstract': 'text',
    '1. Introduction': 'Welcome to the document',
    '2. General quality issues': 'Discussion of quality',
    '2.1. OnlineJobAdvertisements,VacanciesandCapacity': 'text',
    '2.2. Usingonlinejobadvertisementsforeconomicresearch': 'text',
    '3. Methods': '',
    '3.1. Basic settings': 'text',
    '3.2. Unitlevel methods': 'text',
    '3.2.1. Modelling misclassification': 'text',
    '3.2.2. Nonignorable selection:': 'text',
    '3.2.3. Single source capturerecapture': 'text'
}

# Rebinds the module-level name `result`, which other cells also assign.
result = organize_by_levels(input_dict)

import pprint
pprint.pprint(result)


In [None]:
import re
from collections import defaultdict

def organize_by_levels(input_dict):
    """
    Group numbered titles by depth and fold every top-level section together
    with its direct sub-sections.

    Titles that start with "X.", "X.Y." or "X.Y.Z." are collected into the
    groups "level 1", "level 2 for X" and "level 3 for X.Y"; deeper numbering
    matches no group and is left out. Titles without such a prefix are copied
    into the result unchanged.

    Each level 1 title then takes over its "level 2 for X" group as its value.
    When the title's own content is not the empty string it is added to that
    value under "introduction" (creating the dictionary if the title has no
    sub-sections). A level 1 title with empty content and no sub-sections
    does not appear in the result. "level 3 for X.Y" groups are returned
    as they are.

    Args:
        input_dict (dict): A dictionary with titles and their associated content.

    Returns:
        dict: A nested dictionary organized by levels, with level 1 keys as new parents.
    """
    prefix_re = re.compile(r'^(\d+(\.\d+)*\.)')  # leading "X.", "X.Y.", "X.Y.Z."
    buckets = defaultdict(dict)
    plain_titles = {}

    for heading, body in input_dict.items():
        hit = prefix_re.match(heading)
        if hit is None:
            plain_titles[heading] = body
            continue

        segments = hit.group(1).rstrip('.').split('.')
        depth = len(segments)
        if depth == 1:
            buckets["level 1"][heading] = body
        elif depth in (2, 3):
            parent = '.'.join(segments[:-1])
            buckets[f"level {depth} for {parent}"][heading] = body

    # Unnumbered titles go straight into the main dictionary.
    buckets.update(plain_titles)

    # Promote each level 1 title to the parent of its level 2 group.
    handled = []
    for heading, body in buckets.get("level 1", {}).items():
        hit = prefix_re.match(heading)
        if not hit:
            continue

        children_key = "level 2 for " + hit.group(1).rstrip('.')
        if children_key in buckets:
            buckets[heading] = buckets.pop(children_key)

        if body == '':
            handled.append(heading)
            continue

        buckets.setdefault(heading, {})["introduction"] = body
        handled.append(heading)

    # The processed titles leave the "level 1" group, and the group itself
    # goes away once nothing is left in it.
    for heading in handled:
        del buckets["level 1"][heading]
    if not buckets["level 1"]:
        del buckets["level 1"]

    return dict(buckets)

# Example usage: same sample titles as before, run through the variant of
# organize_by_levels defined in this cell.
input_dict = {
    'Abstract': 'text',
    '1. Introduction': 'Welcome to the document',
    '2. General quality issues': 'Discussion of quality',
    '2.1. OnlineJobAdvertisements,VacanciesandCapacity': 'text',
    '2.2. Usingonlinejobadvertisementsforeconomicresearch': 'text',
    '3. Methods': '',
    '3.1. Basic settings': 'text',
    '3.2. Unitlevel methods': 'text',
    '3.2.1. Modelling misclassification': 'text',
    '3.2.2. Nonignorable selection:': 'text',
    '3.2.3. Single source capturerecapture': 'text'
}

# Rebinds the module-level name `result`, which other cells also assign.
result = organize_by_levels(input_dict)

import pprint
pprint.pprint(result)


In [None]:
import re
from collections import defaultdict

def organize_by_levels_with_grouping(input_dict):
    """
    Sort titles into flat groups according to their numbering prefix.

    A title that starts with a dotted section number ("X.", "X.Y.", ...) is
    placed in the group "level N for P", where N is how many numbers the
    prefix has and P is the prefix without its last number. Prefixes with a
    single number go to "level 1". Titles without such a prefix are gathered
    under "level 0", which is only present when at least one exists.

    Args:
        input_dict (dict): A dictionary with titles and their associated content.

    Returns:
        dict: Maps each group name to a dictionary of the titles in that
        group and their content.
    """
    prefix_re = re.compile(r'^(\d+(\.\d+)*\.)')  # leading "X.", "X.Y.", "X.Y.Z."
    groups = defaultdict(dict)
    plain_titles = {}

    for heading, body in input_dict.items():
        hit = prefix_re.match(heading)
        if hit is None:
            plain_titles[heading] = body
            continue

        number = hit.group(1).rstrip('.')  # e.g. "1", "2.1", "3.2.1"
        parent, _, _ = number.rpartition('.')
        if not parent:
            group_name = "level 1"
        else:
            depth = len(number.split('.'))
            group_name = f"level {depth} for {parent}"
        groups[group_name][heading] = body

    if plain_titles:
        groups["level 0"] = plain_titles

    return dict(groups)

# Example usage: one unnumbered title, three top-level sections (section 3
# has empty content) and sub-sections down to three numbering levels.
input_dict = {
    'Abstract': 'text',
    '1. Introduction': 'text',
    '2. General quality issues': 'text',
    '2.1. OnlineJobAdvertisements,VacanciesandCapacity': 'text',
    '2.2. Usingonlinejobadvertisementsforeconomicresearch': 'text',
    '3. Methods': '',
    '3.1. Basic settings': 'text',
    '3.2. Unitlevel methods': 'text',
    '3.2.1. Modelling misclassification': 'text',
    '3.2.2. Nonignorable selection:': 'text',
    '3.2.3. Single source capturerecapture': 'text'
}

# Rebinds the module-level name `result`, which other cells also assign.
result = organize_by_levels_with_grouping(input_dict)

import pprint
pprint.pprint(result)


In [78]:
# Group the text held in `test` (assigned in an earlier cell) by numbering
# level; this rebinds the module-level name `result`.
result = organize_by_levels_with_grouping(test)


In [None]:
def restructure_levels(input_dict):
    """
    Restructure hierarchical levels in the dictionary based on parent numbering and titles.

    The input is expected to use the group names "level 0", "level 1" and
    "level N for P" (P being the parent section number). The function first
    copies every title of "level 0" and of the groups for levels 1 to 4 to the
    top level of the result. Then, for each "... for P" group, it looks for
    the top-level title whose section number is exactly P, turns that title's
    string content into {"introduction": content} and adds the group's titles
    to it. The group's titles also remain at the top level.

    A group whose parent title cannot be found is kept in the result under
    its own group name.

    Args:
        input_dict (dict): A dictionary with hierarchical levels.

    Returns:
        dict: A new dictionary reorganized based on parent titles with recursive nesting.
    """
    import re

    # Leading section number of a title: "3.2" for "3.2. Unitlevel methods".
    # The look-ahead keeps the whole number together, so "2020 outlook"
    # yields "2020" rather than a prefix of it.
    numbering_pattern = re.compile(r'^(\d+(?:\.\d+)*)\.?(?!\d)')

    def numbering_of(title):
        """Return the section number a title starts with, or None."""
        match = numbering_pattern.match(title)
        return match.group(1) if match else None

    new_dict = {}

    # Process level 0 directly
    if "level 0" in input_dict:
        new_dict.update(input_dict["level 0"])

    # Copy the titles of every level 1-4 group to the top level
    flattened_prefixes = tuple("level {}".format(i) for i in range(1, 5))
    for key, value in input_dict.items():
        if key.startswith(flattened_prefixes):
            new_dict.update(value)

    # Recursive processing for nested levels
    def add_nested_levels(nested_key, nested_dict):
        # Extract parent numbering (e.g., "X.Y")
        parent_key = nested_key.split("for")[-1].strip()
        # Match on the complete section number. A plain prefix test would
        # also accept the group's own titles ("1.1. ..." starts with "1")
        # and unrelated titles such as "2020 outlook" for parent "2".
        parent_title = next(
            (title for title in new_dict if numbering_of(title) == parent_key),
            None
        )

        if parent_title is not None:
            # Ensure the parent entry is a dictionary
            if isinstance(new_dict[parent_title], str):
                new_dict[parent_title] = {"introduction": new_dict[parent_title]}

            # Add the nested dictionary to the appropriate parent
            for nested_title, nested_content in nested_dict.items():
                if isinstance(nested_content, dict):  # Handle deeper nesting
                    add_nested_levels(nested_title, nested_content)
                else:
                    new_dict[parent_title][nested_title] = nested_content
        elif isinstance(nested_dict, dict):
            # No matching parent: keep the orphaned group under its own name
            has_plain_content = False
            for nested_title, nested_content in nested_dict.items():
                if isinstance(nested_content, dict):  # Handle deeper nesting
                    add_nested_levels(nested_title, nested_content)
                else:
                    has_plain_content = True
            if has_plain_content:
                # Shallow copy so the result does not alias the caller's dict
                new_dict[nested_key] = dict(nested_dict)
        else:
            new_dict[nested_key] = nested_dict

    # Process all nested levels
    for key, sub_dict in input_dict.items():
        if "for" in key:
            add_nested_levels(key, sub_dict)

    return new_dict

# Example dictionary, using the group names "level 0", "level 1" and
# "level N for P" that organize_by_levels_with_grouping produces.
input_dict = {
    'level 0': {'Abstract': 'text'},
    'level 1': {'1. Introduction': 'text',
                '2. General quality issues': 'text',
                '3. Methods': ''},
    'level 2 for 2': {'2.1. OnlineJobAdvertisements,VacanciesandCapacity': 'text',
                      '2.2. Usingonlinejobadvertisementsforeconomicresearch': 'text'},
    'level 2 for 3': {'3.1. Basic settings': 'text',
                      '3.2. Unitlevel methods': 'text'},
    'level 3 for 3.2': {'3.2.1. Modelling misclassification': 'text',
                        '3.2.2. Nonignorable selection:': 'text',
                        '3.2.3. Single source capturerecapture': 'text'}
}

# Restructure the dictionary
result = restructure_levels(input_dict)
# NOTE: pprint is not imported in this cell; this line relies on an earlier
# cell having run `import pprint`.
pprint.pprint(result)


In [None]:
import re

def resolve_and_filter_dict(data):
    """
    Resolve duplicated titles to their nested form and keep only root-level sections.

    The function works in three steps:
    1. Build a reference map from every key found at any depth to its value,
       preferring a dictionary value over a string value for the same key.
    2. Wherever a key holds a string but the reference map holds a dictionary
       for that key, replace the string with that dictionary.
    3. Return the root-level entries whose key does not start with a
       multi-level number (X.Y, X.Y.Z, ...). Keys without numbering, keys
       that merely start with digits (e.g. '2020 outlook') and keys such as
       '1. Introduction' or '3.' are kept.

    Note that step 2 modifies ``data`` and its nested dictionaries in place,
    and the returned dictionary shares those nested dictionaries.

    Args:
        data (dict): Nested dictionary structure.

    Returns:
        dict: Processed and filtered dictionary.
    """
    def flatten_dict(d, flat_mapping):
        """
        Recursively flatten the dictionary to build a reference map of key-value pairs.
        """
        for key, value in d.items():
            if key in flat_mapping:
                # Only overwrite if the existing value is a string and the new one is a dict
                if isinstance(flat_mapping[key], str) and isinstance(value, dict):
                    flat_mapping[key] = value
            else:
                flat_mapping[key] = value

            if isinstance(value, dict):
                flatten_dict(value, flat_mapping)

    def replace_strings_with_dicts(d, flat_mapping):
        """
        Recursively replace string values with dictionary values if applicable.
        """
        for key, value in d.items():
            if isinstance(value, dict):
                replace_strings_with_dicts(value, flat_mapping)
            elif isinstance(value, str):
                replacement = flat_mapping.get(key)
                # Never place a dictionary inside itself: that would create
                # a self-referencing structure.
                if isinstance(replacement, dict) and replacement is not d:
                    d[key] = replacement

    def filter_root_keys(d):
        """
        Keep every root-level key that does not start with a multi-level number.
        """
        multi_level = re.compile(r'\d+(\.\d+)+')  # X.Y, X.Y.Z, ... at the start
        return {key: value for key, value in d.items() if not multi_level.match(key)}

    # Step 1: Build a flat reference mapping
    reference_mapping = {}
    flatten_dict(data, reference_mapping)

    # Step 2: Replace strings with dicts based on the reference mapping
    replace_strings_with_dicts(data, reference_mapping)

    # Step 3: Filter root-level keys
    return filter_root_keys(data)

# Example usage: sub-section titles appear both at the root and inside their
# parent section, and '3.2. Unitlevel methods' is a plain string inside
# '3. Methods' but a dictionary at the root.
# NOTE(review): this looks like the output of restructure_levels for its own
# example - confirm.
input_dict = {
    '1. Introduction': 'text',
    '2. General quality issues': {
        '2.1. OnlineJobAdvertisements,VacanciesandCapacity': 'text',
        '2.2. Usingonlinejobadvertisementsforeconomicresearch': 'text',
        'introduction': 'text'
    },
    '2.1. OnlineJobAdvertisements,VacanciesandCapacity': 'text',
    '2.2. Usingonlinejobadvertisementsforeconomicresearch': 'text',
    '3. Methods': {
        '3.1. Basic settings': 'text',
        '3.2. Unitlevel methods': 'text',
        'introduction': ''
    },
    '3.1. Basic settings': 'text',
    '3.2. Unitlevel methods': {
        '3.2.1. Modelling misclassification': 'text',
        '3.2.2. Nonignorable selection:': 'text',
        '3.2.3. Single source capturerecapture': 'text',
        'introduction': 'text'
    },
    '3.2.1. Modelling misclassification': 'text',
    '3.2.2. Nonignorable selection:': 'text',
    '3.2.3. Single source capturerecapture': 'text',
    'Abstract': 'text'
}

# resolve_and_filter_dict rewrites nested values of input_dict in place.
output_dict = resolve_and_filter_dict(input_dict)
# NOTE: pprint is not imported in this cell; this line relies on an earlier
# cell having run `import pprint`.
pprint.pprint(output_dict)


In [None]:
import os

def list_pdfs_in_folder(folder_path):
    """
    Lists all PDF files in the specified folder.

    Only regular files directly inside the folder are returned (no
    sub-folders, no recursion). The ".pdf" extension is matched without
    regard to case, and the names are sorted so that the result does not
    depend on the order in which the operating system lists the directory.

    Args:
        folder_path (str): The path to the folder.

    Returns:
        list: A sorted list of PDF file names (without the folder part).

    Raises:
        FileNotFoundError: If the folder does not exist.
        RuntimeError: If the folder cannot be listed for any other reason.
    """
    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"The folder '{folder_path}' does not exist.") from e
    except Exception as e:
        raise RuntimeError(f"An error occurred while listing PDF files: {e}") from e

    return sorted(
        name
        for name in entries
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(folder_path, name))
    )

# Example usage: list the PDFs of the input folder and print their names.
# This rebinds the module-level name folder_path.
folder_path = "data/documents"
pdf_files = list_pdfs_in_folder(folder_path)
print("PDF files:", pdf_files)
