## Setup

In [1]:
from google import genai
from google.genai import types
import base64
import IPython
import json

from config.schema import schema_work_package_basic, schema_work_package_advanced
from config.system_prompt import system_prompt

In [2]:
project = !gcloud config get-value project
PROJECT_ID = project[0]
PROJECT_ID

'mg-ce-demos'

In [3]:
REGION = "us-central1"
MODEL = "gemini-2.5-pro-preview-05-06" 
MODEL_FLASH = "gemini-2.5-flash-preview-05-20"
MODEL_2M = "gemini-1.5-pro-002"

### Setup gemini client

In [4]:
client = genai.Client(
    vertexai = True,
    project = PROJECT_ID,
    location = REGION,
)

In [5]:
generate_content_config = types.GenerateContentConfig(
    temperature = 0.5,
    top_p = 0.95,
    max_output_tokens = 8192,
    response_modalities = ["TEXT"],
    safety_settings = [types.SafetySetting(
        category="HARM_CATEGORY_HATE_SPEECH",
        threshold="OFF"
    ),types.SafetySetting(
        category="HARM_CATEGORY_DANGEROUS_CONTENT",
        threshold="OFF"
    ),types.SafetySetting(
        category="HARM_CATEGORY_SEXUALLY_EXPLICIT",
        threshold="OFF"
    ),types.SafetySetting(
        category="HARM_CATEGORY_HARASSMENT",
        threshold="OFF"
    )]
)

## Data extraction and optimization use case

### Imports and variables

### Functions

In [5]:
def generate(prompt, pdf_file_path=None, model=MODEL):
    if pdf_file_path:
        pdf_file = types.Part.from_uri(
            file_uri=pdf_file_path,
            mime_type="application/pdf",
        )
    
        contents = [
            types.Content(
                role="user",
                parts=[
                    pdf_file,
                    types.Part.from_text(text=prompt)
                ]
            )
        ]
    else:
        contents = [
            types.Content(
                role="user",
                parts=[
                    types.Part.from_text(text=prompt)
                ]
            )
        ]

    token_count = client.models.count_tokens(
        model=MODEL,
        contents=contents,
    )
    print(f"This prompt has {token_count.total_tokens} input tokens")

    generate_content_config = types.GenerateContentConfig(
        temperature = 0.2,
        top_p = 1,
        seed = 0,
        max_output_tokens = 65535,
        response_modalities = ["TEXT"],
        response_mime_type="application/json",
        system_instruction = system_prompt,
        response_schema = schema_work_package_advanced,
        safety_settings = [types.SafetySetting(
            category="HARM_CATEGORY_HATE_SPEECH",
            threshold="OFF"
        ),types.SafetySetting(
            category="HARM_CATEGORY_DANGEROUS_CONTENT",
            threshold="OFF"
        ),types.SafetySetting(
            category="HARM_CATEGORY_SEXUALLY_EXPLICIT",
            threshold="OFF"
        ),types.SafetySetting(
            category="HARM_CATEGORY_HARASSMENT",
            threshold="OFF"
        )],
    )

    response = client.models.generate_content(
        model = model,
        contents = contents,
        config = generate_content_config
    )

    return response

### Output refinement

In [6]:
prompt = "Review this document, and extract key elements and information.  Respond ONLY with a valid JSON object strictly conforming to the required schema."
ex_file_path = "gs://wec_demo_files/examples/idaho_nat_lab_work_package_ctc_example.pdf"

In [7]:
response = generate(prompt=prompt, pdf_file_path=ex_file_path, model=MODEL_FLASH)

This prompt has 5702 input tokens


In [11]:
json.loads(response.text)

{'document_metadata': {'document_id': 'INL/EXT-12-25847',
  'title': 'Work Breakdown Structure and Plant/Equipment Designation System Numbering Scheme for the High Temperature Gas-Cooled Reactor (HTGR) Component Test Capability (CTC)',
  'author': 'Jeffrey D. Bryan',
  'date': 'September 2009',
  'organization': 'Idaho National Laboratory'},
 'acronyms': [{'acronym': 'BEA',
   'definition': 'Battelle Energy Alliance, LLC'},
  {'acronym': 'BIM', 'definition': 'Building Information Modeling'},
  {'acronym': 'CMMS',
   'definition': 'Computerized Maintenance Management System'},
  {'acronym': 'CTC', 'definition': 'Component Test Capability'},
  {'acronym': 'DD&D',
   'definition': 'Deactivation, Decommissioning, and Dismantlement'},
  {'acronym': 'DOE', 'definition': 'Department of Energy'},
  {'acronym': 'DOE-ID',
   'definition': 'Department of Energy, Idaho Operations Office'},
  {'acronym': 'F&ORs',
   'definition': 'Functional and Operational Requirements'},
  {'acronym': 'HTGR', 'de

## Drawing conversion

In [38]:
file_name = "CAS-PLW-201_IFC-EXPORT.ifc"
file_path = "local/data/"

# Simple text reading
with open(file_path + file_name, 'r', encoding='utf-8') as f:
    content = f.read()
    

In [17]:
with open(file_path+f'{file_name}.txt', 'w') as f:
    f.write(content)

In [33]:
content_short = content[:1400000]

In [37]:
with open(file_path+f'{file_name}_short.txt', 'w') as f:
    f.write(content_short)

In [34]:
#content_short

In [39]:
prompt = f"""
Summarize this ifc (3D CAD) data.  Specifically, highlight the overall structure of what is represented in this data, including coordinates (x, y, z values) especially height or z-values.  Make sure to include the overall summary, but all each individual component, their orientation and positions.

ifc data:
{content}
"""

In [41]:
token_count = client.models.count_tokens(
    model=MODEL_FLASH,
    contents=prompt,
)
token_count

CountTokensResponse(total_tokens=304859, cached_content_token_count=None)

In [42]:
ifc_schema = {
  "description": "Summarizes key components and metadata of 3D CAD data from an IFC string.",
  "type": "OBJECT",
  "properties": {
    "projectMetadata": {
      "description": "Overall project identification and authorship information.",
      "type": "OBJECT",
      "properties": {
        "projectName": {
          "description": "Name of the overall project.",
          "type": "STRING"
        },
        "globalId": {
          "description": "Globally unique identifier for the project.",
          "type": "STRING"
        },
        "schemaVersion": {
          "description": "IFC schema version used (e.g., IFC4).",
          "type": "STRING"
        },
        "creationDate": {
          "description": "Date and time when the IFC file was created.",
          "type": "STRING",
          "format": "date-time"
        },
        "authoringTool": {
          "description": "Software used to author the IFC model.",
          "type": "STRING"
        },
        "organization": {
          "description": "Organization that authored the IFC model.",
          "type": "STRING"
        },
        "description": {
          "description": "General description of the project.",
          "type": "STRING"
        }
      },
      "required": ["projectName", "globalId", "schemaVersion"]
    },
    "overallSpatialPlacement": {
      "description": "High-level positioning and orientation of the model's site and main building.",
      "type": "OBJECT",
      "properties": {
        "site": {
          "description": "Geographical placement information for the site.",
          "type": "OBJECT",
          "properties": {
            "name": {
              "description": "Name of the site.",
              "type": "STRING"
            },
            "globalId": {
              "description": "Globally unique identifier for the site.",
              "type": "STRING"
            },
            "easting": {
              "description": "Easting coordinate of the site's origin.",
              "type": "NUMBER"
            },
            "northing": {
              "description": "Northing coordinate of the site's origin.",
              "type": "NUMBER"
            },
            "elevation": {
              "description": "Elevation of the site's origin.",
              "type": "NUMBER"
            },
            "trueNorthOrientation": {
              "description": "Angle in degrees from the Y-axis to true North.",
              "type": "NUMBER"
            }
          },
          "required": ["globalId"]
        },
        "building": {
          "description": "Main building's placement relative to the site.",
          "type": "OBJECT",
          "properties": {
            "name": {
              "description": "Name of the building.",
              "type": "STRING"
            },
            "globalId": {
              "description": "Globally unique identifier for the building.",
              "type": "STRING"
            },
            "x": {
              "description": "X-coordinate of the building's origin.",
              "type": "NUMBER"
            },
            "y": {
              "description": "Y-coordinate of the building's origin.",
              "type": "NUMBER"
            },
            "z": {
              "description": "Z-coordinate (elevation) of the building's origin.",
              "type": "NUMBER"
            },
            "rotationDegrees": {
              "description": "Rotation of the building in degrees around X, Y, and Z axes.",
              "type": "OBJECT",
              "properties": {
                "x": {
                  "type": "NUMBER"
                },
                "y": {
                  "type": "NUMBER"
                },
                "z": {
                  "type": "NUMBER"
                }
              }
            }
          },
          "required": ["globalId", "x", "y", "z"]
        }
      },
      "required": ["site", "building"]
    },
    "componentSummary": {
      "description": "Statistical overview and bounding volume of all components.",
      "type": "OBJECT",
      "properties": {
        "totalComponents": {
          "description": "Total number of individual components in the design.",
          "type": "INTEGER"
        },
        "componentTypes": {
          "description": "Breakdown of components by type with counts and an example GlobalId.",
          "type": "ARRAY",
          "items": {
            "type": "OBJECT",
            "properties": {
              "type": {
                "description": "IFC type of the component (e.g., IfcWall, IfcDoor).",
                "type": "STRING"
              },
              "count": {
                "description": "Number of components of this type.",
                "type": "INTEGER"
              },
              "exampleGlobalId": {
                "description": "An example GlobalId for a component of this type.",
                "type": "STRING"
              }
            },
            "required": ["type", "count"]
          }
        },
        "boundingVolume": {
          "description": "Overall bounding box encompassing all model components.",
          "type": "OBJECT",
          "properties": {
            "minX": {
              "type": "NUMBER"
            },
            "minY": {
              "type": "NUMBER"
            },
            "minZ": {
              "type": "NUMBER"
            },
            "maxX": {
              "type": "NUMBER"
            },
            "maxY": {
              "type": "NUMBER"
            },
            "maxZ": {
              "type": "NUMBER"
            }
          },
          "required": ["minX", "minY", "minZ", "maxX", "maxY", "maxZ"]
        }
      },
      "required": ["totalComponents", "componentTypes", "boundingVolume"]
    },
    "components": {
      "description": "A list of individual 3D CAD components with their key properties.",
      "type": "ARRAY",
      "items": {
        "type": "OBJECT",
        "properties": {
          "globalId": {
            "description": "Globally unique identifier for the component.",
            "type": "STRING"
          },
          "name": {
            "description": "Name or common identifier of the component.",
            "type": "STRING"
          },
          "type": {
            "description": "IFC type of the component (e.g., IfcWall, IfcDoor).",
            "type": "STRING"
          },
          "storey": {
            "description": "The building storey/level to which the component belongs.",
            "type": "STRING"
          },
          "material": {
            "description": "Primary material assigned to the component.",
            "type": "STRING"
          },
          "x": {
            "description": "X-coordinate of the component's origin.",
            "type": "NUMBER"
          },
          "y": {
            "description": "Y-coordinate of the component's origin.",
            "type": "NUMBER"
          },
          "z": {
            "description": "Z-coordinate (elevation) of the component's origin.",
            "type": "NUMBER"
          },
          "rotationDegrees": {
            "description": "Rotation of the component in degrees around X, Y, and Z axes.",
            "type": "OBJECT",
            "properties": {
              "x": {
                "type": "NUMBER"
              },
              "y": {
                "type": "NUMBER"
              },
              "z": {
                "type": "NUMBER"
              }
            }
          },
          "dimensions": {
            "description": "Approximate overall dimensions of the component.",
            "type": "OBJECT",
            "properties": {
              "length": {
                "type": "NUMBER"
              },
              "width": {
                "type": "NUMBER"
              },
              "height": {
                "type": "NUMBER"
              }
            }
          }
        },
        "required": ["globalId", "name", "type", "x", "y", "z"]
      }
    }
  },
  "required": ["projectMetadata", "overallSpatialPlacement", "componentSummary", "components"]
}

generate_content_config_structured = types.GenerateContentConfig(
    temperature = 0.2,
    top_p = 1,
    seed = 0,
    max_output_tokens = 65535,
    safety_settings = [types.SafetySetting(
      category="HARM_CATEGORY_HATE_SPEECH",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_DANGEROUS_CONTENT",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_SEXUALLY_EXPLICIT",
      threshold="OFF"
    ),types.SafetySetting(
      category="HARM_CATEGORY_HARASSMENT",
      threshold="OFF"
    )],
    response_mime_type = "application/json",
    response_schema = ifc_schema
)

In [43]:
response = client.models.generate_content(
    model=MODEL, 
    contents=prompt,
    config=generate_content_config_structured
)

In [47]:
print(response.text)

{
  "projectMetadata": {
    "projectName": "Default Project",
    "globalId": "0JIjZQEQr3UBrcXE1YEtkD",
    "schemaVersion": "IFC2X3",
    "creationDate": "2025-05-29T14:08:40Z",
    "authoringTool": "AVEVA E3D Design Design Mk3.1.7.2[Z3172-003]  (WINDOWS-NT 0.0)  (21 Feb 2024 : 14:44)",
    "organization": "AVEVA Solutions Limited",
    "description": "Long Name"
  },
  "overallSpatialPlacement": {
    "site": {
      "name": "/WECTEST",
      "globalId": "2njrCuVVP4fhA$AtiARt8X",
      "easting": 0.0,
      "northing": 0.0,
      "elevation": 0.0,
      "trueNorthOrientation": 0.0
    },
    "building": {
      "name": "TEST_BLB",
      "globalId": "3pNZNtcv13sAgbd2LvvIJF",
      "x": 0.0,
      "y": 0.0,
      "z": 0.0,
      "rotationDegrees": {
        "x": 0.0,
        "y": 0.0,
        "z": 0.0
      }
    }
  },
  "componentSummary": {
    "totalComponents": 33,
    "componentTypes": [
      {
        "type": "IFCFLOWFITTING",
        "count": 15,
        "exampleGlobalId": "3

In [13]:
import re

def chunk_ifc_data(ifc_data_string: str) -> dict:
    """
    Logically chunks IFC data into the header block and individual entity blocks.

    Args:
        ifc_data_string (str): The full IFC data as a single string.

    Returns:
        dict: A dictionary containing:
            'header_block': A string representing the complete HEADER section.
            'data_entities': A list of strings, where each string is a single
                             IFC entity definition (e.g., '#1= IFCORGANIZATION(...)').
                             Empty lines and section delimiters are excluded.
    """
    header_lines = []
    data_entity_lines = []
    
    in_header_section = False
    in_data_section = False

    lines = ifc_data_string.splitlines()

    for line in lines:
        stripped_line = line.strip()

        if not stripped_line: # Skip empty lines
            continue

        if stripped_line == "HEADER;":
            in_header_section = True
            in_data_section = False # Ensure data section is off
            header_lines.append(line)
            continue
        elif stripped_line == "DATA;":
            in_data_section = True
            in_header_section = False # Ensure header section is off
            # The 'DATA;' line itself is a section delimiter, not an entity
            continue
        elif stripped_line == "ENDSEC;":
            if in_header_section:
                header_lines.append(line)
            # This ENDSEC marks the end of the current section
            in_header_section = False
            in_data_section = False
            continue

        if in_header_section:
            header_lines.append(line)
        elif in_data_section:
            # Each line starting with '#' and containing '=' is an entity definition
            if stripped_line.startswith('#') and '=' in stripped_line:
                data_entity_lines.append(stripped_line)
            # You might encounter lines that are not entities within DATA section
            # (e.g., comments, but less common in raw output like this).
            # The current regex for entities will filter them out.

    return {
        'header_block': "\n".join(header_lines).strip(),
        'data_entities': data_entity_lines
    }

def parse_ifc_entity_line(entity_line: str) -> tuple[int | None, str | None, str | None]:
    """
    Parses a single IFC entity line into its ID, type, and raw parameters string.

    Args:
        entity_line (str): A string representing a single IFC entity, e.g.,
                           '#1= IFCORGANIZATION($,'AVEVA Solutions Limited',$,$,$);'

    Returns:
        tuple[int | None, str | None, str | None]: A tuple containing:
            - entity_id (int or None): The integer ID of the entity (e.g., 1).
            - entity_type (str or None): The type of the IFC entity (e.g., 'IFCORGANIZATION').
            - params_str (str or None): The raw string of parameters within the parentheses.
            Returns (None, None, None) if the line cannot be parsed.
    """
    match = re.match(r'^#(\d+)=\s*([A-Z_]+)\((.*)\);?$', entity_line)
    if match:
        entity_id = int(match.group(1))
        entity_type = match.group(2)
        params_str = match.group(3)
        return entity_id, entity_type, params_str
    return None, None, None

def extract_coordinates_from_ifc_data(ifc_data_string: str) -> dict:
    """
    Extracts Cartesian point coordinates and relevant property positions from IFC data.

    Args:
        ifc_data_string (str): The full IFC data as a string.

    Returns:
        dict: A dictionary containing:
            'cartesian_points': {entity_id: (x, y, z)} for IFCCARTESIANPOINT entities.
            'property_positions': {entity_id: (x, y, z)} for entities with 'POS' or 'HPOS'/'TPOS' properties.
    """
    chunked_data = chunk_ifc_data(ifc_data_string)
    data_entities = chunked_data['data_entities']

    cartesian_points = {}
    property_positions = {}

    # First pass: Collect all IFCCARTESIANPOINTs
    for entity_line in data_entities:
        entity_id, entity_type, params_str = parse_ifc_entity_line(entity_line)
        if entity_id is None:
            continue

        if entity_type == 'IFCCARTESIANPOINT':
            # Regex to capture three float values within parentheses
            coords_match = re.search(r'\(([-+]?\d+\.\d+e?[-+]?\d*),([-+]?\d+\.\d+e?[-+]?\d*),([-+]?\d+\.\d+e?[-+]?\d*)\)', params_str)
            if coords_match:
                x = float(coords_match.group(1))
                y = float(coords_match.group(2))
                z = float(coords_match.group(3))
                cartesian_points[entity_id] = (x, y, z)
        
        # Second pass (or combined): Identify property sets with position data
        # This is more complex as it requires parsing nested parameters.
        # For simplicity, we'll look for specific patterns for 'POS', 'HPOS', 'TPOS'
        # in IFCPROPERTYLISTVALUE within IFCPROPERTYSINGLEVALUE or IFCPROPERTYLISTVALUE
        # within IFCPROPERTYSET. This is a simplified regex approach.
        # A robust solution would require a full STEP parser or an IFC library.
        if entity_type == 'IFCPROPERTYSET':
            # Look for properties like 'POS', 'HPOS', 'TPOS' inside the property set
            # This regex is simplified and might not catch all variations or nested structures
            pos_match = re.search(r"'(POS|HPOS|TPOS)',[^,]*,\(IFCLENGTHMEASURE\(([-+]?\d+\.\d+e?[-+]?\d*)\),IFCLENGTHMEASURE\(([-+]?\d+\.\d+e?[-+]?\d*)\),IFCLENGTHMEASURE\(([-+]?\d+\.\d+e?[-+]?\d*)\)\)", params_str)
            if pos_match:
                # The entity_id here is the ID of the IFCPROPERTYSET, not the element it describes.
                # To link it to the actual element, you'd need to parse IFCPROPERTIESBYRELATIONSHIP
                # or similar relationships. For this function, we'll store it by the property set ID.
                x = float(pos_match.group(2))
                y = float(pos_match.group(3))
                z = float(pos_match.group(4))
                property_positions[entity_id] = (x, y, z)
            
            # Another common pattern for properties directly in IFCPROPERTYSINGLEVALUE
            # This specific data doesn't have it directly like this, but it's good to be aware.
            # For example, if a line was: #X= IFCPROPERTYSINGLEVALUE('POS',$,IFCLENGTHMEASURE(1.0),IFCLENGTHMEASURE(2.0),IFCLENGTHMEASURE(3.0),$);
            # This data uses IFCPROPERTYLISTVALUE for POS/HPOS/TPOS, handled above.

    return {
        'cartesian_points': cartesian_points,
        'property_positions': property_positions
    }


In [14]:
ifc_data = content

chunked_result = chunk_ifc_data(ifc_data)
print("--- HEADER BLOCK ---")
print(chunked_result['header_block'])
print("\n--- FIRST 5 DATA ENTITIES ---")
for i, entity in enumerate(chunked_result['data_entities'][:5]):
    print(entity)
print("\n--- LAST 5 DATA ENTITIES ---")
for i, entity in enumerate(chunked_result['data_entities'][-5:]):
    print(entity)
extracted_coords = extract_coordinates_from_ifc_data(ifc_data)
print("\n--- EXTRACTED CARTESIAN POINTS (first 5) ---")
for entity_id, coords in list(extracted_coords['cartesian_points'].items())[:5]:
    print(f"ID #{entity_id}: {coords}")
print("\n--- EXTRACTED PROPERTY POSITIONS (first 5) ---")
for entity_id, coords in list(extracted_coords['property_positions'].items())[:5]:
    print(f"PropertySet ID #{entity_id}: {coords}")

--- HEADER BLOCK ---
HEADER;
FILE_DESCRIPTION(('ViewDefinition [CoordinationView_V2.0]'),'2;1');
FILE_NAME('C:/Users/Public/Documents/AVEVA/USERDATA/FPS-PLW-582_IFC-EXPORT.ifc','2025-05-29T14:11:03',('bakal1bl@CRWV10-WENGB046'),(''),'','AVEVA E3D Design Design Mk3.1.7.2[Z3172-003]  (WINDOWS-NT 0.0)  (21 Feb 2024 : 14:44)','');
FILE_SCHEMA(('IFC2X3'));
ENDSEC;

--- FIRST 5 DATA ENTITIES ---
#1= IFCORGANIZATION($,'AVEVA Solutions Limited',$,$,$);
#2= IFCAPPLICATION(#1,'AVEVA E3D Design Design Mk3.1.7.2[Z3172-003]  (WINDOWS-NT 0.0)  (21 Feb 2024 : 14:44)','AVEVA E3D Design','AVEVA E3D Design');
#3= IFCPERSON($,'Undefined','Undefined',$,$,$,$,$);
#4= IFCORGANIZATION($,'Undefined',$,$,$);
#5= IFCPERSONANDORGANIZATION(#3,#4,$);

--- LAST 5 DATA ENTITIES ---
#57300= IFCLOCALPLACEMENT(#57298,#57236);
#57301= IFCLOCALPLACEMENT(#57298,#57236);
#57302= IFCLOCALPLACEMENT(#57298,#57236);
#57303= IFCLOCALPLACEMENT(#57298,#57236);
#57304= IFCLOCALPLACEMENT(#57298,#57236);

--- EXTRACTED CARTESIAN POI

In [15]:
chunked_result.keys()

dict_keys(['header_block', 'data_entities'])

In [16]:
# Write to file
with open(f'local/data/{file_name}_data.json', 'w') as f:
    json.dump(chunked_result, f)

In [110]:
chunked_result['header_block']

"HEADER;\nFILE_DESCRIPTION(('ViewDefinition [CoordinationView_V2.0]'),'2;1');\nFILE_NAME('C:/Users/Public/Documents/AVEVA/USERDATA/FPS-PLW-582_IFC-EXPORT.ifc','2025-05-29T14:11:03',('bakal1bl@CRWV10-WENGB046'),(''),'','AVEVA E3D Design Design Mk3.1.7.2[Z3172-003]  (WINDOWS-NT 0.0)  (21 Feb 2024 : 14:44)','');\nFILE_SCHEMA(('IFC2X3'));\nENDSEC;"

In [147]:
#chunked_result['data_entities'][200:300]

In [113]:
extracted_coords.keys()

dict_keys(['cartesian_points', 'property_positions'])

In [114]:
extracted_coords['property_positions']

{}

In [160]:
import re

def filter_critical_ifc_data(ifc_data_string, return_stats=False):
    """
    Filters IFC data string to keep only critical data elements.
    
    Args:
        ifc_data_string (str): Raw IFC data string
        return_stats (bool): If True, returns tuple with (critical_elements, stats_dict)
        
    Returns:
        list or tuple: List of critical IFC elements, or tuple with elements and stats
    """
    
    # Define critical IFC entity types
    critical_entities = {
        # Spatial structure
        'IFCPROJECT',
        'IFCSITE', 
        'IFCBUILDING',
        'IFCBUILDINGSTOREY',
        'IFCSPACE',
        
        # Spatial relationships
        'IFCRELCONTAINEDINSPATIALSTRUCTURE',
        'IFCRELAGGREGATES',
        
        # Product entities (building components)
        'IFCWALL',
        'IFCSLAB', 
        'IFCBEAM',
        'IFCCOLUMN',
        'IFCDOOR',
        'IFCWINDOW',
        'IFCFLOWFITTING',
        'IFCFLOWSEGMENT',
        'IFCFLOWTERMINAL',
        'IFCFURNISHINGELEMENT',
        'IFCMECHANICALFASTENER',
        'IFCRAILING',
        'IFCRAMP',
        'IFCRAMPFLIGHT',
        'IFCROOF',
        'IFCSTAIR',
        'IFCSTAIRFLIGHT',
        
        # Geometric representation context
        'IFCGEOMETRICREPRESENTATIONCONTEXT',
        'IFCGEOMETRICREPRESENTATIONSUBCONTEXT',
        'IFCUNITASSIGNMENT',
        
        # Placement and orientation
        'IFCLOCALPLACEMENT',
        'IFCCARTESIANPOINT',
        'IFCDIRECTION',
        'IFCAXIS2PLACEMENT3D',
        
        # Shape representation
        'IFCPRODUCTDEFINITIONSHAPE',
        'IFCSHAPEREPRESENTATION',
        
        # Solid geometry
        'IFCEXTRUDEDAREASOLID',
        'IFCBOOLEANCLIPPINGRESULT',
        'IFCCARTESIANPOINTLIST3D',
        'IFCPOLYLOOP',
        'IFCFACEOUTERBOUND',
        'IFCFACE',
        'IFCCLOSEDSHELL',
        'IFCFACETEDBREP',
        
        # Material identification
        'IFCMATERIAL',
        'IFCRELASSOCIATESMATERIAL',
        'IFCMATERIALLAYER',
        'IFCMATERIALLAYERSET',
        'IFCMATERIALLAYERSETUSAGE',
        
        # Property sets
        'IFCPROPERTYSET',
        'IFCPROPERTYSINGLEVALUE',
        'IFCPROPERTYLISTVALUE',
        'IFCRELDEFINESBYPROPERTIES'
    }
    
    # Find all elements that start with # and end with ;
    # Pattern matches: #number= followed by anything until ;
    pattern = r'#\d+=[^;]*;'
    all_elements = re.findall(pattern, ifc_data_string)
    total_elements = len(all_elements)
    
    critical_elements = []
    
    for element in all_elements:
        # Check if element contains any critical entity type
        element_upper = element.upper()
        
        for entity_type in critical_entities:
            if entity_type in element_upper:
                critical_elements.append(element)
                break  # Found a match, no need to check other entity types
    
    # Calculate statistics
    critical_count = len(critical_elements)
    removed_count = total_elements - critical_count
    removal_percentage = (removed_count / total_elements * 100) if total_elements > 0 else 0
    
    if return_stats:
        stats = {
            'total_elements': total_elements,
            'critical_elements': critical_count,
            'removed_elements': removed_count,
            'removal_percentage': removal_percentage
        }
        return critical_elements, stats
    
    return critical_elements

def print_critical_elements(critical_elements, max_display=10):
    """
    Print the critical elements in a readable format.
    
    Args:
        critical_elements (list): List of critical IFC elements
        max_display (int): Maximum number of elements to display
    """
    print(f"Found {len(critical_elements)} critical elements:")
    print("-" * 50)
    
    for i, element in enumerate(critical_elements[:max_display]):
        print(f"{i+1}. {element}")
    
    if len(critical_elements) > max_display:
        print(f"... and {len(critical_elements) - max_display} more elements")

def print_filtering_stats(stats):
    """
    Print filtering statistics in a readable format.
    
    Args:
        stats (dict): Statistics dictionary from filter_critical_ifc_data
    """
    print("\n" + "="*60)
    print("IFC FILTERING STATISTICS")
    print("="*60)
    print(f"Total elements in original data:  {stats['total_elements']:,}")
    print(f"Critical elements kept:           {stats['critical_elements']:,}")
    print(f"Elements removed:                 {stats['removed_elements']:,}")
    print(f"Removal percentage:               {stats['removal_percentage']:.1f}%")
    print(f"Data reduction factor:            {stats['total_elements']/stats['critical_elements']:.1f}x" if stats['critical_elements'] > 0 else "Data reduction factor:            N/A")
    print("="*60)


    

In [161]:
print("METHOD 2: Filtering with statistics")
critical_data_with_stats, stats = filter_critical_ifc_data(ifc_data, return_stats=True)

# Display the filtering statistics
print_filtering_stats(stats)


METHOD 2: Filtering with statistics

IFC FILTERING STATISTICS
Total elements in original data:  57,304
Critical elements kept:           56,940
Elements removed:                 364
Removal percentage:               0.6%
Data reduction factor:            1.0x
