cordada · svillegas-cdd · Sep 22, 2025 · Sep 12, 2025 · Sep 16, 2025 · Sep 16, 2025
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.57.0
+current_version = 0.58.0
 commit = True
 tag = False
 message = chore: Bump version from {current_version} to {new_version}

@@ -1,5 +1,11 @@
 # History
 
+## 0.58.0 (2025-09-22)
+
+- (PR #890, 2025-09-16) cte: Add parser for "Datos del Contribuyente"
+- (PR #896, 2025-09-16) rut: Add regex for canonical RUT that is compatible with JSON Schema
+- (PR #895, 2025-09-16) cte: Add parser for "Propiedades y Bienes Raíces"
+
 ## 0.57.0 (2025-09-15)
 
 - (PR #888, 2025-09-10) tests: Refactor and improve constants tests

@@ -4,4 +4,4 @@
 
 """
 
-__version__ = '0.57.0'
+__version__ = '0.58.0'
@@ -1,6 +1,9 @@
 from __future__ import annotations
 
 from collections.abc import Sequence
+from datetime import date
+from decimal import Decimal, InvalidOperation
+from typing import Optional
 
 import pydantic
 
@@ -38,3 +41,135 @@ class LegalRepresentative:
     """
     Fecha de incorporación.
     """
+
+
+# Parsed content of the CTE section "Datos del Contribuyente" (taxpayer data).
+# Immutable (frozen) pydantic dataclass; unknown fields are rejected (extra='forbid').
+@pydantic.dataclasses.dataclass(
+    frozen=True,
+    config=pydantic.ConfigDict(
+        arbitrary_types_allowed=True,
+        extra='forbid',
+    ),
+)
+class TaxpayerData:
+    # Start-of-activities date; may be None.
+    start_of_activities_date: Optional[date]
+    """
+    Fecha de inicio de actividades.
+    """
+    # Economic activities, as a single string.
+    economic_activities: str
+    """
+    Actividades Económicas
+    """
+    # Tax category.
+    tax_category: str
+    """
+    Categoría Tributaria
+    """
+    # Address (domicile).
+    address: str
+    """
+    Domicilio
+    """
+    # Branches, one string per branch.
+    branches: Sequence[str]
+    """
+    Sucursales
+    """
+    # Latest stamped ("timbrados") documents.
+    last_filed_documents: Sequence[LastFiledDocument]
+    """
+    Últimos documentos timbrados
+    """
+    # Tax observations; optional, defaults to None.
+    tax_observations: Optional[str] = None
+    """
+    Observaciones tributarias
+    """
+
+
+# One entry of the "Últimos documentos timbrados" list: a document name and its date.
+# Immutable (frozen) pydantic dataclass.
+@pydantic.dataclasses.dataclass(
+    frozen=True,
+)
+class LastFiledDocument:
+    # Name of the document.
+    name: str
+    # Date associated with the document.
+    # NOTE(review): the field name is the same as the imported ``date`` type; this presumably
+    # works because annotations are lazy strings (``from __future__ import annotations``) and
+    # the field has no default value — confirm before adding a default here.
+    date: date
+
+
+# Parsed content of the CTE section "Propiedades y Bienes Raíces (3)" (properties and
+# real estate). Immutable (frozen) pydantic dataclass; unknown fields are rejected.
+@pydantic.dataclasses.dataclass(
+    frozen=True,
+    config=pydantic.ConfigDict(
+        arbitrary_types_allowed=True,
+        extra='forbid',
+    ),
+)
+class TaxpayerProperties:
+    """
+    Propiedades y Bienes Raíces (3)
+    """
+
+    # Properties listed in the section, one ``Property`` per entry.
+    properties: Sequence[Property]
+
+
+# A single property entry of the CTE section "Propiedades y Bienes Raíces".
+# Immutable (frozen) pydantic dataclass. Every field is optional in value (may be None)
+# but must be provided. Raw strings are normalized by the validators defined below.
+@pydantic.dataclasses.dataclass(
+    frozen=True,
+)
+class Property:
+    commune: Optional[str]
+    """
+    Commune ("Comuna").
+    """
+    role: Optional[str]
+    """
+    Property role ("Rol").
+    """
+    address: Optional[str]
+    """
+    Address ("Dirección").
+    """
+    purpose: Optional[str]
+    """
+    Purpose ("Destino").
+    """
+    fiscal_valuation: Optional[Decimal]
+    """
+    Fiscal valuation ("Avalúo Fiscal").
+    """
+    overdue_installments: Optional[bool]
+    """
+    Whether there are overdue installments to be paid ("Cuotas vencidas por pagar").
+    """
+    current_installments: Optional[bool]
+    """
+    Whether there are current installments to be paid ("Cuotas vigentes por pagar").
+    """
+    condition: Optional[str]
+    """
+    Condition ("Condición").
+    """
+
+    ###########################################################################
+    # Validators
+    ###########################################################################
+
+    @pydantic.field_validator('fiscal_valuation', mode='before')
+    @classmethod
+    def parse_fiscal_valuation(cls, v: Optional[str]) -> Optional[Decimal]:
+        """
+        Convert a number string that uses ``.`` as thousands separator and ``,`` as decimal
+        separator (e.g. ``'1.234.567,89'``) into a ``Decimal``.
+
+        Blank strings become ``None``. Non-string values are returned unchanged so that
+        Pydantic's regular validation handles them.
+
+        Raises:
+            ValueError: If the string is not a valid number.
+        """
+        if not isinstance(v, str):
+            return v
+
+        normalized = v.strip().replace('.', '').replace(',', '.')
+        if not normalized:
+            return None
+
+        try:
+            return Decimal(normalized)
+        except InvalidOperation as exc:
+            # Pydantic only wraps ``ValueError``/``AssertionError`` raised by validators into
+            # a ``ValidationError``; ``InvalidOperation`` is an ``ArithmeticError``.
+            raise ValueError(f'Invalid fiscal valuation: {v!r}') from exc
+
+    @pydantic.field_validator('commune', 'role', 'address', 'purpose', 'condition')
+    @classmethod
+    def parse_str_fields(cls, v: Optional[str]) -> Optional[str]:
+        """
+        Replace empty or whitespace-only strings with ``None``; return other values unchanged.
+        """
+        if isinstance(v, str) and not v.strip():
+            return None
+        return v
+
+    @pydantic.field_validator('current_installments', 'overdue_installments', mode='before')
+    @classmethod
+    def parse_boolean_fields(cls, v: Optional[str | bool]) -> Optional[bool]:
+        """
+        Map ``'SI'`` to ``True`` and ``'NO'`` to ``False``.
+
+        Booleans are returned unchanged. Any other string or value becomes ``None``.
+        """
+        if isinstance(v, bool):
+            return v
+        if isinstance(v, str):
+            return {'SI': True, 'NO': False}.get(v)
+        return None
@@ -1,8 +1,17 @@
 from __future__ import annotations
 
+from datetime import datetime
+
 from bs4 import BeautifulSoup
 
-from .data_models import LegalRepresentative, TaxpayerProvidedInfo
+from .data_models import (
+    LastFiledDocument,
+    LegalRepresentative,
+    Property,
+    TaxpayerData,
+    TaxpayerProperties,
+    TaxpayerProvidedInfo,
+)
 
 
 def parse_taxpayer_provided_info(html_content: str) -> TaxpayerProvidedInfo:
@@ -89,3 +98,139 @@ def parse_taxpayer_provided_info(html_content: str) -> TaxpayerProvidedInfo:
         company_formation=company_formation,
         participation_in_existing_companies=participation_in_companies,
     )
+
+
+def parse_taxpayer_data(html_content: str) -> TaxpayerData:
+    """
+    Extract the section "Datos del Contribuyente" from the HTML content of a CTE.
+
+    Args:
+        html_content: HTML string containing the taxpayer information table
+
+    Returns:
+        TaxpayerData instance with the parsed data
+
+    Raises:
+        ValueError: If the table with id ``tbl_dbcontribuyente`` is not found, or if a
+            date does not match the format ``DD-MM-YYYY``.
+    """
+    date_format = "%d-%m-%Y"
+
+    soup = BeautifulSoup(html_content, 'html.parser')
+    table = soup.find('table', id='tbl_dbcontribuyente')
+    if not table:
+        raise ValueError("Could not find 'Datos del Contribuyente' table in HTML")
+
+    def cell_text(cell_id: str, separator: str = '') -> str | None:
+        # Stripped text of the element with the given id, or None if there is no such element.
+        cell = table.find(id=cell_id)  # type: ignore[attr-defined]
+        return cell.get_text(separator=separator, strip=True) if cell else None
+
+    # A missing or empty start date is reported as None.
+    start_date_text = cell_text('td_fecha_inicio')
+    start_of_activities_date = (
+        datetime.strptime(start_date_text, date_format).date() if start_date_text else None
+    )
+
+    # Missing cells fall back to an empty string for these three fields.
+    economic_activities = cell_text('td_actividades', "\n") or ""
+    tax_category = cell_text('td_categoria') or ""
+    address = cell_text('td_domicilio') or ""
+
+    # Sucursales: the values are in the cell that follows the one labeled "Sucursales:".
+    branches = []
+    branches_label_cell = table.find(  # type: ignore[attr-defined]
+        'td',
+        string=lambda text: text and 'Sucursales:' in text,
+    )
+    branches_value_cell = (
+        branches_label_cell.find_next_sibling('td') if branches_label_cell else None
+    )
+    if branches_value_cell:
+        branch_lines = branches_value_cell.get_text(separator="\n", strip=True).split("\n")
+        branches = [line for line in branch_lines if line]
+
+    # Últimos documentos timbrados: names and dates are in two cells, paired line by line.
+    last_filed_documents = []
+    names_text = cell_text('td_tim_nombre', "\n")
+    dates_text = cell_text('td_tim_fecha', "\n")
+    if names_text is not None and dates_text is not None:
+        last_filed_documents = [
+            LastFiledDocument(
+                name=doc_name,
+                date=datetime.strptime(doc_date_text, date_format).date(),
+            )
+            for doc_name, doc_date_text in zip(names_text.split("\n"), dates_text.split("\n"))
+            if doc_name and doc_date_text
+        ]
+
+    # Observaciones tributarias: None only when the cell is absent.
+    tax_observations = cell_text('td_observaciones')
+
+    return TaxpayerData(
+        start_of_activities_date=start_of_activities_date,
+        economic_activities=economic_activities,
+        tax_category=tax_category,
+        address=address,
+        branches=branches,
+        last_filed_documents=last_filed_documents,
+        tax_observations=tax_observations,
+    )
+
+
+def parse_taxpayer_properties(html_content: str) -> TaxpayerProperties:
+    """
+    Parse the CTE HTML content to extract the content of the section:
+    "Propiedades y Bienes Raíces (3)"
+
+    Args:
+        html_content: HTML string containing the taxpayer properties table
+
+    Returns:
+        TaxpayerProperties instance with the parsed data
+
+    Raises:
+        ValueError: If the table with id ``tbl_propiedades`` is not found.
+    """
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    # Find the main table with id="tbl_propiedades"
+    table = soup.find('table', id='tbl_propiedades')
+    if not table:
+        raise ValueError("Could not find 'Propiedades y Bienes Raíces' table in HTML")
+
+    properties = []
+    rows = table.find_all('tr')  # type: ignore[attr-defined]
+    for row in rows[2:]:  # Skip the first two rows (headers)
+
+        # Skip rows without useful data
+        cells = row.find_all('td')
+        if len(cells) < 8:
+            continue
+
+        # Empty cells are passed as None; `Property` validators convert the raw strings.
+        (
+            commune,
+            role,
+            address,
+            purpose,
+            fiscal_valuation,
+            overdue_installments,
+            current_installments,
+            condition,
+        ) = [cell.get_text(strip=True) or None for cell in cells[:8]]
+
+        properties.append(
+            Property(
+                commune=commune,
+                role=role,
+                address=address,
+                purpose=purpose,
+                fiscal_valuation=fiscal_valuation,
+                overdue_installments=overdue_installments,
+                current_installments=current_installments,
+                condition=condition,
+            )
+        )
+    return TaxpayerProperties(properties=properties)
@@ -4,7 +4,6 @@
 
 from __future__ import annotations
 
-import re
 import sys
 from typing import Any, ClassVar, Pattern
 
@@ -79,12 +78,8 @@ class _RutPydanticAnnotation:
     >>> example_json_schema = example_type_adapter.json_schema()
     """
 
-    RUT_CANONICAL_STRICT_REGEX: ClassVar[Pattern] = re.compile(
-        re.sub(
-            pattern=r'\?P<\w+>',
-            repl='',
-            string=cl_sii.rut.constants.RUT_CANONICAL_STRICT_REGEX.pattern,
-        )
+    RUT_CANONICAL_STRICT_REGEX: ClassVar[Pattern] = (
+        cl_sii.rut.constants.RUT_CANONICAL_STRICT_JSON_SCHEMA_REGEX
     )
     """
     RUT (strict) regex for canonical format, without named groups.

@@ -7,6 +7,7 @@
 """
 
 import re
+from typing import Pattern
 
 import cryptography.x509
 
@@ -22,6 +23,17 @@
 RUT_DIGITS_MIN_VALUE = 1
 """RUT digits min value."""
 
+RUT_CANONICAL_STRICT_JSON_SCHEMA_REGEX: Pattern[str] = re.compile(r"^(\d{1,8})-([\dK])$")
+"""
+RUT (strict) regex for canonical format that can be used in a JSON Schema.
+
+JSON Schema and OpenAPI use the regular expression syntax from JavaScript (ECMA 262), which
+does not support Python's named groups, so this regex uses only unnamed groups.
+
+.. tip:: If you need the regex as a string, for example to use it in a JSON Schema or
+    OpenAPI schema, use ``RUT_CANONICAL_STRICT_JSON_SCHEMA_REGEX.pattern``.
+"""
+
 SII_CERT_TITULAR_RUT_OID = cryptography.x509.oid.ObjectIdentifier("1.3.6.1.4.1.8321.1")
 """OID of the RUT of the certificate holder"""
 # - Organismo: MINISTERIO DE ECONOMÍA / SUBSECRETARIA DE ECONOMIA
Original file line number	Diff line number	Diff line change
Expand Up		@@ -4,4 +4,4 @@

		"""

		__version__ = '0.57.0'
		__version__ = '0.58.0'