# La Estructura del Estado

> ...

Tipología:
- Direcciones
- Institutos
- ...

## Getting Started: Environment Setup

In [9]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import requests
from rich import print
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import duckdb
import requests
from io import BytesIO
import pymupdf4llm
import pymupdf
import fs
import collections
import sys
import pathlib
import os
import dataclasses
import typing
import pydantic
import gzip

In [11]:
## Make the directory two levels above the current working directory importable.
## (The working directory is normally the notebook's own directory.)
## Project packages that live up there, such as "scripts", can then be imported.

import sys
from pathlib import Path

working_dir = Path().resolve()
path = working_dir.parent.parent

## Register the directory only once so re-running the cell does not duplicate the entry.
search_entry = str(path)
if search_entry not in sys.path:
    sys.path.append(search_entry)

import scripts

## Compilationem

> Download, store, and convert from pdf to markdown (text).

In [4]:
## Location of the institutional classifier PDF (served from digepres.gob.do).
institutionalClassifierURL = "https://www.digepres.gob.do/wp-content/uploads/2024/08/Clasificador-Institucional.pdf"

## In-memory filesystem used as storage for the downloaded and converted files.
mem_fs = fs.open_fs("mem://")

## Create the folder the later cells write into; the returned sub-filesystem is the cell output.
mem_fs.makedirs("datasets")

SubFS(MemoryFS(), '/datasets')

In [5]:
## Download the classifier PDF and write it to the in-memory file system.
## - The timeout keeps the cell from hanging indefinitely on an unresponsive server.
## - A non-200 response raises instead of only printing, so later cells never run
##   against a file that was not written.
response = requests.get(institutionalClassifierURL, timeout=60)
if response.status_code != 200:
    raise RuntimeError(f"Failed to download the file. Status code: {response.status_code}")

with mem_fs.open("datasets/Clasificador-Institucional.pdf", 'wb') as handler:
    handler.write(response.content)

In [6]:
## Convert the PDF data to Markdown text using PyMuPDF and pymupdf4llm.

## Copy the stored PDF bytes into a seekable in-memory buffer.
with mem_fs.open("datasets/Clasificador-Institucional.pdf", "rb") as handler:
    in_memory_file = BytesIO(handler.read())

## The buffer owns a full copy of the bytes, so the document can be opened
## after the source file handle has been closed.
doc = pymupdf.open(stream=in_memory_file, filetype="pdf")
text = pymupdf4llm.to_markdown(doc)

## Store the Markdown next to the PDF in the in-memory file system.
with mem_fs.open("datasets/Clasificador-Institucional.md", "w") as handler:
    handler.write(text)

Processing ...


## Processus 

> Parse the data, and generate a csv.

In [20]:
## Placeholder result container: an empty name plus an empty list of data.
result = dict(name="", data=[])

# print(dir(scripts))
# print(scripts.dataset.Dataset)

## New Dataset instance with its name, records and schema set to empty strings.
data = scripts.dataset.Dataset()
for attribute in ("name", "records", "schema"):
    setattr(data, attribute, "")


In [8]:
## Read the generated Markdown back as a list of lines (each keeps its line ending).
with mem_fs.open("datasets/Clasificador-Institucional.md", "r") as handler:
    lines = list(handler)

In [7]:
## Which lines constitute a 'state organization'?
## After checking the PDF: the lines that contain exactly twelve '|' characters are the organization rows to parse.
## collections.Counter([line.count("|") for line in lines])

In [4]:
## dataclass
@dataclasses.dataclass
class Organization:
    """One parsed row of the institutional classifier.

    The first ten fields are the classification components that are joined
    into ``codigo``; ``denominacion`` is stored alongside them and is not part
    of the code.
    """

    sector: str
    subsector: str
    area: str
    subarea: str
    seccion: str
    poderes: str
    entidad: str
    capitulo: str
    subcapitulo: str
    unidad_ejecutora: str
    denominacion: str

    @property
    def codigo(self) -> str:
        """Return the classification components joined with '.'.

        Components that are empty (falsy) are left out, so a partially
        filled row yields a shorter code.
        """
        components = (
            self.sector,
            self.subsector,
            self.area,
            self.subarea,
            self.seccion,
            self.poderes,
            self.entidad,
            self.capitulo,
            self.subcapitulo,
            self.unidad_ejecutora,
        )
        return ".".join(component for component in components if component)

    @property
    def is_institution(self) -> bool:
        """Tell whether this row represents an institution.

        A row counts as an institution when its ``codigo`` contains exactly
        nine '.' separators.

        Returns:
            bool: True when the separator count matches, otherwise False.
        """
        expected_separators = 9
        separators_found = self.codigo.count('.')
        return separators_found == expected_separators

    def __repr__(self) -> str:
        """Render every field plus the derived ``codigo`` as ``name=value`` pairs."""
        shown_attributes = (
            "sector", "subsector", "area", "subarea", "seccion", "poderes",
            "entidad", "capitulo", "subcapitulo", "unidad_ejecutora",
            "denominacion", "codigo",
        )
        rendered = ", ".join(
            f"{attribute}={getattr(self, attribute)}" for attribute in shown_attributes
        )
        return f"Organization({rendered})"

In [10]:
## Parsing
## Rows that describe organizations contain exactly this many '|' characters.
PIPES_PER_ORGANIZATION_ROW = 12
## Number of matching rows discarded at the start and end of the selection.
## NOTE(review): presumably table header / trailing non-data rows — confirm against the PDF.
LEADING_ROWS_TO_SKIP = 3
TRAILING_ROWS_TO_SKIP = 1

candidate_rows = [line for line in lines if line.count('|') == PIPES_PER_ORGANIZATION_ROW]
filtered_lines = candidate_rows[LEADING_ROWS_TO_SKIP:len(candidate_rows) - TRAILING_ROWS_TO_SKIP]
organizations = []

for row in filtered_lines:
    row_body = row.strip()[1:-1]  ## Remove "|" from the start and end.
    ## Named `parsed_fields` rather than `result`, so the `result` dict defined
    ## in an earlier cell is not overwritten.
    parsed_fields = scripts.parse(row_body)
    organizations.append(Organization(**parsed_fields))

## Data Set Generation

> Takes the 'organizations' data and creates a new Parquet dataset.

Needs:
- Store multiple datasets / 'json schemas'.
- Can have complex data (nested).
- Use gzip + json file with all the schemas for easy reading.
- Dataset can only be generated from a tag.

Tools:
- Gzip
- Apache Parquet vs Apache Avro vs Apache ORC  or JSON.
- ?

- https://www.gnu.org/software/gzip/manual/gzip.html
- https://avro.apache.org/
- https://www.json.org/json-en.html
- https://orc.apache.org/

In [None]:
# with gzip.open('result.log.gz', 'wb') as handler:
#     handler.write(b"abc")

## Dataset container: the dataset name plus its (still empty) list of records.
## The keys are string literals; bare identifiers would be looked up as variables.
datasets = {
    "name": "organizaciones",
    "datos": [],
}

In [24]:
## Schema

## /data
## index

# print(pydantic.TypeAdapter(Organization).json_schema())

# index  = {[
#     {
#         name: ..
#         schema: ...
#         data:  id
#     } 
# ]
# }

In [None]:
## Select every row of `dsorganismos` whose `organismo` value contains "Reforma".
## NOTE(review): `dsorganismos` is not defined anywhere in the visible notebook —
## presumably a table or DataFrame created in another session; confirm and load it
## explicitly before this cell so the notebook survives Restart & Run All.
duckdb.sql("""
    SELECT *
    FROM dsorganismos
    where organismo like '%Reforma%'
    """)

┌────────────────────────────────────────────────────────────┬───────────────────────────────────────────┬─────────────┐
│                         Organismo                          │                 Tipología                 │   Sector    │
│                          varchar                           │                  varchar                  │   varchar   │
├────────────────────────────────────────────────────────────┼───────────────────────────────────────────┼─────────────┤
│ Comisión Permanente para la Reforma y Modernización de l…  │ Órgano Colegiado con Estructura Operativa │ Defensa     │
│ Fondo Patrimonial de las Empresas Reformadas (FONPER)      │ Organismo Descentralizado Funcionalmente  │ Presidencia │
└────────────────────────────────────────────────────────────┴───────────────────────────────────────────┴─────────────┘

## Referencias

- [Listado de Organismos](https://www.sismap.gob.do/GestionPublica/Organismos)

- [Información General de RD](https://www.presidencia.gob.do/acerca-de-rd/informacion-general)

- [Consulta de Organismos](https://map.gob.do/COEDOM/Home/Search)

- https://sismap.gob.do/

- https://www.sismap.gob.do/Municipal/

- [Clasificadores Presupuestarios](https://www.digepres.gob.do/publicaciones/clasificadores-presupuestarios/)

- [Organismos del Estado Dominicano](https://www.sismap.gob.do/Central/Home/About)

- [Clasificación de los Organismos de la Administración del Estado Dominicano](https://map.gob.do/COEDOM/Home/Clasificacion)

- [Miguel Collado: RD es el cuarto país de América Latina con mayor porcentaje de empleados públicos](https://www.diariolibre.com/economia/macroeconomia/2024/06/20/cuanto-gasta-el-gobierno-en-la-nomina-publica/2761319)