## Autores

| Nome | nUSP |
| :--- | :--- |
| Guilherme de Abreu Barreto | 12543033 |
| Lucas Eduardo Gulka Pulcinelli | 12547336 |
| Vinicio Yusuke Hayashibara | 13642797 |

In [1]:
# Connection settings for the local PostgreSQL server.
DEFAULT_DATABASE = "postgres"   # maintenance database used to create the census one
CENSO_DATABASE = "censo2022"    # database that receives the census schema
USER = "postgres"
PASSWORD = "postgres"
HOST = "localhost"
PORT = 5432
# Base SQLAlchemy URL; callers append the database name to it.
# PORT is part of the URL so that changing it above actually takes effect.
URI = f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/"

In [11]:
import pandas as pd
from enum import Enum
from math import sqrt
from sqlalchemy import (
    BigInteger,
    Float,
    Integer,
    Index,
    String,
    CheckConstraint as constraint,
    UniqueConstraint as unique,
    PrimaryKeyConstraint as pkc,
    ForeignKeyConstraint as fkc,
    ForeignKey as fk,
    JSON,
    cast,
    create_engine,
    insert,
    text,
    func,
)
from sqlalchemy.orm import (
    Mapped,
    Session,
    composite,
    declarative_base,
    relationship,
    sessionmaker,
    mapped_column as column,
)
from sqlalchemy.dialects.postgresql import ENUM
from sqlalchemy.ext.hybrid import hybrid_method, hybrid_property
from sqlalchemy.sql.schema import CheckConstraint
from typing import Optional, Any

In [3]:
def backref(back_populates: str) -> Mapped[Any]:
    """Build a plain bidirectional relationship.

    Args:
        back_populates: Name of the attribute on the related class that
            mirrors this relationship.

    Returns:
        The ``relationship()`` construct, with no cascade options set.
    """
    link: Mapped[Any] = relationship(back_populates=back_populates)
    return link


def childOf(back_populates: str) -> Mapped[Any]:
    """Build the parent side of a bidirectional relationship that owns its children.

    Args:
        back_populates: Name of the attribute on the child class that
            points back to the parent.

    Returns:
        The ``relationship()`` construct configured with the
        ``"all, delete-orphan"`` cascade.
    """
    options = {
        "back_populates": back_populates,
        "cascade": "all, delete-orphan",
    }
    return relationship(**options)

def digits(name: str, column_name: str = "id") -> CheckConstraint:
    """Build a CHECK constraint requiring a column to contain only digits.

    Args:
        name: Name given to the constraint in the database.
        column_name: Column the regular expression is applied to. Defaults
            to ``"id"``, which was previously the only column supported.

    Returns:
        A check constraint whose SQL text is ``<column_name> ~ '^[0-9]+$'``
        (``~`` being PostgreSQL's regular-expression match operator).
    """
    return constraint(f"{column_name} ~ '^[0-9]+$'", name=name)

In [4]:
class Households:
    """Pair of household counts, split into urban and rural."""

    def __init__(self, urban: int, rural: int) -> None:
        self.urban, self.rural = urban, rural

    def __composite_values__(self) -> tuple[int, ...]:
        """Return the values as ``(urban, rural)``."""
        return self.urban, self.rural

    def __eq__(self, other):
        """Two instances are equal when both counts match."""
        if not isinstance(other, Households):
            return False
        return other.urban == self.urban and other.rural == self.rural

    @hybrid_property
    def total(self):
        """Instance-level total: urban plus rural households."""
        return self.urban + self.rural

    @total.expression
    def total(cls):
        """Class-level counterpart: the sum of ``cls.urban`` and ``cls.rural``."""
        return cls.urban + cls.rural

class Coordinate:
    """
    A geographic coordinate point with longitude (lon) and latitude (lat) components.
    """
    
    def __init__(self, longitude: float, latitude: float) -> None:
        self.longitude = longitude
        self.latitude = latitude

    @property
    def longitude(self) -> float:
        return self._longitude

    @longitude.setter
    def longitude(self, value: float) -> None:
        if not (-180 <= value <= 180):
            raise ValueError(f"Longitude must be between -180 and 180 degrees, got {value}")
        self._longitude = value

    @property
    def latitude(self) -> float:
        return self._latitude

    @latitude.setter
    def latitude(self, value: float) -> None:
        if not (-90 <= value <= 90):
            raise ValueError(f"Latitude must be between -90 and 90 degrees, got {value}")
        self._latitude = value

    def __composite_values__(self) -> tuple[float, ...]:
        return (self.longitude, self.latitude)

    def __eq__(self, other: "Coordinate") -> bool:
        return isinstance(other, Coordinate) and \
               other.longitude == self.longitude and \
               other.latitude == self.latitude
        
    def __ne__ (self, other: "Coordinate") -> bool:
        return not self.__eq__(other)

    @hybrid_method
    def distance(
        self,
        other: "Coordinate",
        metric: str = 'euclidean'
    ) -> float:
        """
        Calculate distance to another coordinate.
        
        Args:
            other: Coordinate instance
            metric: 'euclidean' or 'manhattan'
        
        Returns:
            Distance between coordinates
        """
        x = self.longitude - other.longitude
        y = self.latitude - other.latitude
        match metric:
            case 'euclidean':
                return sqrt(x**2 + y**2)
            case 'manhattan':
                return abs(x) + abs(y)
            case _:
                raise ValueError("Metric must be 'euclidean' or 'manhattan'")

    @distance.expression
    def distance(
        cls,
        other_lon: float,
        other_lat: float,
        metric: str = 'euclidean'
    ):
        match metric:
            case 'euclidean':
                return func.sqrt(
                    (cls.x - other_x) * (cls.x - other_x) +
                    (cls.y - other_y) * (cls.y - other_y)
                )
            case 'manhattan':
                return func.abs(cls.x - other_x) + func.abs(cls.y - other_y)
            case _:
                raise ValueError("Metric must be 'euclidean' or 'manhattan'")


class Biomes:
    """
    Percentage share of each biome, one attribute per biome name.

    Shares may be given by keyword (``Biomes(cerrado=60.0, caatinga=40.0)``)
    or positionally in the order of ``default`` (the order returned by
    ``__composite_values__``). Biomes that are omitted, or given as ``None``,
    take the default share of 0.0. The shares must add up to 100 within a
    tolerance of 0.1.
    """

    # Valid biome names, in column order, each mapped to its default share.
    default: dict[str, float] = {
        biome: 0.0 for biome in [
            'amazon_rainforest',
            'atlantic_forest',
            'caatinga',
            'cerrado',
            'pantanal',
            'pampas'
        ]
    }

    def __init__(self, *args: float, **kwargs: float) -> None:
        """
        Args:
            *args: Shares in the order of ``default``; fewer values than
                biomes are allowed.
            **kwargs: Shares keyed by biome name.

        Raises:
            TypeError: If too many positional values are given, or a biome
                is given both positionally and by keyword.
            ValueError: If a biome name is unknown or the shares do not
                total 100 (within 0.1).
        """
        names = list(self.default)
        if len(args) > len(names):
            raise TypeError(
                f"Biomes takes at most {len(names)} positional values, got {len(args)}"
            )
        positional = dict(zip(names, args))
        duplicated = positional.keys() & kwargs.keys()
        if duplicated:
            raise TypeError(
                f"Biomes given both positionally and by keyword: {sorted(duplicated)}"
            )
        self.distribution = {**positional, **kwargs}

    def __composite_values__(self) -> tuple[float, ...]:
        """Return the shares as a tuple, in the order of ``default``."""
        return tuple(getattr(self, biome) for biome in self.default)

    @property
    def distribution(self) -> dict[str, float]:
        """Current share of every biome, keyed by biome name."""
        return {biome: getattr(self, biome) for biome in self.default}

    @distribution.setter
    def distribution(self, values: dict[str, float]) -> None:
        # None means "not provided", so the default share applies.
        provided = {
            biome: share for biome, share in values.items() if share is not None
        }

        # Validation
        invalid_keys = set(provided) - set(self.default)
        if invalid_keys:
            raise ValueError(f"Invalid biome types: {invalid_keys}. Valid types are: {list(self.default.keys())}")
        merged_values = {**self.default, **provided}
        total = sum(merged_values.values())
        # Reject totals OUTSIDE the tolerance band (the check used to be inverted).
        if not (99.9 <= total <= 100.1):
            raise ValueError(f"Invalid biome distribution, totalling {total:.1f}%")
        self._distribution = merged_values

        for biome_type, value in merged_values.items():
            setattr(self, biome_type, value)

    @property
    def total(self) -> float:
        """Sum of all biome shares."""
        return sum(getattr(self, biome) for biome in self.default)

    @classmethod
    def toList(cls) -> list[str]:
        """Return the valid biome names, in the order of ``default``."""
        return list(cls.default)

In [5]:
# PostgreSQL ENUM type "state_enum" holding the 27 two-letter state codes;
# used as the column type of State.uf.
state_enum = ENUM(
    "AC", "AL", "AP", "AM", "BA", "CE", "DF", "ES", "GO",
    "MA", "MT", "MS", "MG", "PA", "PB", "PR", "PE", "PI",
    "RJ", "RN", "RS", "RO", "RR", "SC", "SP", "SE", "TO",
    name="state_enum",
)

In [12]:
# Declarative base class shared by every mapped model below; its metadata
# collects their tables.
Base = declarative_base()


class Region(Base):
    """Row of the ``regions`` table: a one-character numeric id and a unique name."""

    __tablename__: str = "regions"

    # Attributes
    id: Mapped[str] = column(
        String(1),
        digits("ck_region_id"),  # id must consist of digits only
        primary_key=True,
    )
    name: Mapped[str] = column(unique=True)

    # Relationships
    # Owning side: states are deleted together with (or when detached from) the region.
    states: Mapped[list["State"]] = relationship(
        back_populates="region",
        cascade="all, delete-orphan",
    )


class State(Base):
    """Row of the ``states`` table, identified by the pair ``(region_id, id)``."""

    __tablename__: str = "states"

    # Attributes
    id: Mapped[str] = column(String(1), digits("ck_state_id"))
    name: Mapped[str] = column(unique=True)
    uf: Mapped[str] = column(state_enum, unique=True)
    # Stored in the "longitude" and "latitude" columns.
    location: Mapped[Coordinate] = composite(
        column("longitude", Float),
        column("latitude", Float),
    )
    area: Mapped[float]
    # One Float column per biome name, in the order given by Biomes.toList().
    biome_distribution: Mapped[Biomes] = composite(
        *(column(biome_name, Float) for biome_name in Biomes.toList())
    )

    # Foreign keys
    region_id: Mapped[str] = column(fk("regions.id"))

    # Relationships
    region: Mapped["Region"] = relationship(back_populates="states")
    # Owning side: cities are deleted together with (or when detached from) the state.
    cities: Mapped[list["City"]] = relationship(
        back_populates="state",
        cascade="all, delete-orphan",
    )

    __table_args__: tuple[pkc, unique,] = (
        pkc("region_id", "id"),
        # No two states may share the same coordinates.
        unique("longitude", "latitude", name="uq_state_location"),
    )


class City(Base):
    """Row of the ``cities`` table, identified by ``(region_id, state_id, id)``."""

    __tablename__: str = "cities"

    # Attributes
    id: Mapped[str] = column(String(5), digits("ck_city_id"))
    name: Mapped[str]
    is_capital: Mapped[bool] = column(default=False, server_default='false')
    # Stored in the "longitude" and "latitude" columns.
    location: Mapped[Coordinate] = composite(
        column("longitude", Float),
        column("latitude", Float)
    )
    # The check must target the ddd column itself; the shared digits() helper
    # validated "id" here, leaving ddd unchecked.
    ddd: Mapped[str] = column(
        String(2),
        constraint("ddd ~ '^[0-9]+$'", name="ck_city_ddd"),
    )
    # Stored in the "urban" and "rural" columns.
    households: Mapped[Households] = composite(
        column("urban", Integer),
        column("rural", Integer)
    )
    population_race: Mapped[dict] = column(JSON)
    population_education: Mapped[dict] = column(JSON)

    # Foreign keys
    timezone_name: Mapped[str] = column(fk("timezones.name"))
    # region_id and state_id together reference states(region_id, id);
    # the composite foreign key is declared in __table_args__.
    region_id: Mapped[str]
    state_id: Mapped[str]

    # Relationships
    timezone: Mapped["Timezone"] = backref("cities")
    state: Mapped["State"] = backref("cities")

    @hybrid_property
    def ibge_code(self) -> str:
        """Python-side property: region id, state id and city id concatenated."""
        return self.region_id + self.state_id + self.id

    @ibge_code.expression
    def ibge_code(cls):
        """SQL-side expression: the same concatenation, cast to BigInteger.

        NOTE(review): the Python side returns a ``str`` while this expression
        yields a BigInteger - confirm which type callers expect.
        """
        return cast(
            func.concat(
                cast(cls.region_id, String),
                cast(cls.state_id, String),
                cast(cls.id, String)
            ),
            BigInteger
        )

    __table_args__: tuple[pkc, fkc, unique, Index] = (
        pkc("region_id", "state_id", "id"),
        fkc(
            ['region_id', 'state_id'],
            ['states.region_id', 'states.id'],
            name='fk_region_composite'
        ),
        unique("longitude", "latitude", name="uq_city_location"),
        # Unique partial index: at most one capital per (region_id, state_id).
        # A missing comma after the name used to merge it with 'region_id',
        # leaving state_id as the only indexed column.
        Index(
            'state_capitals_index',
            'region_id', 'state_id',
            postgresql_where=text('is_capital'),
            unique=True,
        )
    )


class Timezone(Base):
    """Row of the ``timezones`` table: a timezone name and its UTC offset."""

    __tablename__: str = "timezones"

    # Attributes
    name: Mapped[str] = column(primary_key=True)
    utc_offset: Mapped[int]

    # Relationships
    # Mirror of City.timezone; no cascade is configured on this side.
    cities: Mapped[list["City"]] = relationship(back_populates="timezone")

In [7]:
# Connect to the maintenance database in order to create the census one.
engine = create_engine(URI + DEFAULT_DATABASE, echo=True)

# CREATE DATABASE cannot run inside a transaction block, hence AUTOCOMMIT.
with engine.connect().execution_options(isolation_level="AUTOCOMMIT") as conn:
    # Look the database up first so that re-running this cell does not fail
    # with a "database already exists" error.
    already_exists = conn.execute(
        text("SELECT 1 FROM pg_database WHERE datname = :name"),
        {"name": CENSO_DATABASE},
    ).scalar() is not None
    if not already_exists:
        # Identifiers cannot be bound as parameters; CENSO_DATABASE is a
        # constant defined in this notebook, not user input.
        conn.execute(text(f"CREATE DATABASE {CENSO_DATABASE}"))

2025-09-19 20:54:24,954 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-09-19 20:54:24,955 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-09-19 20:54:24,957 INFO sqlalchemy.engine.Engine select current_schema()
2025-09-19 20:54:24,958 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-09-19 20:54:24,960 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-09-19 20:54:24,961 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-09-19 20:54:24,965 INFO sqlalchemy.engine.Engine BEGIN (implicit; DBAPI should not BEGIN due to autocommit mode)
2025-09-19 20:54:24,966 INFO sqlalchemy.engine.Engine CREATE DATABASE censo2022
2025-09-19 20:54:24,967 INFO sqlalchemy.engine.Engine [generated in 0.00229s] {}
2025-09-19 20:54:25,014 INFO sqlalchemy.engine.Engine ROLLBACK using DBAPI connection.rollback(), DBAPI should ignore due to autocommit mode


In [13]:
# Rebind `engine` to the census database (it previously pointed at the
# default database) and emit CREATE TABLE for every model registered on Base.
engine = create_engine(URI + CENSO_DATABASE, echo=True)
Base.metadata.create_all(engine)

2025-09-19 21:19:32,468 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-09-19 21:19:32,469 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-09-19 21:19:32,471 INFO sqlalchemy.engine.Engine select current_schema()
2025-09-19 21:19:32,471 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-09-19 21:19:32,473 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-09-19 21:19:32,474 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-09-19 21:19:32,477 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-09-19 21:19:32,480 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname

In [16]:
# Seed data -------------------------------------------------------------
# Region names; each region's id is its 1-based position in this list.
regions_data = ["Norte", "Nordeste", "Sudeste", "Sul", "Centro-Oeste"]

# (timezone name, UTC offset) pairs.
# NOTE(review): 'America/Brasilia' does not look like an IANA tz database
# identifier - confirm the intended name.
timezones_data = [
    ('America/Noronha', -2),
    ('America/Sao_Paulo', -3),
    ('America/Brasilia', -3),
    ('America/Recife', -3),
    ('America/Porto_Velho', -4),
    ('America/Manaus', -4),
    ('America/Rio_Branco', -5),
]

# Session factory bound to the census database engine.
Session = sessionmaker(bind=engine)

with Session() as session:
    # Regions: ids "1".."5" following the list order.
    regions = [
        Region(id=str(position), name=label)
        for position, label in enumerate(regions_data, start=1)
    ]
    session.add_all(regions)

    # Timezones: one row per (name, offset) pair.
    timezones = [
        Timezone(name=tz_name, utc_offset=hours)
        for tz_name, hours in timezones_data
    ]
    session.add_all(timezones)

    # Persist both batches in a single transaction.
    session.commit()

2025-09-19 21:24:47,102 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-09-19 21:24:47,105 INFO sqlalchemy.engine.Engine INSERT INTO regions (id, name) VALUES (%(id__0)s, %(name__0)s), (%(id__1)s, %(name__1)s), (%(id__2)s, %(name__2)s), (%(id__3)s, %(name__3)s), (%(id__4)s, %(name__4)s)
2025-09-19 21:24:47,109 INFO sqlalchemy.engine.Engine [cached since 222s ago (insertmanyvalues) 1/1 (unordered)] {'name__0': 'Norte', 'id__0': '1', 'name__1': 'Nordeste', 'id__1': '2', 'name__2': 'Sudeste', 'id__2': '3', 'name__3': 'Sul', 'id__3': '4', 'name__4': 'Centro-Oeste', 'id__4': '5'}
2025-09-19 21:24:47,116 INFO sqlalchemy.engine.Engine INSERT INTO timezones (name, utc_offset) VALUES (%(name__0)s, %(utc_offset__0)s), (%(name__1)s, %(utc_offset__1)s), (%(name__2)s, %(utc_offset__2)s), (%(name__3)s, %(utc_offset__3)s), (%(name__4)s, %(utc_offset__4)s), (%(name__5)s, %(utc_offset__5)s), (%(name__6)s, %(utc_offset__6)s)
2025-09-19 21:24:47,117 INFO sqlalchemy.engine.Engine [cached since 222s ag