In [1]:
# Connection settings for the local PostgreSQL server.
# NOTE(review): these are hard-coded local-development credentials; move them
# to environment variables before pointing the notebook at a shared server.
DEFAULT_DATABASE = "postgres"  # maintenance database used to (re)create the target one
NOBEL_DATABASE = "nobel"       # database that holds the notebook's tables
USER = "postgres"
PASSWORD = "postgres"
HOST = "localhost"
PORT = 5432
# Base URI without a database name; callers append DEFAULT_DATABASE or
# NOBEL_DATABASE. PORT is included so that changing it above takes effect.
URI = f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/"

In [2]:
import datetime as dt
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import unicodedata
import re
import json
import csv
from enum import Enum
from dataclasses import dataclass, fields
from math import sqrt
from matplotlib.patches import Patch
from sqlalchemy import (
    Column,
    Float,
    Integer,
    Index,
    String,
    Text,
    CheckConstraint as constraint,
    UniqueConstraint as unique,
    PrimaryKeyConstraint as pkc,
    ForeignKeyConstraint as fkc,
    ForeignKey as fk,
    and_,
    cast,
    create_engine,
    insert,
    text,
    func,
    select,
    literal,
    update,
    values
)
from sqlalchemy.orm import (
    Mapped,
    Session,
    declarative_base,
    declared_attr,
    relationship,
    sessionmaker,
    mapped_column as column,
    validates
)
from sqlalchemy.engine.interfaces import Dialect
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.ext.hybrid import hybrid_method, hybrid_property
from sqlalchemy.sql.schema import CheckConstraint
from sqlalchemy.orm.descriptor_props import Composite
from sqlalchemy.ext.mutable import MutableList
from sqlalchemy.schema import Column
from sqlalchemy.types import UserDefinedType
from typing import Optional, Any, final

In [3]:
# Imported here because the notebook's import cell does not provide it; without
# this name the `except` clause below would itself fail with a NameError.
from sqlalchemy.exc import ProgrammingError

# Maintenance engine bound to the default database: the target database is
# dropped and recreated from a connection that is not attached to it.
engine = create_engine(URI + DEFAULT_DATABASE, echo=True)

# AUTOCOMMIT: PostgreSQL refuses CREATE/DROP DATABASE inside a transaction block.
with engine.connect().execution_options(isolation_level="AUTOCOMMIT") as conn:
    # Disconnect any other sessions still attached to the target database so
    # the DROP below is not blocked. The name is passed as a bound parameter.
    terminate_sql = text("""
        SELECT pg_terminate_backend(pid)
        FROM pg_stat_activity
        WHERE datname = :database_name;
    """)
    try:
        conn.execute(terminate_sql, {"database_name": NOBEL_DATABASE})
    except ProgrammingError as e:
        # Best effort only: carry on and let DROP DATABASE report real problems.
        print(f"Could not terminate connections (this is often normal): {e}")
    # Identifiers cannot be bound as parameters; NOBEL_DATABASE is a constant
    # defined in this notebook, not user input.
    conn.execute(text(f"DROP DATABASE IF EXISTS {NOBEL_DATABASE};"))
    conn.execute(text(f"CREATE DATABASE {NOBEL_DATABASE};"))

# Release the pooled connection to the default database; the name `engine` is
# rebound to the new database afterwards.
engine.dispose()

2025-10-03 13:13:12,240 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-10-03 13:13:12,242 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-10-03 13:13:12,244 INFO sqlalchemy.engine.Engine select current_schema()
2025-10-03 13:13:12,246 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-10-03 13:13:12,248 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-10-03 13:13:12,249 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-10-03 13:13:12,252 INFO sqlalchemy.engine.Engine BEGIN (implicit; DBAPI should not BEGIN due to autocommit mode)
2025-10-03 13:13:12,253 INFO sqlalchemy.engine.Engine 
        SELECT pg_terminate_backend(pid)
        FROM pg_stat_activity
        WHERE datname = 'nobel';
    
2025-10-03 13:13:12,254 INFO sqlalchemy.engine.Engine [generated in 0.00190s] {}
2025-10-03 13:13:12,263 INFO sqlalchemy.engine.Engine DROP DATABASE IF EXISTS nobel;
2025-10-03 13:13:12,264 INFO sqlalchemy.engine.Engine [generated in 0.00150s] {}
2025-10-03 13:13:12,292

In [4]:
# Point `engine` at the freshly created Nobel database (SQL echo stays on).
engine = create_engine(f"{URI}{NOBEL_DATABASE}", echo=True)

In [5]:
# Declarative base shared by every mapped class in this notebook; their tables
# are collected on Base.metadata.
Base = declarative_base()

class DocumentTable(Base):
    """Abstract template for tables that store one JSON document per row.

    Subclasses only need to set ``__tablename__``; they inherit an integer
    primary key and a JSONB ``document`` column.
    """
    # Abstract: SQLAlchemy maps no table for this class itself.
    __abstract__: bool = True
    # Auto-incrementing integer primary key (`column` is `mapped_column`).
    id: Mapped[int] = column(primary_key=True, autoincrement=True)
    # The document payload, kept in a PostgreSQL JSONB column.
    document: Mapped[dict] = column(JSONB)


@final
class Premiado(DocumentTable):
    """Laureate documents, stored in the "Premiados" table."""
    __tablename__ = "Premiados"


@final
class Premio(DocumentTable):
    """Prize documents, stored in the "Premios" table."""
    __tablename__ = "Premios"


@final
class Pais(DocumentTable):
    """Country documents, stored in the "Paises" table."""
    __tablename__ = "Paises"

# Create every table registered on Base.metadata in the database behind `engine`.
Base.metadata.create_all(engine)

2025-10-03 13:13:12,399 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-10-03 13:13:12,400 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-10-03 13:13:12,401 INFO sqlalchemy.engine.Engine select current_schema()
2025-10-03 13:13:12,402 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-10-03 13:13:12,403 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-10-03 13:13:12,404 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-10-03 13:13:12,406 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-10-03 13:13:12,415 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname

In [6]:
# Accumulates one ORM object per JSON item across all three datasets.
entries = []

# (file stem under datasets/, top-level JSON key, mapped class receiving the items)
dataset_sources = (
    ("Laureate", "laureates", Premiado),
    ("Country", "countries", Pais),
    ("prize", "prizes", Premio),
)

for filename, key, Table in dataset_sources:
    with open(f'datasets/{filename}.json', 'r', encoding='utf-8') as file:
        items = json.load(file)[key]
        # Wrap each raw item as the `document` of a new row object.
        entries.extend([Table(document=item) for item in items])

# Session factory bound to the Nobel database engine.
# NOTE(review): this rebinds the `Session` name imported from sqlalchemy.orm.
Session = sessionmaker(bind=engine)

# Persist everything in a single transaction.
with Session() as session:
    session.add_all(entries)
    session.commit()

2025-10-03 13:13:12,606 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-10-03 13:13:12,611 INFO sqlalchemy.engine.Engine INSERT INTO "Paises" (document) SELECT p0::JSONB FROM (VALUES (%(document__0)s::JSONB, 0), (%(document__1)s::JSONB, 1), (%(document__2)s::JSONB, 2), (%(document__3)s::JSONB, 3), (%(document__4)s::JSONB, 4), (%(document__5)s::JSONB, 5), (%(document__6 ... 4105 characters truncated ... , 136)) AS imp_sen(p0, sen_counter) ORDER BY sen_counter RETURNING "Paises".id, "Paises".id AS id__1
2025-10-03 13:13:12,612 INFO sqlalchemy.engine.Engine [generated in 0.00085s (insertmanyvalues) 1/1 (ordered)] {'document__0': '{"name": "Algeria", "code": "DZ"}', 'document__1': '{"name": "Argentina", "code": "AR"}', 'document__2': '{"name": "Australia", "code": "AU"}', 'document__3': '{"name": "Austria", "code": "AT"}', 'document__4': '{"name": "Austria-Hungary"}', 'document__5': '{"name": "Austrian Empire"}', 'document__6': '{"name": "Azerbaijan", "code": "AZ"}', 'document__7': '{"

In [8]:
# Build one parameter row per CSV line: the laureate's surname, a LIKE pattern
# derived from the initials, and the specialty to attach.
update_params = []

# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly.
with open('datasets/Specialties.csv', 'r', encoding='utf-8', newline='') as file:
    specialty_data = list(csv.DictReader(file))

for row in specialty_data:
    # Assumes the 'Laureate' column looks like "A.B. Surname" — TODO confirm
    # against the dataset. Everything before the last '.' is treated as initials.
    name_parts = row['Laureate'].split('.')
    # Strip each piece so "A. B. Surname" and "A.B. Surname" yield the same pattern.
    initials = [part.strip() for part in name_parts[:-1]]
    surname = name_parts[-1].strip()
    # Each initial becomes a prefix match on one space-separated given name,
    # e.g. ["A", "B"] -> "A% B%". A name without any '.' yields '%', which
    # matches every first name for that surname.
    like_pattern = '% '.join(initials) + '%'
    update_params.append({
        "p_surname": surname,
        "p_like_pattern": like_pattern,
        "p_specialty": row['Specialty']
    })

# Documents of every laureate row that received a specialty.
updated_documents = []

# Nothing to match when the CSV produced no rows; skip the statement entirely
# instead of failing on update_params[0].
if update_params:
    with Session() as session:
        # Inline VALUES table holding the CSV-derived rows, one string column
        # per parameter key.
        value_columns = [Column(name, String) for name in update_params[0].keys()]
        # Values.data() expects a sequence of tuples in column order. Passing
        # the dicts directly would unpack their *keys* into every row, so the
        # statement would compare against the literal text 'p_surname', etc.
        vals = values(
            *value_columns,
            name="update_data"
        ).data([
            tuple(params[col.name] for col in value_columns)
            for params in update_params
        ])

        # UPDATE ... FROM (VALUES ...): merge {"specialty": ...} into the
        # document of each laureate whose surname equals the CSV surname and
        # whose first name matches the initials pattern.
        # NOTE(review): if several CSV rows match the same laureate, PostgreSQL
        # applies only one of them and does not say which.
        stmt = (
            update(Premiado)
            .where(
                Premiado.document['surname'].as_string() == vals.c.p_surname,
                Premiado.document['firstname'].as_string().like(vals.c.p_like_pattern)
            )
            .values(
                document=Premiado.document.concat(
                    func.jsonb_build_object('specialty', vals.c.p_specialty)
                )
            )
            .returning(Premiado.document)
        )
        result_proxy = session.execute(stmt)
        # Consume the RETURNING rows before committing.
        updated_documents += [row.document for row in result_proxy]
        session.commit()

# Report the outcome of the specialty update: a summary line followed by each
# updated document, or a notice when nothing matched.
if not updated_documents:
    print("\nNo records were updated.")
else:
    print(f"\n✅ Successfully updated and fetched {len(updated_documents)} records.")
    print("--- Updated Documents ---")
    separator = "-" * 20
    for doc in updated_documents:
        # Indented JSON keeps nested documents readable.
        print(json.dumps(doc, indent=4))
        print(separator)

2025-10-03 13:13:44,473 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-10-03 13:13:44,502 INFO sqlalchemy.engine.Engine UPDATE "Premiados" SET document=("Premiados".document || jsonb_build_object(%(jsonb_build_object_1)s, update_data.p_specialty)) FROM (VALUES (%(param_1)s, %(param_2)s, %(param_3)s), (%(param_4)s, %(param_5)s, %(param_6)s), (%(param_7)s, %(param_8)s, %(param_9)s), (%(param_10)s, %(param_11)s, %(param_12)s), (%(param_13)s, %(param_14)s, %(param_15)s), (%(param_16)s, %(param_17)s, %(param_18)s), (%(param_19)s, %(param_20)s, %(param_21)s), (%(param_22)s, %(param_23)s, %(param_24)s), (%(param_25)s, %(param_26)s, %(param_27)s), (%(param_28)s, %(param_29)s, %(param_30)s), (%(param_31)s, %(param_32)s, %(param_33)s), (%(param_34)s, %(param_35)s, %(param_36)s), (%(param_37)s, %(param_38)s, %(param_39)s), (%(param_40)s, %(param_41)s, %(param_42)s), (%(param_43)s, %(param_44)s, %(param_45)s), (%(param_46)s, %(param_47)s, %(param_48)s), (%(param_49)s, %(param_50)s, %(param_51

## 