In [5]:
import pandas as pd
import duckdb
import os

import requests
import json
from datetime import datetime
import time
import pytz
import re

from typing import Optional
import uuid as uuid_pkg

from sqlmodel import SQLModel, Field, Session, create_engine, select
from typing import Optional

from unidecode import unidecode
import os

In [3]:
class ENARMResult(SQLModel, table=True):
    """Row model for the ``enarm_results`` table.

    The field names match the columns of the ``dataframe_full`` frame that is
    assembled further down in this notebook; ``insert_schools_data`` passes
    each row of that frame to this class as keyword arguments.
    """

    __tablename__ = "enarm_results"  # Explicit table name; per the original note it can be omitted to let the name follow the class name

    # Primary key; defaults to None when the caller does not supply one.
    id: Optional[int] = Field(default=None, primary_key=True)
    # State name and school name as read from the yearly CSV.
    estado: Optional[str]
    facultad: Optional[str]
    # Counts of applicants who sat the exam / were selected.
    sustentante: Optional[int]
    seleccionado: Optional[int]
    # NOTE(review): declared as text; the notebook outputs show this column
    # as double in some runs and varchar in others — confirm the intended type.
    promedio: Optional[str]
    # seleccionado / sustentante * 100, rounded to 2 decimals by the load query.
    acceptance_rate: Optional[float]
    # File label such as 'enarm_2024' and the year part of it ('2024').
    date_id: Optional[str]
    year: Optional[str]
    # acceptance_rate plus / minus one standard deviation of that year's rates.
    acceptance_rate_plus_1_std: Optional[float]
    acceptance_rate_minus_1_std: Optional[float]
    # (acceptance_rate - yearly mean) / yearly standard deviation.
    z_index: Optional[float]
    # Identifiers produced by clean_string from estado and facultad.
    estado_id: Optional[str]
    school_id: Optional[str]

In [9]:
# Earlier approach, kept for reference: discover the CSVs in a local folder.
# files = os.listdir('data')
# files = sorted([f for f in files if f.endswith('.csv')])[1:]
# files = sorted(files, reverse=True)

# Base URL under which the yearly result CSVs are fetched.
base_url = "https://videostostore.blob.core.windows.net/enarmapp"

# One file per exam year: enarm_2013.csv ... enarm_2024.csv, in ascending order.
base_list = [f"enarm_{exam_year}.csv" for exam_year in range(2013, 2025)]

In [10]:
def clean_string(s):
    """Turn a label into a lowercase, underscore-separated identifier.

    Steps, in order: lowercase the text, replace each space with ``_``, drop
    every character that is neither alphanumeric nor ``_``, then map the
    Spanish accented vowels and ``ñ`` to their plain ASCII letters. Other
    non-ASCII letters (for example ``ü``) are kept unchanged.
    """
    # Only lowercase keys are needed because the text is lowercased first.
    ascii_map = str.maketrans("áéíóúñ", "aeioun")
    lowered = s.lower().replace(" ", "_")
    kept = [ch for ch in lowered if ch == "_" or ch.isalnum()]
    return "".join(kept).translate(ascii_map)

# Dataset label per file: the file name without its extension, e.g. 'enarm_2013'.
listOfDates = [i.split(".")[0] for i in base_list]


def _load_enarm_year(csv_name, date_id):
    """Read one yearly CSV from ``base_url`` and return it with derived columns.

    Args:
        csv_name: File name under ``base_url``, e.g. 'enarm_2024.csv'.
        date_id: Label stored in the ``date_id`` column, e.g. 'enarm_2024';
            the part after the underscore is stored in the ``year`` column.

    Returns:
        A pandas DataFrame with the columns estado, facultad, sustentante,
        seleccionado, promedio, acceptance_rate, date_id, year,
        acceptance_rate_plus_1_std, acceptance_rate_minus_1_std and z_index.
        Only rows with "Sustentante" > 0 are kept; the mean and standard
        deviation used for the derived columns are computed over the
        acceptance rates of this single file.
    """
    exam_year = date_id.split('_')[1]
    # The final SELECT lists base.* plus the derived columns explicitly, so the
    # helper columns from `stats` never reach the result and nothing depends on
    # how DuckDB renames duplicated column names.
    # NULLIF keeps z_index NULL when the standard deviation is zero.
    return duckdb.sql(f"""
        WITH base AS (
            SELECT
                CASE WHEN "Entidad Federativa" = 'DISTRITO FEDERAL' THEN 'CIUDAD DE MÉXICO' ELSE "Entidad Federativa" END AS estado,
                "Facultad o Escuela de Medicina" AS facultad,
                "Sustentante" AS sustentante,
                "Seleccionado" AS seleccionado,
                "Promedio General de Conocimientos Médicos" AS promedio,
                ROUND(("Seleccionado"::INTEGER / "Sustentante"::INTEGER) * 100, 2) AS acceptance_rate,
                '{date_id}' AS date_id,
                '{exam_year}' AS year
            FROM read_csv('{base_url}/{csv_name}')
            WHERE "Sustentante" > 0
        ),
        stats AS (
            SELECT
                AVG(acceptance_rate) AS mean_rate,
                STDDEV(acceptance_rate) AS stddev_rate
            FROM base
        )
        SELECT
            base.*,
            ROUND(acceptance_rate + stddev_rate, 2) AS acceptance_rate_plus_1_std,
            ROUND(acceptance_rate - stddev_rate, 2) AS acceptance_rate_minus_1_std,
            ROUND((acceptance_rate - mean_rate) / NULLIF(stddev_rate, 0), 2) AS z_index
        FROM base CROSS JOIN stats
    """).df()


# One frame per yearly file, in the order of base_list.
procc_list = [
    _load_enarm_year(read_file, date_id)
    for read_file, date_id in zip(base_list, listOfDates)
]

dataframe_full = pd.concat(procc_list, ignore_index=True)

# Rows without a state are labelled 'OTRA' so every row gets a usable estado_id.
dataframe_full['estado'] = dataframe_full['estado'].fillna('OTRA')
dataframe_full['estado_id'] = dataframe_full['estado'].apply(clean_string)
dataframe_full['school_id'] = dataframe_full['facultad'].apply(clean_string)

# Missing counts become 0 so both columns can be stored as plain integers.
for count_col in ('sustentante', 'seleccionado'):
    dataframe_full[count_col] = dataframe_full[count_col].fillna(0).astype(int)

In [None]:
# Show every row of dataframe_full for this single school_id
# (the id contains a double underscore).
duckdb.sql(""" SELECT * FROM dataframe_full WHERE school_id = 'inst_tecnologico_est_sup_mty__campus_jalisco'""")

In [15]:
# Show every row of dataframe_full whose school_id is 'univ_aut_de_bc_u_mexicali'.
duckdb.sql(""" SELECT * FROM dataframe_full WHERE school_id = 'univ_aut_de_bc_u_mexicali'""")

┌─────────────────┬─────────────────────────────────┬─────────────┬──────────────┬──────────┬─────────────────┬────────────┬─────────┬────────────────────────────┬─────────────────────────────┬─────────┬─────────────────┬───────────────────────────┐
│     estado      │            facultad             │ sustentante │ seleccionado │ promedio │ acceptance_rate │  date_id   │  year   │ acceptance_rate_plus_1_std │ acceptance_rate_minus_1_std │ z_index │    estado_id    │         school_id         │
│     varchar     │             varchar             │    int64    │    int64     │  double  │     double      │  varchar   │ varchar │           double           │           double            │ double  │     varchar     │          varchar          │
├─────────────────┼─────────────────────────────────┼─────────────┼──────────────┼──────────┼─────────────────┼────────────┼─────────┼────────────────────────────┼─────────────────────────────┼─────────┼─────────────────┼───────────────────────────┤


In [16]:
# Ten rows of the enarm_2024 file with the highest z_index, highest first.
duckdb.sql(""" SELECT * FROM dataframe_full WHERE date_id = 'enarm_2024' ORDER BY z_index DESC LIMIT 10""")

┌──────────────────┬───────────────────────────────────────────────────┬─────────────┬──────────────┬──────────┬─────────────────┬────────────┬─────────┬────────────────────────────┬─────────────────────────────┬─────────┬──────────────────┬──────────────────────────────────────────────┐
│      estado      │                     facultad                      │ sustentante │ seleccionado │ promedio │ acceptance_rate │  date_id   │  year   │ acceptance_rate_plus_1_std │ acceptance_rate_minus_1_std │ z_index │    estado_id     │                  school_id                   │
│     varchar      │                      varchar                      │    int64    │    int64     │  double  │     double      │  varchar   │ varchar │           double           │           double            │ double  │     varchar      │                   varchar                    │
├──────────────────┼───────────────────────────────────────────────────┼─────────────┼──────────────┼──────────┼─────────────────┼───

In [11]:
def insert_schools_data(df: pd.DataFrame, class_i=None, duckdb_table: str = "dim_schools",duckdb_path:str="/Users/gerardomartinez/Documents/production/enarm/frontend/src/data/enarm.duckdb"):
    """Append every row of ``df`` to a table in a DuckDB database file.

    The table is created with the frame's columns when it does not exist yet.
    Rows are always appended, so calling this twice with the same frame stores
    the rows twice.

    Args:
        df: Rows to store.
        class_i: Optional row class (for example a SQLModel class). When given,
            every row is passed to it as keyword arguments before the database
            is opened, so any exception it raises aborts the call before
            anything is written. When None, this step is skipped.
        duckdb_table: Target table name. It is interpolated into the SQL text
            as-is, so pass trusted identifiers only.
        duckdb_path: Path of the DuckDB database file. The default is a
            machine-specific absolute path; pass the path explicitly.
    """
    # Build one object per row purely as a sanity check; the objects themselves
    # are not written, the frame is.
    if class_i is not None:
        for row in df.to_dict('records'):
            class_i(**row)

    # The connection is closed when the with-block exits.
    with duckdb.connect(duckdb_path) as con:
        # `df` in the SQL text refers to the local DataFrame of that name.
        # LIMIT 0 copies only the column structure when the table is missing.
        con.execute(f"CREATE TABLE IF NOT EXISTS {duckdb_table} AS SELECT * FROM df LIMIT 0")
        con.execute(f"INSERT INTO {duckdb_table} SELECT * FROM df")

    print("")
    print(f"Inserted {len(df)} rows into the database.")

In [13]:
# Store dataframe_full in the enarm_results table of the DuckDB file under frontend/src/data.
# NOTE(review): insert_schools_data appends rows, so re-running this cell
# without dropping the table first stores every row again.
insert_schools_data(df=dataframe_full, class_i=ENARMResult, duckdb_table="enarm_results", duckdb_path="C:/Users/gerym/Documents/Projects/enarm_repo_mx/frontend/src/data/enarm.duckdb")


Inserted 1482 shows into the database.


In [12]:
def delete_table(duckdb_table: str, duckdb_path: str):
    """Drop ``duckdb_table`` from the DuckDB file at ``duckdb_path`` if it exists.

    A confirmation line is printed afterwards, whether or not the table was
    present before the call.
    """
    connection = duckdb.connect(duckdb_path)
    try:
        connection.execute(f"DROP TABLE IF EXISTS {duckdb_table}")
    finally:
        connection.close()
    print(f"Table '{duckdb_table}' has been deleted from the database at '{duckdb_path}'.")

# Usage
# NOTE(review): this cell sits below the insert cell but carries a lower
# execution count (12 vs 13); run top to bottom it drops the table that the
# insert cell has just filled.
delete_table(
    duckdb_table="enarm_results",
    duckdb_path="C:/Users/gerym/Documents/Projects/enarm_repo_mx/frontend/src/data/enarm.duckdb",
)

Table 'enarm_results' has been deleted from the database at 'C:/Users/gerym/Documents/Projects/enarm_repo_mx/frontend/src/data/enarm.duckdb'.


In [None]:
# Insert into DuckDB
# NOTE(review): unexecuted scratch copy of the body of insert_schools_data.
# `duckdb_table` and `df` are not assigned at notebook level in any visible
# cell, and `duckdb_path` is only assigned in a later cell, so this cell
# presumably raises NameError on a fresh kernel — confirm and delete it.
with duckdb.connect(duckdb_path) as con:
    # Create the table if it is missing (it is not replaced when it already exists)
    con.execute(f"CREATE TABLE IF NOT EXISTS {duckdb_table} AS SELECT * FROM df LIMIT 0")  # Create with structure only
    con.execute(f"INSERT INTO {duckdb_table} SELECT * FROM df")
    con.close()

In [17]:
# Year-by-year counts, average score and acceptance rate for the rows whose
# school_id is 'univ_aut_de_guadalajara'.
duckdb.sql("""SELECT sustentante, seleccionado,
           promedio, acceptance_rate,year,school_id,estado_id
            FROM dataframe_full WHERE school_id = 'univ_aut_de_guadalajara'""")

┌─────────────┬──────────────┬──────────┬─────────────────┬─────────┬─────────────────────────┬───────────┐
│ sustentante │ seleccionado │ promedio │ acceptance_rate │  year   │        school_id        │ estado_id │
│    int64    │    int64     │ varchar  │     double      │ varchar │         varchar         │  varchar  │
├─────────────┼──────────────┼──────────┼─────────────────┼─────────┼─────────────────────────┼───────────┤
│        1866 │          565 │ 53.7991  │           30.28 │ 2024    │ univ_aut_de_guadalajara │ jalisco   │
│        1574 │          635 │ 56.3136  │           40.34 │ 2023    │ univ_aut_de_guadalajara │ jalisco   │
│        1638 │          661 │ 55.36    │           40.35 │ 2022    │ univ_aut_de_guadalajara │ jalisco   │
│        1917 │          700 │ 56.07    │           36.52 │ 2021    │ univ_aut_de_guadalajara │ jalisco   │
│        1915 │          633 │ 58.51    │           33.05 │ 2020    │ univ_aut_de_guadalajara │ jalisco   │
│        1803 │          349

In [13]:
# Selected by volume: the ten rows of the enarm_2024 file with the highest
# `seleccionado` count, highest first.
duckdb.sql("""FROM dataframe_full WHERE date_id = 'enarm_2024' ORDER BY seleccionado DESC LIMIT 10""")

┌──────────────────┬─────────────────────────────────────────────────┬─────────────┬──────────────┬──────────┬─────────────────┬────────────┬─────────┬────────────────────────────┬─────────────────────────────┬─────────┬──────────────────┬─────────────────────────────────────────────┐
│      estado      │                    facultad                     │ sustentante │ seleccionado │ promedio │ acceptance_rate │  date_id   │  year   │ acceptance_rate_plus_1_std │ acceptance_rate_minus_1_std │ z_index │    estado_id     │                  school_id                  │
│     varchar      │                     varchar                     │    int64    │    int64     │ varchar  │     double      │  varchar   │ varchar │           double           │           double            │ double  │     varchar      │                   varchar                   │
├──────────────────┼─────────────────────────────────────────────────┼─────────────┼──────────────┼──────────┼─────────────────┼────────────┼─

In [22]:
# File names of the yearly min/max score CSVs (e.g. '2012.csv').
# os.listdir returns entries in arbitrary order and includes every file in the
# folder, so keep only the CSVs and sort them for a deterministic order.
min_max_yrs = sorted(
    entry
    for entry in os.listdir('C:/Users/gerym/Documents/Projects/enarm_repo_mx/data/puntajes_min_max')
    if entry.endswith('.csv')
)

In [23]:
several_years = []

for i in min_max_yrs:
    # File names look like '2012.csv'; store only the stem so the `year`
    # column holds '2012' instead of the whole file name, matching the
    # plain-year strings stored in the `year` column of dataframe_full.
    year_label = os.path.splitext(i)[0]

    # NOTE(review): the file names come from min_max_yrs, which an earlier cell
    # lists from an absolute path, while the CSV is read relative to the
    # working directory; both must point at the same folder.
    a = duckdb.sql(f"""
    with base as (SELECT * EXCLUDE("Puntaje Mínimo", "Puntaje Máximo"), '{year_label}' as year,
    "Puntaje Mínimo" as puntaje_min, "Puntaje Máximo" as puntaje_max
    FROM read_csv('./data/puntajes_min_max/{i}')
    WHERE categoria = 'mexicana')

    SELECT *, (puntaje_max - puntaje_min) as diff from base
    """).df()
    print(i, end=' ')  # progress marker: one file name per processed CSV
    several_years.append(a)

# Stack the yearly frames under a fresh 0..n-1 index.
puntajes = pd.concat(several_years, ignore_index=True)

# ASCII, lowercase, underscore-separated identifier for each specialty name.
puntajes['especialidad_id'] = puntajes['ESPECIALIDAD'].apply(
    lambda x: unidecode(x).lower().replace(' ', '_')
)


2012.csv 2013.csv 2014.csv 2015.csv 2016.csv 2017.csv 2018.csv 2019.csv 2020.csv 2021.csv 2022.csv 2023.csv 2024.csv 

In [24]:
# Per-specialty score bands. For each row of `puntajes`, the inner query
# attaches the mean and sample standard deviation of puntaje_min and
# puntaje_max computed over all rows sharing the same especialidad_id
# (rounded to 2 decimals); the outer query derives the bands from them.
# Note: despite their names, the std_*_plus_1 / std_*_minus_1 columns hold
# mean + std and mean - std, not a standard deviation.
min_dataframe = duckdb.sql("""
WITH base AS (
    SELECT 
        * EXCLUDE(diff),
        ROUND(AVG(puntaje_max) OVER (PARTITION BY especialidad_id), 2) AS avg_puntaje_max,
        ROUND(AVG(puntaje_min) OVER (PARTITION BY especialidad_id), 2) AS avg_puntaje_min,
        ROUND(STDDEV_SAMP(puntaje_max) OVER (PARTITION BY especialidad_id), 2) AS std_puntaje_max,
        ROUND(STDDEV_SAMP(puntaje_min) OVER (PARTITION BY especialidad_id), 2) AS std_puntaje_min,
        ROUND(diff, 2) AS diff
    FROM puntajes
)
SELECT 
    especialidad_id, 
    year, 
    puntaje_min,
    avg_puntaje_min,
    avg_puntaje_min + std_puntaje_min AS std_puntaje_min_plus_1,
    avg_puntaje_min - std_puntaje_min AS std_puntaje_min_minus_1,
    puntaje_max,
    avg_puntaje_max,
    avg_puntaje_max + std_puntaje_max AS std_puntaje_max_plus_1,
    avg_puntaje_max - std_puntaje_max AS std_puntaje_max_minus_1,
    diff
FROM base
""").df()


In [26]:
duckdb_path = "C:/Users/gerym/Documents/Projects/enarm_repo_mx/frontend/src/data/enarm.duckdb"

# Append min_dataframe to the enarm_min_max table, creating the table from the
# frame's columns first when it does not exist yet (LIMIT 0 copies structure only).
con = duckdb.connect(duckdb_path)
try:
    con.execute("CREATE TABLE IF NOT EXISTS enarm_min_max AS SELECT * FROM min_dataframe LIMIT 0")
    con.execute("INSERT INTO enarm_min_max SELECT * FROM min_dataframe")
finally:
    con.close()

In [25]:
"""Delete a table from the DuckDB database."""
# Drop the enarm_min_max table from the DuckDB file at duckdb_path, if present.
# NOTE(review): this cell sits below the insert cell but carries a lower
# execution count (25 vs 26); run top to bottom it drops the table that the
# insert cell has just filled.
con = duckdb.connect(duckdb_path)
try:
    con.execute("DROP TABLE IF EXISTS enarm_min_max")
finally:
    con.close()
