In [47]:
import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text

# Instância de conexão com o banco de dados OLTP

In [48]:
# Connection settings for the OLTP source database.
# Every value can be overridden through an environment variable so that real
# credentials never need to be written into the notebook; when a variable is
# not set, the original local-development value is used.
usuario = os.environ.get("ENEM_DB_USER", "root")
senha = os.environ.get("ENEM_DB_PASSWORD", "root")
url = os.environ.get("ENEM_DB_HOST", "localhost")
porta = os.environ.get("ENEM_DB_PORT", "3306")
banco = os.environ.get("ENEM_OLTP_DB", "ENEM_OLTP")

# Create the SQLAlchemy engine for the MySQL database (PyMySQL driver).
engine = create_engine(f"mysql+pymysql://{usuario}:{senha}@{url}:{porta}/{banco}")

# Instância de conexão com o banco de dados DataMart

In [49]:
# Connection settings for the data mart (target) database.
# Every value can be overridden through an environment variable so that real
# credentials never need to be written into the notebook; when a variable is
# not set, the original local-development value is used.
usuario = os.environ.get("ENEM_DB_USER", "root")
senha = os.environ.get("ENEM_DB_PASSWORD", "root")
url = os.environ.get("ENEM_DB_HOST", "localhost")
porta = os.environ.get("ENEM_DB_PORT", "3306")
banco = os.environ.get("ENEM_DM_DB", "ENEM_DM")

# Create the SQLAlchemy engine for the MySQL database (PyMySQL driver).
engine_dm = create_engine(f"mysql+pymysql://{usuario}:{senha}@{url}:{porta}/{banco}")

In [51]:
# Build DIM_PARTICIPANTES: read PARTICIPANTES from the OLTP database, replace
# the columns that match a data mart dimension by that dimension's first
# column, and load the result into the data mart with PK/FK constraints.
query = "SELECT * FROM PARTICIPANTES"
df = pd.read_sql(query, con=engine)

colunas = df.columns.tolist()

# Sequential surrogate key 1..N, moved to the first column position.
df["SK_PARTICIPANTES"] = np.arange(1, len(df) + 1)
df = df[["SK_PARTICIPANTES"] + colunas]

dimensoes = [
    "DIM_FAIXA_ETARIA",
    "DIM_SEXO",
    "DIM_ESTADO_CIVIL",
    "DIM_COR_RACA",
    "DIM_NACIONALIDADE",
    "DIM_ST_CONCLUSAO",
    "DIM_ANO_CONCLUIU",
    "DIM_ESCOLA",
    "DIM_ENSINO",
    "DIM_TREINEIRO",
]

for dimensao in dimensoes:
    df_dim = pd.read_sql(f"SELECT * FROM {dimensao}", con=engine_dm)
    # Join on the dimension's second column, then drop every dimension column
    # except the first one (presumably its SK_ surrogate key -- confirm
    # against the DIM_ table definitions).
    df = df.merge(df_dim, on=df_dim.columns[1], how="left")
    df = df.drop(columns=df_dim.columns[1:])

# A left merge never removes rows but repeats them when a dimension holds a
# duplicated join key. That would duplicate SK_PARTICIPANTES and make
# ADD PRIMARY KEY fail only after the existing table had been dropped, so the
# check is done here, before the data mart is touched.
assert df["SK_PARTICIPANTES"].is_unique, (
    "SK_PARTICIPANTES duplicada: alguma dimensão possui chave de junção repetida"
)

colunas = df.columns.tolist()

# Every surrogate-key column except the table's own key becomes a foreign key.
colunas_sk = [
    coluna
    for coluna in colunas
    if coluna.startswith("SK_") and not coluna.startswith("SK_PARTICIPANTES")
]

# Turn NaN into None so missing values are written as SQL NULL.
df = df.replace(np.nan, None)

display(df)

# engine.begin() commits when the block finishes and rolls back on error, so
# the inserted rows no longer depend on a later DDL statement triggering an
# implicit commit. MySQL still commits implicitly on each DDL statement, so a
# failure part-way through cannot undo the DROP/ALTER statements already run.
with engine_dm.begin() as connection:
    connection.execute(text("DROP TABLE IF EXISTS DIM_PARTICIPANTES"))
    df.to_sql("DIM_PARTICIPANTES", con=connection, index=False)
    connection.execute(
        text(
            """
            ALTER TABLE DIM_PARTICIPANTES
            ADD PRIMARY KEY (SK_PARTICIPANTES)
            """
        )
    )

    # These two columns are converted to BIGINT before the foreign keys are
    # added (presumably unmatched rows made to_sql pick another type for them
    # -- confirm).
    for coluna in ("SK_SEXO", "SK_ENSINO"):
        connection.execute(
            text(
                f"""
                ALTER TABLE DIM_PARTICIPANTES
                MODIFY COLUMN {coluna} BIGINT
                """
            )
        )

    # coluna[3:] strips the "SK_" prefix: SK_SEXO references DIM_SEXO(SK_SEXO).
    for coluna in colunas_sk:
        connection.execute(
            text(
                f"""
                ALTER TABLE DIM_PARTICIPANTES
                ADD CONSTRAINT FK_DIM_PARTICIPANTES_{coluna}
                FOREIGN KEY ({coluna}) REFERENCES DIM_{coluna[3:]}({coluna}) ON DELETE CASCADE
                """
            )
        )

print(f"Dimensão PARTICIPANTES criada com sucesso! Total de registros: {len(df)}")

Dimensão PARTICIPANTES criada com sucesso! Total de registros: 1000000


In [52]:
# Build FAT_NOTAS: read NOTAS from the OLTP database, replace the columns that
# match a data mart dimension by that dimension's first column, keep only the
# key and score columns, and load the result with PK/FK constraints.
query = "SELECT * FROM NOTAS"
df = pd.read_sql(query, con=engine)

colunas = df.columns.tolist()

# Sequential surrogate key 1..N, moved to the first column position.
df["SK_NOTAS"] = np.arange(1, len(df) + 1)
df = df[["SK_NOTAS"] + colunas]

dimensoes = [
    "DIM_PARTICIPANTES",
    "DIM_PRESENCA_CN",
    "DIM_PRESENCA_CH",
    "DIM_PRESENCA_LC",
    "DIM_PRESENCA_MT",
    "DIM_LINGUA",
    "DIM_STATUS_REDACAO",
]

for dimensao in dimensoes:
    df_dim = pd.read_sql(f"SELECT * FROM {dimensao}", con=engine_dm)
    # Join on the dimension's second column, then drop every dimension column
    # except the first one (presumably its SK_ surrogate key -- confirm
    # against the DIM_ table definitions).
    df = df.merge(df_dim, on=df_dim.columns[1], how="left")
    df = df.drop(columns=df_dim.columns[1:])

# A left merge never removes rows but repeats them when a dimension holds a
# duplicated join key. That would duplicate SK_NOTAS and make ADD PRIMARY KEY
# fail only after the existing table had been dropped, so the check is done
# here, before the data mart is touched.
assert df["SK_NOTAS"].is_unique, (
    "SK_NOTAS duplicada: alguma dimensão possui chave de junção repetida"
)

colunas = df.columns.tolist()

# Every surrogate-key column except the table's own key becomes a foreign key.
colunas_sk = [
    coluna
    for coluna in colunas
    if coluna.startswith("SK_") and not coluna.startswith("SK_NOTAS")
]

# Columns prefixed NU_, excluding NU_INSCRICAO; they are stored as
# DECIMAL(5,1) below.
colunas_notas = [
    coluna
    for coluna in colunas
    if coluna.startswith("NU_") and not coluna.startswith("NU_INSCRICAO")
]

# Turn NaN into None so missing values are written as SQL NULL.
df = df.replace(np.nan, None)

# Final column layout: own key, foreign keys, then the NU_ columns.
df = df[["SK_NOTAS"] + colunas_sk + colunas_notas]

display(df)

# engine.begin() commits when the block finishes and rolls back on error, so
# the inserted rows no longer depend on a later DDL statement triggering an
# implicit commit. MySQL still commits implicitly on each DDL statement, so a
# failure part-way through cannot undo the DROP/ALTER statements already run.
with engine_dm.begin() as connection:
    connection.execute(text("DROP TABLE IF EXISTS FAT_NOTAS"))
    df.to_sql("FAT_NOTAS", con=connection, index=False)
    connection.execute(
        text(
            """
            ALTER TABLE FAT_NOTAS
            ADD PRIMARY KEY (SK_NOTAS)
            """
        )
    )

    # SK_STATUS_REDACAO is converted to BIGINT before the foreign keys are
    # added (presumably rows without a match made to_sql pick another type
    # -- confirm).
    connection.execute(
        text(
            """
            ALTER TABLE FAT_NOTAS
            MODIFY COLUMN SK_STATUS_REDACAO BIGINT
            """
        )
    )

    for coluna in colunas_notas:
        connection.execute(
            text(
                f"""
                ALTER TABLE FAT_NOTAS
                MODIFY COLUMN {coluna} DECIMAL(5,1)
                """
            )
        )

    # coluna[3:] strips the "SK_" prefix: SK_LINGUA references
    # DIM_LINGUA(SK_LINGUA).
    for coluna in colunas_sk:
        connection.execute(
            text(
                f"""
                ALTER TABLE FAT_NOTAS
                ADD CONSTRAINT FK_FAT_NOTAS_{coluna}
                FOREIGN KEY ({coluna}) REFERENCES DIM_{coluna[3:]}({coluna}) ON DELETE CASCADE
                """
            )
        )

print(f"Fato NOTAS criada com sucesso! Total de registros: {len(df)}")

Unnamed: 0,SK_NOTAS,SK_PARTICIPANTES,SK_PRESENCA_CN,SK_PRESENCA_CH,SK_PRESENCA_LC,SK_PRESENCA_MT,SK_LINGUA,SK_STATUS_REDACAO,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,NU_NOTA_REDACAO
0,1,246779,1,1,1,1,1,,,,,,
1,2,445914,1,1,1,1,1,,,,,,
2,3,924947,2,2,2,2,2,1.0,502.0,498.9,475.6,363.2,700.0
3,4,650884,2,2,2,2,1,1.0,459.0,508.5,507.2,466.7,880.0
4,5,569802,2,2,2,2,1,1.0,402.5,379.2,446.9,338.3,560.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,999996,547625,2,2,2,2,1,5.0,446.5,498.7,529.3,366.4,0.0
999996,999997,936224,2,2,2,2,1,1.0,441.6,463.9,437.5,425.1,520.0
999997,999998,187122,2,2,2,2,2,5.0,346.1,373.5,311.8,404.1,0.0
999998,999999,171697,2,2,2,2,2,1.0,530.5,548.7,549.7,560.2,840.0


Fato NOTAS criada com sucesso! Total de registros: 1000000


In [54]:
# Build FAT_QUESTIONARIO: read QUESTIONARIO from the OLTP database, replace
# the columns that match a data mart dimension by that dimension's first
# column, keep only the key columns, and load the result with PK/FK
# constraints.
query = "SELECT * FROM QUESTIONARIO"
df = pd.read_sql(query, con=engine)

colunas = df.columns.tolist()

# Sequential surrogate key 1..N, moved to the first column position.
df["SK_QUESTIONARIO"] = np.arange(1, len(df) + 1)
df = df[["SK_QUESTIONARIO"] + colunas]

# The participant dimension plus one dimension per question, 1 through 25.
dimensoes = ["DIM_PARTICIPANTES"] + [
    f"DIM_RESP_QUESTAO_{i}" for i in range(1, 26)
]

for dimensao in dimensoes:
    df_dim = pd.read_sql(f"SELECT * FROM {dimensao}", con=engine_dm)
    # Join on the dimension's second column, then drop every dimension column
    # except the first one (presumably its SK_ surrogate key -- confirm
    # against the DIM_ table definitions).
    df = df.merge(df_dim, on=df_dim.columns[1], how="left")
    df = df.drop(columns=df_dim.columns[1:])

# A left merge never removes rows but repeats them when a dimension holds a
# duplicated join key. That would duplicate SK_QUESTIONARIO and make
# ADD PRIMARY KEY fail only after the existing table had been dropped, so the
# check is done here, before the data mart is touched.
assert df["SK_QUESTIONARIO"].is_unique, (
    "SK_QUESTIONARIO duplicada: alguma dimensão possui chave de junção repetida"
)

colunas = df.columns.tolist()

# Every surrogate-key column except the table's own key becomes a foreign key.
colunas_sk = [
    coluna
    for coluna in colunas
    if coluna.startswith("SK_") and not coluna.startswith("SK_QUESTIONARIO")
]

# Turn NaN into None so missing values are written as SQL NULL.
df = df.replace(np.nan, None)

# Final column layout: own key followed by the foreign keys.
df = df[["SK_QUESTIONARIO"] + colunas_sk]

display(df)

# engine.begin() commits when the block finishes and rolls back on error, so
# the inserted rows no longer depend on a later DDL statement triggering an
# implicit commit. MySQL still commits implicitly on each DDL statement, so a
# failure part-way through cannot undo the DROP/ALTER statements already run.
with engine_dm.begin() as connection:
    connection.execute(text("DROP TABLE IF EXISTS FAT_QUESTIONARIO"))
    df.to_sql("FAT_QUESTIONARIO", con=connection, index=False)
    connection.execute(
        text(
            """
            ALTER TABLE FAT_QUESTIONARIO
            ADD PRIMARY KEY (SK_QUESTIONARIO)
            """
        )
    )

    # NOTE: a "MODIFY COLUMN SK_RESP_QUESTAO_5 BIGINT" statement used to sit
    # here but was already disabled (commented out), so it is not executed.

    # coluna[3:] strips the "SK_" prefix: SK_RESP_QUESTAO_1 references
    # DIM_RESP_QUESTAO_1(SK_RESP_QUESTAO_1).
    for coluna in colunas_sk:
        connection.execute(
            text(
                f"""
                ALTER TABLE FAT_QUESTIONARIO
                ADD CONSTRAINT FK_FAT_QUESTIONARIO_{coluna}
                FOREIGN KEY ({coluna}) REFERENCES DIM_{coluna[3:]}({coluna}) ON DELETE CASCADE
                """
            )
        )

print(f"Fato QUESTIONARIO criada com sucesso! Total de registros: {len(df)}")

Unnamed: 0,SK_QUESTIONARIO,SK_PARTICIPANTES,SK_RESP_QUESTAO_1,SK_RESP_QUESTAO_2,SK_RESP_QUESTAO_3,SK_RESP_QUESTAO_4,SK_RESP_QUESTAO_5,SK_RESP_QUESTAO_6,SK_RESP_QUESTAO_7,SK_RESP_QUESTAO_8,...,SK_RESP_QUESTAO_16,SK_RESP_QUESTAO_17,SK_RESP_QUESTAO_18,SK_RESP_QUESTAO_19,SK_RESP_QUESTAO_20,SK_RESP_QUESTAO_21,SK_RESP_QUESTAO_22,SK_RESP_QUESTAO_23,SK_RESP_QUESTAO_24,SK_RESP_QUESTAO_25
0,1,246779,1,6,5,4,5,6,3,3,...,3,3,2,2,1,2,2,1,1,2
1,2,445914,6,5,5,2,3,8,1,2,...,2,1,2,2,1,1,3,1,4,2
2,3,924947,8,5,3,6,5,3,1,2,...,2,1,1,2,1,1,1,1,1,2
3,4,650884,4,4,2,2,5,3,1,2,...,1,1,1,2,1,1,4,1,1,2
4,5,569802,2,2,1,1,4,2,1,2,...,1,1,1,2,1,1,2,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,999996,547625,5,3,2,3,4,5,1,2,...,2,1,1,2,2,2,5,1,1,2
999996,999997,936224,2,4,1,1,3,3,1,2,...,2,1,1,1,1,1,3,1,1,2
999997,999998,187122,3,3,2,3,4,2,1,2,...,1,1,1,2,1,1,2,1,1,1
999998,999999,171697,1,2,1,1,2,1,1,1,...,1,1,1,1,1,1,1,1,1,1


Fato QUESTIONARIO criada com sucesso! Total de registros: 1000000
