### Configurando caminhos


In [1]:
import os

# URL of the data source
URL = 'https://aplicacoes.mds.gov.br/sagi/servicos/misocial/?fq=anomes_s:2024*&fl=codigo_ibge%2Canomes_s%2Cqtd_familias_beneficiarias_bolsa_familia_s%2Cvalor_repassado_bolsa_familia_s%2Cpbf_vlr_medio_benef_f&fq=valor_repassado_bolsa_familia_s%3A*&q=*%3A*&rows=100000&sort=anomes_s%20desc%2C%20codigo_ibge%20asc&wt=csv'

# Local directory where the file will be saved, and the default file name
workdir = './work/'
default_file = 'bolsa-familia2024.csv'
file_path = os.path.join(workdir, default_file)

# Every entry currently in the work directory, as a bare name and as a path
# relative to the notebook (no filtering by extension is applied)
workdir_files = os.listdir(workdir)
full_paths = [os.path.join(workdir, entry) for entry in workdir_files]

# Path to the JDBC driver
jdbc_driver_path = "/opt/trabalhos/etl-mdd/postgresql-42.7.1.jar"

# Filled in by a later cell with one Spark DataFrame per path in full_paths
dataframes_list = []


### Importando JSON com informações de UF e Município

In [2]:
import json

uf_code_path = './utils/ibge-codes/uf-code.json'
municipios_code_path = './utils/ibge-codes/municipios-code.json'
uf_dict, municipios_dict = {}, {}

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, so non-ASCII text in the JSON files is read the same way on
# every operating system.
with open(uf_code_path, 'r', encoding='utf-8') as file:
    uf_dict = json.load(file)

with open(municipios_code_path, 'r', encoding='utf-8') as file:
    municipios_dict = json.load(file)


### Instalando pyspark


In [3]:
# Install PySpark (unpinned, into the user site-packages) for this kernel.
%pip install --user pyspark

Note: you may need to restart the kernel to use updated packages.


### Criando sessão spark

In [4]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session. The JDBC driver jar is added to the
# driver classpath and executor memory is set to 4g.
spark_session = (
    SparkSession.builder
    .appName('spark')
    .config("spark.driver.extraClassPath", jdbc_driver_path)
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
24/06/18 19:30:40 WARN Utils: Your hostname, RRNWRESID05 resolves to a loopback address: 127.0.1.1; using 172.23.195.127 instead (on interface eth0)
24/06/18 19:30:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/18 19:30:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Lendo os arquivos CSV e montando os DataFrames do Spark

In [5]:
# Reader options shared by every CSV load in this cell
csv_reader_options = dict(header="true", delimiter=",", encoding="ISO-8859-1", inferSchema=True)

# DataFrame for the default file only
df = spark_session.read.options(**csv_reader_options).csv(file_path)

# One DataFrame per entry found in the work directory
dataframes_list.extend(
    spark_session.read.options(**csv_reader_options).csv(csv_path)
    for csv_path in full_paths
)

# (name found in some files, standard name used by the rest of the notebook)
column_changes = [
    ("ibge", "codigo_ibge"),
    ("anomes", "anomes_s"),
    ("qtd_familias_beneficiarias_bolsa_familia", "qtd_familias_beneficiarias_bolsa_familia_s"),
    ("valor_repassado_bolsa_familia", "valor_repassado_bolsa_familia_s")
]

# Bring column names in line with the standard ones
for position in range(len(dataframes_list)):
    renamed_df = dataframes_list[position]
    for old_name, standard_name in column_changes:
        if old_name in renamed_df.columns:
            renamed_df = renamed_df.withColumnRenamed(old_name, standard_name)

    dataframes_list[position] = renamed_df

for yearly_df in dataframes_list:
    yearly_df.show()

# df.show()

+-----------+--------+------------------------------------------+-------------------------------+
|codigo_ibge|anomes_s|qtd_familias_beneficiarias_bolsa_familia_s|valor_repassado_bolsa_familia_s|
+-----------+--------+------------------------------------------+-------------------------------+
|     110001|  202101|                                      1515|                       243899.0|
|     110002|  202101|                                      3928|                       594415.0|
|     110003|  202101|                                       190|                        31455.0|
|     110004|  202101|                                      3115|                       522167.0|
|     110005|  202101|                                       710|                       106890.0|
|     110006|  202101|                                       372|                        59947.0|
|     110007|  202101|                                       379|                        59521.0|
|     110008|  20210

In [6]:
# Retirado de: https://medium.com/@salibi/como-validar-o-c%C3%B3digo-de-munic%C3%ADpio-do-ibge-90dc545cc533#:~:text=O%20C%C3%B3digo%20de%20Munic%C3%ADpio%20do%20IBGE%20%C3%A9%20um%20identificador%20%C3%BAnico,o%20%C3%BAltimo%20d%C3%ADgito%2C%20um%20verificador.

def last_digit_ibge(cod6: str):
    """Return the 7-digit IBGE municipality code for a 6-digit code.

    Codes listed in ``city_exceptions`` are returned from that table without
    any computation. For every other code the check digit is computed from
    the six digits and appended to ``cod6``.

    Args:
        cod6: The municipality code without its check digit, as a string of
            six digits.

    Returns:
        ``cod6`` followed by its check digit, or the mapped value when
        ``cod6`` is one of the listed exceptions.

    Raises:
        IndexError: If ``cod6`` has fewer than six characters.
        ValueError: If one of the first six characters is not a digit.
    """
    # Exceptions to the check-digit computation. Each key is the first six
    # digits of its value; the entry for "2202251" is keyed by '220225' so
    # that code '290630' goes through the normal computation instead of being
    # mapped to a code with a different prefix.
    city_exceptions = {
        '220191': "2201919",
        '220225': "2202251",
        '220198': "2201988",
        '261153': "2611533",
        '311783': "3117836",
        '315213': "3152131",
        '430587': "4305871",
        '520393': "5203939",
        '520396': "5203962",
    }

    if cod6 in city_exceptions:
        return city_exceptions[cod6]

    total = 0
    for position in range(6):
        value = int(cod6[position])
        if position % 2 == 1:
            # 2nd, 4th and 6th digits are doubled; a two-digit product
            # contributes the sum of its digits.
            doubled = value * 2
            value = doubled % 10 + doubled // 10
        total += value

    digit = (10 - total % 10) % 10

    return cod6 + str(digit)



### Criando coluna de média para dados antes de 2023

In [10]:
# DataFrames that lack the average-benefit column get it derived as
# amount transferred divided by number of beneficiary families.
for position, yearly_df in enumerate(dataframes_list):
    if 'pbf_vlr_medio_benef_f' in yearly_df.columns:
        continue

    average_benefit = (
        yearly_df.valor_repassado_bolsa_familia_s
        / yearly_df.qtd_familias_beneficiarias_bolsa_familia_s
    )
    print(average_benefit)

    dataframes_list[position] = yearly_df.withColumn('pbf_vlr_medio_benef_f', average_benefit)


Column<'(valor_repassado_bolsa_familia_s / qtd_familias_beneficiarias_bolsa_familia_s)'>
Column<'(valor_repassado_bolsa_familia_s / qtd_familias_beneficiarias_bolsa_familia_s)'>
Column<'(valor_repassado_bolsa_familia_s / qtd_familias_beneficiarias_bolsa_familia_s)'>
Column<'(valor_repassado_bolsa_familia_s / qtd_familias_beneficiarias_bolsa_familia_s)'>
Column<'(valor_repassado_bolsa_familia_s / qtd_familias_beneficiarias_bolsa_familia_s)'>


### Criando coluna para UF

In [None]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

def cria_coluna_uf(codigo_ibge):
    """Look up the UF entry for an IBGE code.

    The first two characters of ``str(codigo_ibge)`` are used as the key into
    ``uf_dict``; ``None`` is returned when that key is not present.
    """
    uf_prefix = str(codigo_ibge)[:2]
    return uf_dict.get(uf_prefix)

# Wrap the UF lookup as a Spark UDF that produces a string column.
cria_coluna_uf_udf = udf(cria_coluna_uf, StringType())

# Add the "uf" column to df, computed from "codigo_ibge".
df = df.withColumn("uf", cria_coluna_uf_udf(col("codigo_ibge")))


### Criando coluna para Município

In [None]:
def cria_coluna_municipio(codigo_ibge):
    """Look up the municipality entry for an IBGE code.

    ``str(codigo_ibge)`` is extended to its 7-digit form by
    ``last_digit_ibge`` and the result is used as the key into
    ``municipios_dict``; ``None`` is returned when that key is not present.
    """
    full_code = last_digit_ibge(str(codigo_ibge))
    return municipios_dict.get(full_code)

# Wrap the municipality lookup as a Spark UDF that produces a string column.
cria_coluna_municipio_udf = udf(cria_coluna_municipio, StringType())

# Add the "municipio" column to df, computed from "codigo_ibge".
df = df.withColumn("municipio", cria_coluna_municipio_udf(col("codigo_ibge")))

### Criando coluna para Ano

In [None]:
def cria_coluna_ano(anomes_s):
    """Extract the year from a year-month value.

    Args:
        anomes_s: Year-month value; its string form is expected to start with
            the four characters of the year (e.g. ``202101``).

    Returns:
        The first four characters of ``str(anomes_s)``, or ``None`` when
        ``anomes_s`` is ``None`` so that a missing value stays missing instead
        of becoming the text ``'None'``.
    """
    if anomes_s is None:
        return None
    return str(anomes_s)[:4]

# Wrap the year extraction as a Spark UDF that produces a string column.
cria_coluna_ano_udf = udf(cria_coluna_ano, StringType())

# Add the "ano" column to df, computed from "anomes_s".
df = df.withColumn("ano", cria_coluna_ano_udf(col("anomes_s")))

### Adicionando colunas UF, Município e Ano

In [None]:
# Add the "uf", "municipio" and "ano" columns to every DataFrame in the list.
for position, yearly_df in enumerate(dataframes_list):
    dataframes_list[position] = (
        yearly_df
        .withColumn("uf", cria_coluna_uf_udf(col("codigo_ibge")))
        .withColumn("municipio", cria_coluna_municipio_udf(col("codigo_ibge")))
        .withColumn("ano", cria_coluna_ano_udf(col("anomes_s")))
    )

for yearly_df in dataframes_list:
    yearly_df.show()

In [None]:
# Stack every DataFrame in the list into a single one. Columns are matched by
# name rather than by position, because the DataFrames come from different
# files and some of their columns were renamed or added by earlier cells.
dataframe_unificado = dataframes_list[0]

for dataframe in dataframes_list[1:]:
    dataframe_unificado = dataframe_unificado.unionByName(dataframe)

dataframe_unificado.show()

# Reduce to a single partition before writing.
dataframe_unificado = dataframe_unificado.coalesce(1)

output_path = "output/unified_data.csv"
dataframe_unificado.write.mode("overwrite").csv(output_path, header=True)

# The Spark session is deliberately left running: the cells below still build
# and write Spark DataFrames, and the last cell of the notebook stops it.

### Conversão do dataframe para o modelo estrela

In [None]:
import pyspark.sql.functions as F
from typing import Dict, List
from pyspark.sql.dataframe import DataFrame

def get_columns_list_from_dimension(dimension: Dict[str, List[str]]):
    """Flatten the column lists of all dimensions into a single list.

    Args:
        dimension: Either a mapping of dimension name to its list of columns,
            or an iterable of column lists (such as the ``.values()`` view of
            that mapping).

    Returns:
        A list with every column name, in the order the lists are iterated.
    """
    # A dict iterates over its keys, so take its values explicitly; any other
    # iterable is assumed to already yield the column lists.
    column_groups = dimension.values() if isinstance(dimension, dict) else dimension
    return [column for columns in column_groups for column in columns]

def get_table_name_and_records(dataframe: DataFrame, dimension_table_name_and_columns: Dict[str, List[str]]) -> List[tuple[str, DataFrame]]:
    """Build one dimension DataFrame per (table name, columns) pair.

    Args:
        dataframe: DataFrame the dimension columns are selected from.
        dimension_table_name_and_columns: Iterated as ``(table_name, columns)``
            pairs; the caller in this notebook passes the ``.items()`` view of
            the dimension mapping.

    Returns:
        A list of ``(table_name, records)`` tuples, where ``records`` holds
        the distinct combinations of the dimension's columns plus a surrogate
        key column named ``sk_`` followed by the table name with ``dim_``
        removed.
    """
    def build_dimension(table_name, columns):
        surrogate_key = "sk_" + table_name.replace('dim_', '')
        distinct_rows = dataframe.select(*columns).distinct()
        # Ids are unique and increasing, but not consecutive.
        keyed_rows = distinct_rows.withColumn(surrogate_key, F.monotonically_increasing_id())
        return (table_name, keyed_rows)

    return [
        build_dimension(table_name, columns)
        for table_name, columns in dimension_table_name_and_columns
    ]


def transform_spark_dataframe_into_star_schema(
    original_dataframe: DataFrame,
    fact_columns: List[str]  = ["col1", "col2"],
    fact_table_name = "tabela_fato",
    mapping_dimension_columns: Dict[str, List[str]] = {'dim1':["col3", "col4"], "dim2":["col5", "col6"]},
):
    """Split a flat DataFrame into dimension tables and one fact table.

    Args:
        original_dataframe: Flat DataFrame containing the fact columns and
            every dimension column.
        fact_columns: Columns kept as-is in the fact table.
        fact_table_name: Name paired with the fact DataFrame in the result.
        mapping_dimension_columns: Dimension table name mapped to the columns
            that belong to that dimension.

    Returns:
        A list of ``(table_name, DataFrame)`` tuples: every dimension first,
        then the fact table. In the fact table the dimension columns are
        dropped after each dimension has been left-joined onto it.
    """
    dimension_columns = get_columns_list_from_dimension(mapping_dimension_columns.values())

    # Keep only the columns that take part in the star schema.
    fact_df = original_dataframe.select(*(fact_columns + dimension_columns))

    dimensions = get_table_name_and_records(fact_df, mapping_dimension_columns.items())

    # Replace the dimension columns with the respective surrogate key.
    # NOTE(review): the join compares with `==`; rows holding a null in a
    # dimension column presumably end up with a null surrogate key - confirm.
    for table_name, records in dimensions:
        join_condition = [
            fact_df[column] == records[column]
            for column in mapping_dimension_columns[table_name]
        ]
        fact_df = fact_df.join(F.broadcast(records), on=join_condition, how="left")

    # The descriptive columns now live in the dimension tables only.
    fact_df = fact_df.drop(*dimension_columns)

    return dimensions + [(fact_table_name, fact_df)]

In [None]:
# Build the star schema from the unified DataFrame produced above, using the
# columns this notebook actually creates.
# NOTE(review): the split into dim_municipio / dim_tempo is a proposal -
# confirm it matches the intended model.
star_schema = transform_spark_dataframe_into_star_schema(
    dataframe_unificado,
    fact_columns=[
        "qtd_familias_beneficiarias_bolsa_familia_s",
        "valor_repassado_bolsa_familia_s",
        "pbf_vlr_medio_benef_f",
    ],
    fact_table_name="tabela_fato",
    mapping_dimension_columns={
        'dim_municipio': ["codigo_ibge", "uf", "municipio"],
        'dim_tempo': ["anomes_s", "ano"],
    },
)

### Configurando conexão com o banco

In [None]:
# Connection settings for the target PostgreSQL database. Each value can be
# overridden through an environment variable so that credentials do not have
# to live in the notebook; the defaults keep the values used so far.
# NOTE(review): consider dropping the default password once the environment
# variables are in place.
hostname_or_ip = os.environ.get("STAR_DB_HOST", "34.172.175.190")
port = os.environ.get("STAR_DB_PORT", "443")
db = os.environ.get("STAR_DB_NAME", "metabase")
user = os.environ.get("STAR_DB_USER", "star")
password = os.environ.get("STAR_DB_PASSWORD", "star")

db_url = f"jdbc:postgresql://{hostname_or_ip}:{port}/{db}"

# Properties handed to the JDBC writer
properties = {
    "user": user,
    "password": password,
    "driver": "org.postgresql.Driver",
}


### Transferindo modelo estrela para o banco

In [None]:
for table_name, dataframe in star_schema:
    # NOTE(review): only dim_municipio is written; confirm whether the other
    # tables of the star schema should be persisted as well.
    if table_name != "dim_municipio":
        continue

    # Announce only the table that is really written, naming the target database.
    print(f"Writing {table_name} to {db} DB")
    dataframe.write.jdbc(url=db_url, table=table_name, mode="overwrite", properties=properties)

### Desalocando sessão do spark


In [None]:
# Stop the Spark session now that every cell that depends on it has run.
spark_session.stop()

# Cleaning up files 
# Delete the directory and all its contents
# import shutil

# shutil.rmtree(workdir+'extracted/')