In [1]:
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pyspark.sql.functions as fn
from pyspark.sql import SparkSession
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
spark = (SparkSession.builder
         .config("spark.jars","""/home/jovyan/jars/aws-java-sdk-core-1.11.534.jar,
                                 /home/jovyan/jars/aws-java-sdk-dynamodb-1.11.534.jar,
                                 /home/jovyan/jars/aws-java-sdk-s3-1.11.534.jar,
                                 /home/jovyan/jars/hadoop-aws-3.2.2.jar""")
         .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
         .config("spark.hadoop.fs.s3a.access.key", "aulafia")
         .config("spark.hadoop.fs.s3a.secret.key", "aulafia@123")
         .config("spark.hadoop.fs.s3a.path.style.access", True)
         .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
         .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
         .getOrCreate()
        )

In [3]:
df_estado = spark.read.format('parquet').load('s3a://bronze/ibge/estado')
print(df_estado.dtypes)
print(df_estado.count())
df_estado.show(12, False)

[('id', 'string'), ('nome', 'string'), ('regiao', 'struct<nome:string,sigla:string,id:string>'), ('sigla', 'string')]
27
+---+-------------------+-----------------+-----+
|id |nome               |regiao           |sigla|
+---+-------------------+-----------------+-----+
|11 |Rondônia           |{Norte, N, 1}    |RO   |
|12 |Acre               |{Norte, N, 1}    |AC   |
|13 |Amazonas           |{Norte, N, 1}    |AM   |
|14 |Roraima            |{Norte, N, 1}    |RR   |
|15 |Pará               |{Norte, N, 1}    |PA   |
|16 |Amapá              |{Norte, N, 1}    |AP   |
|17 |Tocantins          |{Norte, N, 1}    |TO   |
|21 |Maranhão           |{Nordeste, NE, 2}|MA   |
|22 |Piauí              |{Nordeste, NE, 2}|PI   |
|23 |Ceará              |{Nordeste, NE, 2}|CE   |
|24 |Rio Grande do Norte|{Nordeste, NE, 2}|RN   |
|25 |Paraíba            |{Nordeste, NE, 2}|PB   |
+---+-------------------+-----------------+-----+
only showing top 12 rows



In [4]:
df_municipio = spark.read.format('parquet').load('s3a://bronze/ibge/municipio')
print(df_municipio.dtypes)
print(df_municipio.count())
df_municipio.show(12, False)

[('id', 'bigint'), ('nome', 'string'), ('microrregiao', 'struct<id:bigint,nome:string,mesorregiao:struct<id:bigint,nome:string,UF:struct<id:int,sigla:string,nome:string,regiao:struct<id:int,sigla:string,nome:string>>>>'), ('regiao_imediata', 'struct<id:int,nome:string,regiao_intermediaria:struct<id:int,nome:string,UF:struct<id:int,sigla:string,nome:string,regiao:struct<id:int,sigla:string,nome:string>>>>')]
5570
+-------+---------------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------+
|id     |nome                 |microrregiao                                                                            |regiao_imediata                                                              |
+-------+---------------------+----------------------------------------------------------------------------------------+------------------------------------------------------------------

In [5]:
df_educacao = spark.read.format('parquet').load('s3a://bronze/inep/educacao_superior/indicadores_fluxo_educacao_superior')
print(df_municipio.dtypes)
print(df_municipio.count())
df_municipio.show(5, False)

[('id', 'bigint'), ('nome', 'string'), ('microrregiao', 'struct<id:bigint,nome:string,mesorregiao:struct<id:bigint,nome:string,UF:struct<id:int,sigla:string,nome:string,regiao:struct<id:int,sigla:string,nome:string>>>>'), ('regiao_imediata', 'struct<id:int,nome:string,regiao_intermediaria:struct<id:int,nome:string,UF:struct<id:int,sigla:string,nome:string,regiao:struct<id:int,sigla:string,nome:string>>>>')]
5570
+-------+---------------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------+
|id     |nome                 |microrregiao                                                                            |regiao_imediata                                                              |
+-------+---------------------+----------------------------------------------------------------------------------------+------------------------------------------------------------------

<b>Camada Bronze - bronze/ibge/estado</b>

In [12]:
df_estado = spark.read.format('parquet').load('s3a://bronze/ibge/estado')
df_estado.show(5, False)

+---+--------+-------------+-----+
|id |nome    |regiao       |sigla|
+---+--------+-------------+-----+
|11 |Rondônia|{Norte, N, 1}|RO   |
|12 |Acre    |{Norte, N, 1}|AC   |
|13 |Amazonas|{Norte, N, 1}|AM   |
|14 |Roraima |{Norte, N, 1}|RR   |
|15 |Pará    |{Norte, N, 1}|PA   |
+---+--------+-------------+-----+
only showing top 5 rows



<b>Camada Bronze - bronze/ibge/municipio</b>

In [13]:
df_municipio = spark.read.format('parquet').load('s3a://bronze/ibge/municipio')
df_municipio.show(5, False)

+-------+---------------------+----------------------------------------------------------------------------------------+---------------------------------------------------------------------------+
|id     |nome                 |microrregiao                                                                            |regiao_imediata                                                            |
+-------+---------------------+----------------------------------------------------------------------------------------+---------------------------------------------------------------------------+
|1100015|Alta Floresta D'Oeste|{11006, Cacoal, {1102, Leste Rondoniense, {11, RO, Rondônia, {1, N, Norte}}}}           |{110005, Cacoal, {1102, Ji-Paraná, {11, RO, Rondônia, {1, N, Norte}}}}     |
|1100023|Ariquemes            |{11003, Ariquemes, {1102, Leste Rondoniense, {11, RO, Rondônia, {1, N, Norte}}}}        |{110002, Ariquemes, {1101, Porto Velho, {11, RO, Rondônia, {1, N, Norte}}}}|
|1100031|Cabixi

<b>Camada Bronze - bronze/inep/educacao_superior/indicadores_fluxo_educacao_superior</b>

In [15]:
df_educacao = spark.read.format('parquet')\
 .load('s3a://bronze/inep/educacao_superior/indicadores_fluxo_educacao_superior')
df_educacao.show(5, False)

+------+-----------------------------------+---------------------------+------------------------+--------+--------+---------+-----+------------+-----------------+--------------------+--------------+--------------+------------------+---------------------------------+---------------+-----------------+-----------------------+---------------------+-----------------------+----------------------------+--------------+--------------+-------------+--------------+-----------+------------------+------------------+------------------+------------------+------------------+
|CO_IES|NO_IES                             |TP_CATEGORIA_ADMINISTRATIVA|TP_ORGANIZACAO_ACADEMICA|CO_CURSO|NO_CURSO|CO_REGIAO|CO_UF|CO_MUNICIPIO|TP_GRAU_ACADEMICO|TP_MODALIDADE_ENSINO|CO_CINE_ROTULO|NO_CINE_ROTULO|CO_CINE_AREA_GERAL|NO_CINE_AREA_GERAL               |NU_ANO_INGRESSO|NU_ANO_REFERENCIA|NU_PRAZO_INTEGRALIZACAO|NU_ANO_INTEGRALIZACAO|NU_PRAZO_ACOMPANHAMENTO|NU_ANO_MAXIMO_ACOMPANHAMENTO|QT_INGRESSANTE|QT_PERMANENCIA|QT_CO