In [13]:
import json
import os

import requests
from IPython.core.display import HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
# Inject a CSS rule so <pre> blocks in cell output keep their whitespace
# (white-space: pre), which stops wide `show()` tables from wrapping.
no_wrap_css = "<style>pre { white-space: pre !important; }</style>"
display(HTML(no_wrap_css))

In [2]:
# Build (or reuse) the Spark session used by the rest of the notebook, with the
# s3a connector pointed at the S3-compatible endpoint.

# Jars required by the s3a filesystem; joined with plain commas so no stray
# newlines or indentation end up inside the `spark.jars` value.
JARS_DIR = "/home/jovyan/jars"
S3A_JAR_NAMES = [
    "aws-java-sdk-core-1.11.534.jar",
    "aws-java-sdk-dynamodb-1.11.534.jar",
    "aws-java-sdk-s3-1.11.534.jar",
    "hadoop-aws-3.2.2.jar",
]
spark_jars = ",".join(f"{JARS_DIR}/{jar_name}" for jar_name in S3A_JAR_NAMES)

# Connection settings come from the environment so real credentials never have
# to be committed with the notebook. The fallbacks are the values this notebook
# previously hard-coded; override them via S3_ENDPOINT / S3_ACCESS_KEY /
# S3_SECRET_KEY anywhere other than a throwaway local setup.
s3_endpoint = os.environ.get("S3_ENDPOINT", "http://minio:9000")
s3_access_key = os.environ.get("S3_ACCESS_KEY", "aulafia")
s3_secret_key = os.environ.get("S3_SECRET_KEY", "aulafia@123")

spark = (
    SparkSession.builder
    .config("spark.jars", spark_jars)
    .config("spark.hadoop.fs.s3a.endpoint", s3_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
    # Passed as the literal string "true" rather than a Python bool so the value
    # reaching Hadoop does not depend on how PySpark stringifies booleans.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .getOrCreate()
)

In [3]:
# Explicit schema applied to the JSON records when building the DataFrame.
# Both top-level hierarchies ("microrregiao" and "regiao-imediata") end in the
# same UF -> regiao structs, so those shared pieces are declared once and the
# full schema is composed from them, innermost struct first.

_regiao_struct = StructType([
    StructField("id", IntegerType(), True),
    StructField("sigla", StringType(), True),
    StructField("nome", StringType(), True),
])

_uf_struct = StructType([
    StructField("id", IntegerType(), True),
    StructField("sigla", StringType(), True),
    StructField("nome", StringType(), True),
    StructField("regiao", _regiao_struct, True),
])

_mesorregiao_struct = StructType([
    StructField("id", IntegerType(), True),
    StructField("nome", StringType(), True),
    StructField("UF", _uf_struct, True),
])

_microrregiao_struct = StructType([
    StructField("id", IntegerType(), True),
    StructField("nome", StringType(), True),
    StructField("mesorregiao", _mesorregiao_struct, True),
])

_regiao_intermediaria_struct = StructType([
    StructField("id", IntegerType(), True),
    StructField("nome", StringType(), True),
    StructField("UF", _uf_struct, True),
])

_regiao_imediata_struct = StructType([
    StructField("id", IntegerType(), True),
    StructField("nome", StringType(), True),
    StructField("regiao-intermediaria", _regiao_intermediaria_struct, True),
])

# Field names (including the hyphenated ones) mirror the keys of the JSON payload.
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("nome", StringType(), True),
    StructField("microrregiao", _microrregiao_struct, True),
    StructField("regiao-imediata", _regiao_imediata_struct, True),
])

In [4]:
# Fetch the municipality records from the IBGE localities API.
url = "https://servicodados.ibge.gov.br/api/v1/localidades/municipios"

# A timeout keeps an unresponsive server from hanging the kernel forever.
response = requests.get(url, timeout=60)

# Fail here (requests.HTTPError) on a 4xx/5xx answer instead of trying to parse
# an error page as JSON further down.
response.raise_for_status()

# Parsed JSON payload; it is passed as the row data to `spark.createDataFrame`.
j_resp = response.json()

In [5]:
# Turn the parsed JSON records into a DataFrame, enforcing the explicit schema
# instead of letting Spark infer one.
df_municipio = spark.createDataFrame(j_resp, schema=schema)

In [11]:
# Preview the first 15 rows without truncating cell contents, then list the
# (column, type) pairs.
df_municipio.show(n=15, truncate=False)
print(df_municipio.dtypes)

+-------+------------------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------+
|id     |nome                    |microrregiao                                                                            |regiao-imediata                                                              |
+-------+------------------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------+
|1100015|Alta Floresta D'Oeste   |{11006, Cacoal, {1102, Leste Rondoniense, {11, RO, Rondônia, {1, N, Norte}}}}           |{110005, Cacoal, {1102, Ji-Paraná, {11, RO, Rondônia, {1, N, Norte}}}}       |
|1100023|Ariquemes               |{11003, Ariquemes, {1102, Leste Rondoniense, {11, RO, Rondônia, {1, N, Norte}}}}        |{110002, Ariquemes, {1101, Porto Velho, {11, RO, Rondônia, {1, N, Nor

In [8]:
# Target type for the "regiao_imediata" column. It has the same layout as the
# struct declared in `schema`, except that the nested field is spelled
# "regiao_intermediaria" (underscore instead of hyphen).
# NOTE(review): the cast is presumably what renames that nested field, since a
# struct-to-struct cast takes its field names from the target type — confirm.
regiao_imediata_type = (
    "struct<id:int,nome:string,"
    "regiao_intermediaria:struct<id:int,nome:string,"
    "UF:struct<id:int,sigla:string,nome:string,"
    "regiao:struct<id:int,sigla:string,nome:string>>>>"
)

# Rename the hyphenated top-level column, then cast it to the type above.
df_municipio2 = (
    df_municipio
    .withColumnRenamed("regiao-imediata", "regiao_imediata")
    .withColumn("regiao_imediata", col("regiao_imediata").cast(regiao_imediata_type))
)

# Select only the desired fields.
df_municipio3 = df_municipio2.select("id", "nome", "microrregiao", "regiao_imediata")


In [9]:
# Preview the first 15 rows of the renamed frame without truncation, then list
# the (column, type) pairs.
df_municipio3.show(n=15, truncate=False)
print(df_municipio3.dtypes)

+-------+------------------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------+
|id     |nome                    |microrregiao                                                                            |regiao_imediata                                                              |
+-------+------------------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------+
|1100015|Alta Floresta D'Oeste   |{11006, Cacoal, {1102, Leste Rondoniense, {11, RO, Rondônia, {1, N, Norte}}}}           |{110005, Cacoal, {1102, Ji-Paraná, {11, RO, Rondônia, {1, N, Norte}}}}       |
|1100023|Ariquemes               |{11003, Ariquemes, {1102, Leste Rondoniense, {11, RO, Rondônia, {1, N, Norte}}}}        |{110002, Ariquemes, {1101, Porto Velho, {11, RO, Rondônia, {1, N, Nor

In [18]:
# Disabled (commented out): would write `df_municipio` as Parquet to
# s3a://bronze/ibge/municipio, replacing whatever is already at that path.
# NOTE(review): this targets `df_municipio`, not the renamed `df_municipio3` —
# confirm which frame is meant to be persisted before re-enabling.
# (df_municipio
#  .write
#  .format('parquet')
#  .mode('overwrite')
#  .save('s3a://bronze/ibge/municipio')
#  )