In [3]:
import json
import os

import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType

In [4]:
# Build (or reuse) the SparkSession used by this notebook, configured to talk
# to the MinIO endpoint through the Hadoop S3A connector.
#
# - The jar list is assembled with ",".join so the value handed to Spark is a
#   clean comma-separated string with no embedded newlines or indentation.
# - Credentials are read from the MINIO_ACCESS_KEY / MINIO_SECRET_KEY
#   environment variables. The fallbacks are the values this notebook
#   previously hard-coded; they are meant for the local sandbox only and
#   should be overridden via the environment anywhere else.
spark = (SparkSession.builder
         .config("spark.jars", ",".join([
             "/home/jovyan/jars/aws-java-sdk-core-1.11.534.jar",
             "/home/jovyan/jars/aws-java-sdk-dynamodb-1.11.534.jar",
             "/home/jovyan/jars/aws-java-sdk-s3-1.11.534.jar",
             "/home/jovyan/jars/hadoop-aws-3.2.2.jar",
         ]))
         .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
         .config("spark.hadoop.fs.s3a.access.key", os.environ.get("MINIO_ACCESS_KEY", "aulafia"))
         .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("MINIO_SECRET_KEY", "aulafia@123"))
         # Address buckets as http://host/bucket/key instead of bucket.host.
         .config("spark.hadoop.fs.s3a.path.style.access", True)
         .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
         # Take the access/secret key from the fs.s3a.* settings above.
         .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
         .getOrCreate()
        )

In [5]:
# Explicit schema for the records loaded into the DataFrame: every leaf field
# is a nullable string, and 'regiao' is a nested struct with its own
# nome / sigla / id fields.
schema = (
    StructType()
    .add('id', StringType(), True)
    .add('nome', StringType(), True)
    .add(
        'regiao',
        StructType()
        .add('nome', StringType(), True)
        .add('sigla', StringType(), True)
        .add('id', StringType(), True),
        True,
    )
    .add('sigla', StringType(), True)
)

In [6]:
# Fetch the list of states from the IBGE localidades API and decode the JSON
# body into Python objects.
url = "https://servicodados.ibge.gov.br/api/v1/localidades/estados"

# A timeout keeps the kernel from hanging forever if the service is unreachable.
response = requests.get(url, timeout=30)

# Fail loudly on 4xx/5xx instead of trying to parse an error body as data.
response.raise_for_status()

# Let requests decode the JSON bytes itself rather than going through the
# guessed text encoding of response.text.
j_resp = response.json()

In [7]:
# Turn the decoded API payload into a Spark DataFrame using the explicit
# schema defined earlier in the notebook (no schema inference).
df_estado = spark.createDataFrame(
    data=j_resp,
    schema=schema,
)

In [9]:
# Preview the first 5 rows without truncating long cell values.
df_estado.show(n=5, truncate=False)

+---+--------+-------------+-----+
|id |nome    |regiao       |sigla|
+---+--------+-------------+-----+
|11 |Rondônia|{Norte, N, 1}|RO   |
|12 |Acre    |{Norte, N, 1}|AC   |
|13 |Amazonas|{Norte, N, 1}|AM   |
|14 |Roraima |{Norte, N, 1}|RR   |
|15 |Pará    |{Norte, N, 1}|PA   |
+---+--------+-------------+-----+
only showing top 5 rows



In [10]:
# Persist the DataFrame as Parquet under the 'bronze' bucket via s3a,
# replacing whatever is already stored at that path.
df_estado.write.mode('overwrite').format('parquet').save('s3a://bronze/ibge/estado')