In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [None]:

# Create a Spark session that can read from and write to Google Cloud Storage (gs://).

# Service-account key file handed to the GCS connector. GOOGLE_APPLICATION_CREDENTIALS
# takes precedence when set (and non-empty), so the notebook can run on another machine;
# otherwise the original local path is used.
GCP_KEYFILE = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "/mnt/d/EngDados/gcp/gcp-estudos-engdados-20f2cdfffed8.json"
)

spark = (
    SparkSession
    .builder
    .appName("Desafio-EDC-Extracao")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # Local jar providing the gs:// filesystem implementation.
    .config("spark.jars", "./jars/gcs-connector-hadoop3-latest.jar")
    # Hadoop filesystem options carry the "spark.hadoop." prefix: that is the
    # documented way to have Spark copy them into the Hadoop configuration.
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    .config("spark.hadoop.fs.gs.auth.service.account.enable", "true")
    .config("spark.hadoop.fs.gs.auth.service.account.json.keyfile", GCP_KEYFILE)
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

## Leitura dos dados de CNAE - Atividade Econômica

In [None]:
# Schema for the CNAE (economic activity) file: two nullable string columns,
# in the order they appear in the headerless CSV.
schema_cnae = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("cod_atividade", "desc_atividade")
])

In [None]:
# Read the CNAE CSV from GCS: ';'-separated, no header row, ISO-8859-1 encoded,
# with column names and types taken from schema_cnae.
df_cnae = spark.read.csv(
    "gs://desafio-final/F.K03200$Z.D10710.CNAE.csv",
    schema=schema_cnae,
    encoding="ISO-8859-1",
    header=False,
    sep=";",
)

In [None]:
# Preview: bring at most 5 rows to the driver as a pandas DataFrame for display.
df_cnae.limit(5).toPandas()

In [None]:
# Write the CNAE data as Parquet to GCS, overwriting anything already at this path.
df_cnae.write.mode("overwrite").parquet("gs://bootcamp-edc/raw/cnae.parquet")

## Leitura dos dados de Município

In [None]:
# Drop the notebook-level reference to the CNAE DataFrame; it is not used again below.
del df_cnae

In [None]:
# Schema for the municipality file: two nullable string columns,
# in the order they appear in the headerless CSV.
schema_municipio = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("cod_municipio", "desc_municipio")
])

In [None]:
# Read the municipality CSV from GCS: ';'-separated, no header row, ISO-8859-1 encoded,
# with column names and types taken from schema_municipio.
df_municipio = spark.read.csv(
    "gs://desafio-final/F.K03200$Z.D10710.MUNIC.csv",
    schema=schema_municipio,
    encoding="ISO-8859-1",
    header=False,
    sep=";",
)

In [None]:
# Preview: bring at most 5 rows to the driver as a pandas DataFrame for display.
df_municipio.limit(5).toPandas()

In [None]:
# Write the municipality data as Parquet to GCS, overwriting anything already at this path.
df_municipio.write.mode("overwrite").parquet("gs://bootcamp-edc/raw/municipio.parquet")

In [None]:
# Drop the notebook-level reference to the municipality DataFrame; it is not used again below.
del df_municipio

## Leitura dos dados de estabelecimentos

In [None]:
# Schema for the establishments file: every column is a nullable string, listed
# in the order the fields appear in the headerless CSV.
# NOTE(review): "num_fax_2" has no "_2" counterpart in "ddd_fax" and may have been
# meant as "num_fax"; kept as-is because renaming would change the written column.
schema_estabelecimento = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "cnpj_basico",
        "cnpj_ordem",
        "cnpj_dig_verificador",
        "idc_matriz_filial",
        "nome_fantasia",
        "cod_situacao_cadastral",
        "data_situacao_cadastral",
        "motivo_situacao_cadastral",
        "nome_cidade_exterior",
        "cod_pais",
        "data_inicio_atividade",
        "cnae_fiscal_principal",
        "cnae_fiscal_secundaria",
        "tipo_logradouro",
        "nome_logradouro",
        "numero_logradouro",
        "complemento_logradouro",
        "nome_bairro",
        "cod_cep",
        "sigla_uf",
        "cod_municipio",
        "ddd_telefone_1",
        "num_telefone_1",
        "ddd_telefone_2",
        "num_telefone_2",
        "ddd_fax",
        "num_fax_2",
        "email",
        "situacao_especial",
        "data_situacao_especial",
    )
])

In [None]:
# Sampling parameters for the establishments extract. The seed is fixed so that
# re-running the notebook selects the same rows (given the same input file and
# partitioning) instead of a different random subset on every run.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42

# Read the establishments CSV from GCS (';'-separated, no header row, ISO-8859-1)
# and keep roughly SAMPLE_FRACTION of its rows; only this sample is used downstream.
df_estabelecimento = spark.read.csv(
    "gs://desafio-final/estabelecimentos/K3241.K03200Y0.D10710.ESTABELE.csv",
    sep=";",
    header=False,
    encoding="ISO-8859-1",
    schema=schema_estabelecimento
).sample(fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)

In [None]:
# Preview: bring at most 5 rows to the driver as a pandas DataFrame for display.
df_estabelecimento.limit(5).toPandas()

In [None]:
# Write the sampled establishments data as Parquet to GCS, overwriting anything already at this path.
df_estabelecimento.write.mode("overwrite").parquet("gs://bootcamp-edc/raw/estabelecimento.parquet")

In [None]:
# Shut down the Spark session at the end of the notebook.
spark.stop()