In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [2]:

# Create a Spark session with access to Google Cloud Storage (GCS).
#
# The service-account key file is read from the GOOGLE_APPLICATION_CREDENTIALS
# environment variable when it is set and non-empty; otherwise the original
# local path is used, so existing setups keep working unchanged.
gcp_keyfile = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "/mnt/d/EngDados/gcp/gcp-estudos-engdados-20f2cdfffed8.json"
)

spark = (
    SparkSession
    .builder
    .appName("Desafio-EDC-Extracao")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.jars", "./jars/gcs-connector-hadoop3-latest.jar")
    # Hadoop filesystem properties use the "spark.hadoop." prefix so that Spark
    # forwards them to the Hadoop configuration.
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    .config("spark.hadoop.fs.gs.auth.service.account.enable", "true")
    .config("spark.hadoop.fs.gs.auth.service.account.json.keyfile", gcp_keyfile)
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

21/09/12 22:49:44 WARN Utils: Your hostname, eduney-pc resolves to a loopback address: 127.0.1.1; using 172.26.68.160 instead (on interface eth0)
21/09/12 22:49:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/09/12 22:49:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/09/12 22:49:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Leitura dos dados de CNAE - Atividade Econômica

In [3]:
# Explicit schema for the CNAE file: two nullable string columns
# (activity code and activity description).
schema_cnae = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("cod_atividade", "desc_atividade")
])

In [4]:
# Read the CNAE CSV from GCS: semicolon-separated, no header row,
# ISO-8859-1 encoded, parsed with the explicit schema_cnae.
df_cnae = spark.read.csv(
    path="gs://desafio-final/F.K03200$Z.D10710.CNAE.csv",
    schema=schema_cnae,
    sep=";",
    encoding="ISO-8859-1",
    header=False,
)

In [5]:
# Bring at most five rows to the driver and display them as a pandas DataFrame.
df_cnae.limit(5).toPandas()



Unnamed: 0,cod_atividade,desc_atividade
0,111301,Cultivo de arroz
1,111302,Cultivo de milho
2,111303,Cultivo de trigo
3,111399,Cultivo de outros cereais não especificados an...
4,112101,Cultivo de algodão herbáceo


In [6]:
# Write the CNAE data as Parquet, replacing any existing output at this path.
df_cnae.write.parquet("gs://bootcamp-edc/raw/cnae.parquet", mode="overwrite")



## Leitura dos dados de Município

In [7]:
# Remove the notebook's reference to the CNAE DataFrame; it is not used below.
del df_cnae

In [8]:
# Explicit schema for the municipality file: two nullable string columns
# (municipality code and municipality description).
schema_municipio = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("cod_municipio", "desc_municipio")
])

In [9]:
# Read the municipality CSV from GCS: semicolon-separated, no header row,
# ISO-8859-1 encoded, parsed with the explicit schema_municipio.
df_municipio = spark.read.csv(
    path="gs://desafio-final/F.K03200$Z.D10710.MUNIC.csv",
    schema=schema_municipio,
    sep=";",
    encoding="ISO-8859-1",
    header=False,
)

In [10]:
# Bring at most five rows to the driver and display them as a pandas DataFrame.
df_municipio.limit(5).toPandas()

Unnamed: 0,cod_municipio,desc_municipio
0,1,GUAJARA-MIRIM
1,2,ALTO ALEGRE DOS PARECIS
2,3,PORTO VELHO
3,4,BURITIS
4,5,JI-PARANA


In [11]:
# Write the municipality data as Parquet, replacing any existing output at this path.
df_municipio.write.parquet("gs://bootcamp-edc/raw/municipio.parquet", mode="overwrite")



In [12]:
# Remove the notebook's reference to the municipality DataFrame; it is not used below.
del df_municipio

## Leitura dos dados de estabelecimentos

In [13]:
# Explicit schema for the establishments file: thirty columns, all read as
# nullable strings. Column order must match the field order in the CSV because
# the file is read without a header row.
schema_estabelecimento = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "cnpj_basico",
        "cnpj_ordem",
        "cnpj_dig_verificador",
        "idc_matriz_filial",
        "nome_fantasia",
        "cod_situacao_cadastral",
        "data_situacao_cadastral",
        "motivo_situacao_cadastral",
        "nome_cidade_exterior",
        "cod_pais",
        "data_inicio_atividade",
        "cnae_fiscal_principal",
        "cnae_fiscal_secundaria",
        "tipo_logradouro",
        "nome_logradouro",
        "numero_logradouro",
        "complemento_logradouro",
        "nome_bairro",
        "cod_cep",
        "sigla_uf",
        "cod_municipio",
        "ddd_telefone_1",
        "num_telefone_1",
        "ddd_telefone_2",
        "num_telefone_2",
        "ddd_fax",
        "num_fax_2",
        "email",
        "situacao_especial",
        "data_situacao_especial",
    )
])

In [14]:
# Read the establishments CSV from GCS (semicolon-separated, no header row,
# ISO-8859-1 encoded, explicit schema) and keep roughly 10% of the rows.
#
# The sample is drawn without replacement. A fixed seed is passed so that
# re-running the notebook on the same input draws the same sample instead of
# a different random subset each time.
df_estabelecimento = spark.read.csv(
    "gs://desafio-final/estabelecimentos/K3241.K03200Y0.D10710.ESTABELE.csv",
    sep=";",
    header=False,
    encoding="ISO-8859-1",
    schema=schema_estabelecimento
).sample(fraction=0.1, seed=42)

In [15]:
# Bring at most five rows to the driver and display them as a pandas DataFrame.
df_estabelecimento.limit(5).toPandas()

21/09/12 22:50:10 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Unnamed: 0,cnpj_basico,cnpj_ordem,cnpj_dig_verificador,idc_matriz_filial,nome_fantasia,cod_situacao_cadastral,data_situacao_cadastral,motivo_situacao_cadastral,nome_cidade_exterior,cod_pais,...,cod_municipio,ddd_telefone_1,num_telefone_1,ddd_telefone_2,num_telefone_2,ddd_fax,num_fax_2,email,situacao_especial,data_situacao_especial
0,26638420,1,91,1,NOBRE MADEIRAS DO BRASIL,4,20210212,63,,,...,9373,62,39881414,,,,,fmix.contabilidade@gmail.com,,
1,26638521,1,62,1,ALTAS HORAS BEBIDAS,4,20210413,63,,,...,6475,13,96918061,,,,,franciscosilva.1234@outlook.com,,
2,26638557,1,46,1,,2,20160801,0,,,...,6249,14,33541753,,,14.0,33541753.0,ALEXANDRE@VISAOCONTABILBTU.COM.BR,,
3,26638778,1,14,1,EDS PROJECTS,8,20200426,1,,,...,8801,51,97087878,,,,,EDSONDIASES@GMAIL.COM,,
4,26638891,1,8,1,BEM VESTIDA,4,20210310,63,,,...,7149,19,82713151,,,,,TECABELEZA@HOTMAIL.COM,,


In [16]:
# Write the sampled establishments data as Parquet, replacing any existing
# output at this path.
df_estabelecimento.write.parquet("gs://bootcamp-edc/raw/estabelecimento.parquet", mode="overwrite")



In [17]:
# Stop the Spark session now that all three datasets have been written.
spark.stop()