# Imports para Spark

In [2]:
from pyspark.sql import HiveContext
from pyspark.sql.types import *
sc = spark.sparkContext
sqlContext = HiveContext(sc)
sqlContext.setConf("hive.exec.dynamic.partition", "true")
sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
sqlContext.setConf("hive.exec.max.dynamic.partitions","6000")
sqlContext.setConf("hive.exec.max.dynamic.partitions.pernode","256");

# Operações HDFS 

In [1]:
!hdfs dfs -ls /datasus

Found 4 items
-rw-r--r--   2 root supergroup   62492959 2021-07-16 20:15 /datasus/HIST_PAINEL_COVIDBR_2020_Parte1_06jul2021.csv
-rw-r--r--   2 root supergroup   76520681 2021-07-16 20:15 /datasus/HIST_PAINEL_COVIDBR_2020_Parte2_06jul2021.csv
-rw-r--r--   2 root supergroup   91120916 2021-07-16 20:15 /datasus/HIST_PAINEL_COVIDBR_2021_Parte1_06jul2021.csv
-rw-r--r--   2 root supergroup    3046774 2021-07-16 20:15 /datasus/HIST_PAINEL_COVIDBR_2021_Parte2_06jul2021.csv


In [2]:
!hdfs dfs -ls /user/hive/warehouse/

Found 2 items
drwxrwxr-x   - root supergroup          0 2021-07-17 12:43 /user/hive/warehouse/covid19
drwxrwxr-x   - root supergroup          0 2021-07-16 23:27 /user/hive/warehouse/primeiravisualizacao


In [None]:
!hdfs dfs -copyFromLocal /input/*.csv /datasus/

!hdfs dfs -rm /datasus/*.csv

In [11]:
!hdfs dfs -rm -R /user/hive/warehouse/covid19

Deleted /user/hive/warehouse/covid19


!hdfs dfs -rm -R /user/hive/warehouse/covid19_particionada_codmun/*

# Spark context

In [None]:
spark.sql("use datasus").show()

In [1]:
spark.sql("show tables").show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| default|             covid19|      false|
| default|primeiravisualizacao|      false|
+--------+--------------------+-----------+



In [None]:
spark.sql("show databases").show()

df = spark.read.csv("/datasus/HIST_PAINEL_COVIDBR_2020_Parte1_06jul2021.csv", header="true", sep=";", quote="\'", inferSchema=True)

df = spark.read.csv("/datasus/HIST_PAINEL_COVIDBR_2020_Parte2_06jul2021.csv", header="true", sep=";", quote="\'", inferSchema=True)

df = spark.read.csv("/datasus/HIST_PAINEL_COVIDBR_2021_Parte1_06jul2021.csv", header="true", sep=";", quote="\'", inferSchema=True)

df = spark.read.csv("/datasus/HIST_PAINEL_COVIDBR_2021_Parte2_06jul2021.csv", header="true", sep=";", quote="\'", inferSchema=True)

In [None]:
df = spark.read.csv("/datasus/", header="true", sep=";", quote="\'", inferSchema=True)

In [None]:
df = df.na.fill(value=0)

In [None]:
df.createOrReplaceTempView("covid19_temporary") 

In [None]:
spark.sql("create table covid19_opt as select * from covid19_temporary");

df = spark.read.table('covid19')

In [None]:
df.count()

In [None]:
df.show(n=20, truncate=False, vertical=True )

In [None]:
df.printSchema()

df.na.fill(0, ["codmun"]).show()

df.na.fill(value=0,subset=["codmun"]).show()

In [None]:
df = df.na.fill(value=0)

# register as a temporary view [sql]
df.createOrReplaceTempView("df_view")

df.write.partitionBy("codmun") \
        .mode("append") \
        .save("hdfs://namenode:8020/datasus/covid19_particionada_codmun2")

df.write.save("hdfs://namenode:8020/datasus/covid19_particionada_codmun3")

df.write.partitionBy("codmun").saveAsTable("covid19_particionada_codmun")

spark.sql("SELECT codmun,count FROM df_view")

In [None]:
topic_string = df.select("regiao","coduf","codmun")
topic_string.show(truncate=False)

In [6]:
spark.sql("drop table covid19")
spark.sql("CREATE TABLE covid19 (regiao string,estado string,municipio string,coduf int,codRegiaoSaude int,nomeRegiaoSaude string,data string,semanaEpi int,populacaoTCU2019 int,casosAcumulado int,casosNovos int,obitosAcumulado int,obitosNovos int,Recuperadosnovos int,emAcompanhamentoNovos int,interiorMetropolitana string) PARTITIONED BY (codmun int) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' LINES TERMINATED BY '\n' STORED AS TEXTFILE")

DataFrame[]

In [10]:
df = spark.sql("select * from covid19")
df.printSchema

<bound method DataFrame.printSchema of DataFrame[regiao: string, estado: string, municipio: string, coduf: int, codRegiaoSaude: int, nomeRegiaoSaude: string, data: string, semanaEpi: int, populacaoTCU2019: int, casosAcumulado: int, casosNovos: int, obitosAcumulado: int, obitosNovos: int, Recuperadosnovos: int, emAcompanhamentoNovos: int, interiorMetropolitana: string, codmun: int]>

In [None]:
df.write.partitionBy("codmun") \
        .mode("overwrite") \
        .save("covid19_particionada_codmun")

In [None]:
spark.sql("SELECT count(*) FROM covid19.parquet c19").show()

# primeira visualização

In [None]:
from default.covid19_opt insert overwrite table datasus.covid19 partition(codmun) select regiao,estado,municipio,        coduf,        codregiaosaude,        nomeregiaosaude,        data,        semanaepi,       populacaotcu2019,        casosacumulado,        casosnovos,        obitosacumulado,        obitosnovos,        recuperadosnovos,        emacompanhamentonovos,        interiorMetropolitana,        COALESCE(codmun, 0) codmun

In [None]:
spark.sql("from default.covid19_opt insert overwrite table datasus.covid19 partition(codmun) select regiao,estado,municipio, coduf, codregiaosaude,        nomeregiaosaude,        data,        semanaepi,       populacaotcu2019,        casosacumulado,        casosnovos,        obitosacumulado, obitosnovos, recuperadosnovos, emacompanhamentonovos, `interior/metropolitana`, COALESCE(codmun, 0) codmun  ")

In [None]:
col_list = df.select('mycol').collect()

In [None]:
spark.sql("SELECT data,count(1),sum(c19.Recuperadosnovos),sum(c19.emAcompanhamentoNovos) FROM covid19 c19 where c19.regiao like 'Brasil%' group by data order by data").show(400)

In [None]:
spark.sql("SELECT c19.regiao,count(1) FROM default.covid19 c19 group by c19.regiao").show()

In [None]:
spark.sql("SELECT c19.municipio,c19.estado,c19.codmun,c19.Recuperadosnovos,c19.emAcompanhamentoNovos FROM covid19 c19 where c19.codmun = 150300").show(10000)

In [None]:
spark.sql("SELECT c19.municipio,count(1) FROM covid19 c19 group by c19.municipio").show(10000)

# segunda visualização