# Imports para Spark

In [1]:
from pyspark.sql import HiveContext
from pyspark.sql.types import *

# Operações HDFS 

In [4]:
# List the contents of the /datasus directory on HDFS.
!hdfs dfs -ls /datasus

Found 4 items
-rw-r--r--   2 root supergroup   62492959 2021-07-11 19:09 /datasus/HIST_PAINEL_COVIDBR_2020_Parte1_06jul2021.csv
-rw-r--r--   2 root supergroup   76520681 2021-07-11 19:10 /datasus/HIST_PAINEL_COVIDBR_2020_Parte2_06jul2021.csv
-rw-r--r--   2 root supergroup   91120916 2021-07-11 19:10 /datasus/HIST_PAINEL_COVIDBR_2021_Parte1_06jul2021.csv
-rw-r--r--   2 root supergroup    3046774 2021-07-11 19:10 /datasus/HIST_PAINEL_COVIDBR_2021_Parte2_06jul2021.csv


In [None]:
# Copy every .csv file from the local /input directory into /datasus/ on HDFS.
!hdfs dfs -copyFromLocal /input/*.csv /datasus/

In [None]:
# Delete every .csv file directly under /datasus/ on HDFS.
# NOTE(review): destructive clean-up step. Executed in notebook order it removes
# the files that the spark.read.csv cells further down expect to find.
!hdfs dfs -rm /datasus/*.csv

In [3]:
# Recursively delete every /datasus/ entry whose name starts with "covid19_".
# NOTE(review): destructive; presumably used to clear earlier write outputs
# before re-running the write cells - confirm before executing.
!hdfs dfs -rm -R /datasus/covid19_*

Deleted /datasus/covid19_particionada_codmun2
Deleted /datasus/covid19_particionada_codmun3


In [None]:
# Recursively delete everything inside the covid19_particionada_codmun
# directory of the Hive warehouse.
# NOTE(review): destructive; this removes files only - presumably the table's
# metastore entry is left in place; confirm that is the intent.
!hdfs dfs -rm -R /user/hive/warehouse/covid19_particionada_codmun/*

# Spark context

In [1]:
# Print the tables visible in the current database.
# NOTE(review): `spark` is not created in the visible part of this notebook;
# presumably the kernel injects a ready-made SparkSession - confirm.
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|  covid19|      false|
+--------+---------+-----------+



In [None]:
# Take the SparkContext behind the session; it is passed to HiveContext next.
sc = spark.sparkContext

In [None]:
# Wrap the SparkContext in a HiveContext; later cells call
# sqlContext.setConf(...) on it to set Hive options.
# NOTE(review): PySpark documents HiveContext as deprecated since 2.0 in favour
# of a Hive-enabled SparkSession - consider migrating.
sqlContext = HiveContext(sc)

# Load the four CSV parts into one DataFrame with a single read.
# Assigning each file to `df` in turn keeps only the last part and throws away
# the other three reads, each of which pays for a schema-inference pass.
df = spark.read.csv(
    [
        "/datasus/HIST_PAINEL_COVIDBR_2020_Parte1_06jul2021.csv",
        "/datasus/HIST_PAINEL_COVIDBR_2020_Parte2_06jul2021.csv",
        "/datasus/HIST_PAINEL_COVIDBR_2021_Parte1_06jul2021.csv",
        "/datasus/HIST_PAINEL_COVIDBR_2021_Parte2_06jul2021.csv",
    ],
    header="true",
    sep=";",
    quote="\'",
    inferSchema=True,
)

In [None]:
# Read every CSV part under /datasus/ into one DataFrame.
# The glob limits the read to the .csv files: the write cells in this notebook
# also save DataFrame output into sub-directories of /datasus/
# (covid19_particionada_codmun2 / covid19_particionada_codmun3), and those must
# not be parsed as CSV input.
df = spark.read.csv("/datasus/*.csv", header="true", sep=";", quote="\'", inferSchema=True)

In [2]:
# Load the existing `covid19` table as the working DataFrame.
df = spark.table("covid19")

In [3]:
# Number of rows in the DataFrame.
df.count()

714482

In [5]:
# Print the first 20 records, one field per line, without truncating values.
# NOTE(review): in the captured output, record 0 holds the column names as
# values (regiao, estado, ...), i.e. the table appears to contain the CSV
# header line as a data row - confirm and filter it out if so.
df.show(20, truncate=False, vertical=True)

-RECORD 0---------------------------------------
 regiao                | regiao                 
 estado                | estado                 
 municipio             | municipio              
 coduf                 | null                   
 codmun                | null                   
 codregiaosaude        | null                   
 nomeregiaosaude       | nomeRegiaoSaude        
 data                  | data                   
 semanaepi             | null                   
 populacaotcu2019      | null                   
 casosacumulado        | null                   
 casosnovos            | null                   
 obitosacumulado       | null                   
 obitosnovos           | null                   
 recuperadosnovos      | null                   
 emacompanhamentonovos | null                   
 interiormetropolitana | interior/metropolitana 
-RECORD 1---------------------------------------
 regiao                | Brasil                 
 estado             

In [None]:
# Print the column names and types of the DataFrame.
df.printSchema()

# Preview the frame with nulls in `codmun` replaced by 0. This only displays
# the result; `df` itself is not reassigned here. One call is enough - the
# positional and keyword spellings of na.fill are the same operation.
df.na.fill(value=0, subset=["codmun"]).show()

In [None]:
# Replace nulls with 0 across the frame. Per the PySpark docs an integer fill
# value is applied to numeric columns only; other columns keep their nulls.
df = df.fillna(0)

# Register `df` as the temporary view "df_view" so it can be queried through
# spark.sql; an existing view with that name is replaced.
df.createOrReplaceTempView("df_view")

# Append `df` to the codmun2 output path on HDFS, partitioned by `codmun`.
# No format is set, so the session's default data source is used.
(
    df.write
    .partitionBy("codmun")
    .mode("append")
    .save("hdfs://namenode:8020/datasus/covid19_particionada_codmun2")
)

# Save `df`, unpartitioned, to the codmun3 output path on HDFS.
# No save mode is set, so the writer's default applies; per the Spark docs that
# default raises if the target already exists, so the path must be cleared
# before this can be re-run.
(
    df.write
    .save("hdfs://namenode:8020/datasus/covid19_particionada_codmun3")
)

# Create the table covid19_particionada_codmun from `df`, partitioned by
# `codmun`. No save mode is set, so the writer's default applies; per the Spark
# docs that default raises if the table already exists.
(
    df.write
    .partitionBy("codmun")
    .saveAsTable("covid19_particionada_codmun")
)

# Row count per municipality code in the temporary view.
# count(1) with GROUP BY is required here: a bare `count` in the select list is
# resolved as a column name, and the columns listed by df.show() include no
# such column, so the query would fail analysis. .show() prints the result,
# matching the other query cells.
spark.sql("SELECT codmun, count(1) FROM df_view GROUP BY codmun").show()

In [None]:
# Project the region, state-code and municipality-code columns and print them
# untruncated. NOTE(review): despite its name, `topic_string` is a DataFrame.
topic_string = df.select(["regiao", "coduf", "codmun"])
topic_string.show(truncate=False)

In [None]:
# Turn on Hive dynamic partitioning and set its mode to non-strict.
sqlContext.setConf(key="hive.exec.dynamic.partition", value="true")
sqlContext.setConf(key="hive.exec.dynamic.partition.mode", value="nonstrict")

In [None]:
# Append `df` into the table covid19_particionada_codmun, partitioned by
# `codmun`.
# NOTE(review): an earlier cell writes the same table with saveAsTable and no
# mode; running both would load `df` twice - confirm which one is intended.
(
    df.write
    .partitionBy("codmun")
    .mode("append")
    .saveAsTable("covid19_particionada_codmun")
)

In [None]:
# Preview the covid19 table; show() with no arguments prints the first 20 rows.
spark.sql("SELECT * FROM covid19 c19").show()

# Primeira visualização

In [13]:
# Per-date summary of the rows whose `regiao` starts with "Brasil": the row
# count plus the sums of Recuperadosnovos and emAcompanhamentoNovos, ordered by
# date. Prints up to 400 rows.
spark.sql(
    "SELECT data,count(1),sum(c19.Recuperadosnovos),sum(c19.emAcompanhamentoNovos) FROM covid19 c19 where c19.regiao like 'Brasil%' group by data order by data"
).show(400)

+----------+--------+---------------------+--------------------------+
|      data|count(1)|sum(Recuperadosnovos)|sum(emAcompanhamentoNovos)|
+----------+--------+---------------------+--------------------------+
|2020-02-25|       1|                 null|                      null|
|2020-02-26|       1|                 null|                      null|
|2020-02-27|       1|                 null|                      null|
|2020-02-28|       1|                 null|                      null|
|2020-02-29|       1|                 null|                      null|
|2020-03-01|       1|                 null|                      null|
|2020-03-02|       1|                 null|                      null|
|2020-03-03|       1|                 null|                      null|
|2020-03-04|       1|                 null|                      null|
|2020-03-05|       1|                 null|                      null|
|2020-03-06|       1|                 null|                      null|
|2020-

In [None]:
# Row count per `regiao` value in the covid19 table.
spark.sql(
    "SELECT c19.regiao,count(1) FROM covid19 c19 group by c19.regiao"
).show()

In [None]:
# Municipality, state, code and the two "novos" columns for the rows with
# codmun = 150300. Prints up to 10000 rows.
spark.sql(
    "SELECT c19.municipio,c19.estado,c19.codmun,c19.Recuperadosnovos,c19.emAcompanhamentoNovos FROM covid19 c19 where c19.codmun = 150300"
).show(10000)

In [None]:
# Row count per `municipio` value in the covid19 table. Prints up to 10000 rows.
spark.sql(
    "SELECT c19.municipio,count(1) FROM covid19 c19 group by c19.municipio"
).show(10000)

# Segunda visualização