## Projeto Covid

### 1. Enviar os dados para o hdfs

In [1]:
#criando estrutura de diretório
#!hdfs dfs -mkdir -p /user/clayton/projeto/data/csv
#docker cp [arquivo.csv]  namenode:/
#!hdfs dfs -put *.csv /user/clayton/projeto/data/csv/
!hdfs dfs -ls /user/clayton/projeto/data/csv/

Found 4 items
-rw-r--r--   3 root supergroup   62492959 2022-04-20 15:32 /user/clayton/projeto/data/csv/HIST_PAINEL_COVIDBR_2020_Parte1_06jul2021.csv
-rw-r--r--   3 root supergroup   76520681 2022-04-20 15:32 /user/clayton/projeto/data/csv/HIST_PAINEL_COVIDBR_2020_Parte2_06jul2021.csv
-rw-r--r--   3 root supergroup   91120916 2022-04-20 15:32 /user/clayton/projeto/data/csv/HIST_PAINEL_COVIDBR_2021_Parte1_06jul2021.csv
-rw-r--r--   3 root supergroup    3046774 2022-04-20 15:32 /user/clayton/projeto/data/csv/HIST_PAINEL_COVIDBR_2021_Parte2_06jul2021.csv


### 2. Otimizar todos os dados do hdfs para uma tabela Hive particionada por município.

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
#atribuindo os aquivos csv para a variavel data_covid
data_covid = spark.read.csv("/user/clayton/projeto/data/csv/*.csv", sep=";", header=True, ignoreTrailingWhiteSpace=True, inferSchema=True )


In [4]:
#verificando Schema
data_covid.printSchema()

root
 |-- regiao: string (nullable = true)
 |-- estado: string (nullable = true)
 |-- municipio: string (nullable = true)
 |-- coduf: integer (nullable = true)
 |-- codmun: integer (nullable = true)
 |-- codRegiaoSaude: integer (nullable = true)
 |-- nomeRegiaoSaude: string (nullable = true)
 |-- data: timestamp (nullable = true)
 |-- semanaEpi: integer (nullable = true)
 |-- populacaoTCU2019: integer (nullable = true)
 |-- casosAcumulado: decimal(10,0) (nullable = true)
 |-- casosNovos: integer (nullable = true)
 |-- obitosAcumulado: integer (nullable = true)
 |-- obitosNovos: integer (nullable = true)
 |-- Recuperadosnovos: integer (nullable = true)
 |-- emAcompanhamentoNovos: integer (nullable = true)
 |-- interior/metropolitana: integer (nullable = true)



In [5]:
#verificar dados
data_covid.show(2)

+------+------+---------+-----+------+--------------+---------------+-------------------+---------+----------------+--------------+----------+---------------+-----------+----------------+---------------------+----------------------+
|regiao|estado|municipio|coduf|codmun|codRegiaoSaude|nomeRegiaoSaude|               data|semanaEpi|populacaoTCU2019|casosAcumulado|casosNovos|obitosAcumulado|obitosNovos|Recuperadosnovos|emAcompanhamentoNovos|interior/metropolitana|
+------+------+---------+-----+------+--------------+---------------+-------------------+---------+----------------+--------------+----------+---------------+-----------+----------------+---------------------+----------------------+
|Brasil|  null|     null|   76|  null|          null|           null|2020-02-25 00:00:00|        9|       210147125|             0|         0|              0|          0|            null|                 null|                  null|
|Brasil|  null|     null|   76|  null|          null|           null

In [61]:
#filtrando colunas necessárias, tratando dados nullos e alterando formato da data 
data_covid_filtrado = data_covid.select("data", "casosAcumulado","casosNovos", "obitosAcumulado", "obitosNovos", "Recuperadosnovos", "emAcompanhamentoNovos", "municipio", "populacaoTCU2019" )
data_covid_sem_nulos = data_covid_filtrado.na.fill({'municipio': '0', 'Recuperadosnovos':0, 'emAcompanhamentoNovos':0 })
data_covid_clean = data_covid_sem_nulos.withColumn("data", to_timestamp(col("data"))).withColumn("data",to_date(col("data")))
#data_covid_clean.filter(data_covid_clean.municipio != '0').show(truncate=False)
#df_covid = data_covid_clean.filter(data_covid_clean.municipio != '0')
#df_covid.show(2)


In [None]:
#data_covid_clean.filter(data_covid_clean.Recuperadosnovos > 0).show()

In [8]:
!hdfs dfs -ls /user/hive/warehouse/covid/

Found 5299 items
-rw-r--r--   2 root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/_SUCCESS
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=0
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abadia de Goiás
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abadia dos Dourados
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abadiânia
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abaetetuba
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abaeté
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abaiara
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abaré
drwxr-xr-x   - root supergroup          

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Bacabal
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Bacabeira
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Bacuri
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Bacurituba
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Bady Bassitt
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Baependi
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Bagre
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Bagé
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Baianópolis
drwxr-xr-x   - root supergroup          0 2022

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campina das Missões
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campina do Monte Alegre
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campina do Simão
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campinas
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campinas do Piauí
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campinas do Sul
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campinaçu
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campinorte
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Cam

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Conceição do Tocantins
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Conceição dos Ouros
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Conchal
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Conchas
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Concórdia
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Concórdia do Pará
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Condado
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Conde
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Condeúba
drwxr-xr-x   - ro

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Esmeraldas
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Espera Feliz
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Esperantina
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Esperantinópolis
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Esperança
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Esperança Nova
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Esperança do Sul
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Espigão Alto do Iguaçu
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Espigão D%

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Juína
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Jóia
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Júlio Borges
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Júlio Mesquita
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Júlio de Castilhos
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Kaloré
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Lacerdópolis
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Ladainha
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Ladário
drwxr-xr-x   - root supergroup  

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Piquet Carneiro
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Piquete
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Piracaia
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Piracanjuba
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Piracema
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Piracicaba
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Piracuruca
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Piraju
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Pirajuba
drwxr-xr-x   - root supergroup     

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=São Luiz do Paraitinga
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=São Luís
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=São Luís Gonzaga do Maranhão
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=São Luís de Montes Belos
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=São Luís do Curu
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=São Luís do Quitunde
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=São Mamede
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=São Manoel do Paraná
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hiv

In [13]:
#salvando a tabela no hive particionada por municipio
data_covid_clean.write.saveAsTable("covid", mode="OverWrite", partitionBy="municipio")

In [9]:
#visualizando dados no hdfs
!hdfs dfs -ls /user/hive/warehouse/covid

Found 5299 items
-rw-r--r--   2 root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/_SUCCESS
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=0
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abadia de Goiás
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abadia dos Dourados
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abadiânia
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abaetetuba
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abaeté
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abaiara
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Abaré
drwxr-xr-x   - root supergroup          

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Alto Paraguai
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Alto Paraná
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Alto Paraíso
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Alto Paraíso de Goiás
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Alto Parnaíba
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Alto Piquiri
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Alto Rio Doce
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Alto Rio Novo
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Alto Santo


drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Aracruz
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Aragarças
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Aragoiânia
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Aragominas
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Araguacema
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Araguaiana
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Araguainha
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Araguanã
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Araguapaz
drwxr-xr-x   - root supergroup     

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Camocim de São Félix
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campanha
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campanário
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campestre
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campestre da Serra
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campestre de Goiás
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campestre do Maranhão
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Campina Grande
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Ca

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Dona Inês
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Dores de Campos
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Dores de Guanhães
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Dores do Indaiá
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Dores do Rio Preto
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Dores do Turvo
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Doresópolis
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Dormentes
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Douradina
d

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Ipueiras
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Ipuiúna
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Ipumirim
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Ipupiara
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Ipuã
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Ipê
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Iracema
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Iracema do Oeste
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Iraceminha
drwxr-xr-x   - root supergroup          0 2022-

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Nova Tebas
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Nova Timboteua
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Nova Trento
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Nova Ubiratã
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Nova União
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Nova Veneza
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Nova Venécia
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Nova Viçosa
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Nova Xavantina
drwxr-xr-x   

drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Sapiranga
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Sapopema
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Sapucaia
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Sapucaia do Sul
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Sapucaí-Mirim
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Sapé
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Saquarema
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Sarandi
drwxr-xr-x   - root supergroup          0 2022-04-20 15:41 /user/hive/warehouse/covid/municipio=Sarapuí
drwxr-xr-x   - root supergroup        

In [10]:
#visualizando a tabela no hive
spark.read.table("covid").show(5)

+----------+--------------+----------+---------------+-----------+----------------+---------------------+---------+
|      data|casosAcumulado|casosNovos|obitosAcumulado|obitosNovos|Recuperadosnovos|emAcompanhamentoNovos|municipio|
+----------+--------------+----------+---------------+-----------+----------------+---------------------+---------+
|2021-01-01|       7700578|     24605|         195411|        462|         6756284|               748883|        0|
|2021-01-02|       7716405|     15827|         195725|        314|         6769420|               751260|        0|
|2021-01-03|       7733746|     17341|         196018|        293|         6813008|               724720|        0|
|2021-01-04|       7753752|     20006|         196561|        543|         6875230|               681961|        0|
|2021-01-05|       7810400|     56648|         197732|       1171|         6963407|               649261|        0|
+----------+--------------+----------+---------------+-----------+------

### 3. Criar as 3 vizualizações pelo Spark com os dados enviados para o HDFS: Casos Recuperados, Casos Confirmados, e Obitos confirmados

In [111]:
# 1 visualizacao - colocar .show() para visualizar a saida
df_casos_recuperados = data_covid_clean.select(max('Recuperadosnovos').alias("Casos_Recuperados"), \
                                                max('emAcompanhamentoNovos').alias("Em_Acompanhamento"))

In [112]:
# 2 visualizacao - colocar .show() para visualizar a saida
df_casos_confirmados = data_covid_clean.select(max('casosAcumulado').alias("Casos_Confirmados"), \
                                               max('casosNovos').alias("Casos_Novos"))

In [113]:
#taxa mortalidade = n de obitos x 1000 / n abitantes
# 526892 * 1000 / 210147125 = 250725
total_populacao = data_covid.select(max('populacaoTCU2019'))


In [114]:
# 3 visualizacao - colocar .show() para visualizar a saida
df_obitos_confirmados = data_covid_clean.select(max('obitosAcumulado').alias("Obitos_Confirmados"), \
                                                max('obitosNovos').alias("Obitos_Novos"))
df_obitos_confirmados_tx = df_obitos_confirmados.withColumn("Taxa_Mortalidade", lit(526892*1000/210147125))


### 4. Salvar a primeira visualização como tabela Hive

In [115]:
df_casos_recuperados.write.saveAsTable("table_casos_recuperados")

In [117]:
spark.read.table('table_casos_recuperados').show()

+-----------------+-----------------+
|Casos_Recuperados|Em_Acompanhamento|
+-----------------+-----------------+
|         17262646|          1317658|
+-----------------+-----------------+



### 5. Salvar a segunda visualização com formato parquet e compressão snappy

In [119]:
df_casos_confirmados.write.saveAsTable("table_casos_confirmados", format="parquet", compression="snappy")

In [123]:
spark.sql('''describe formatted table_casos_confirmados''').show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|   Casos_Confirmados|       decimal(10,0)|   null|
|         Casos_Novos|                 int|   null|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|            Database|             default|       |
|               Table|table_casos_confi...|       |
|               Owner|                root|       |
|        Created Time|Wed Apr 20 19:40:...|       |
|         Last Access|Thu Jan 01 00:00:...|       |
|          Created By|         Spark 2.4.1|       |
|                Type|             MANAGED|       |
|            Provider|             parquet|       |
|    Table Properties|[transient_lastDd...|       |
|          Statistics|           713 bytes|       |
|            Location|hdfs://namenode:8...|       |
|       Serde Library|org.apache.hadoop...|       |
|         In

In [None]:
#6. Salvar a terceira visualização em um tópico no Kafka

In [None]:
#7. Criar a visualização pelo Spark com os dados enviados para o HDFS:
#8. Salvar a visualização do exercício 6 em um tópico no Elastic
#9. Criar um dashboard no Elastic para visualização dos novos dados enviados