In [16]:
# configura os nós de um cluster (nesse caso estamos rodando standalone)
from pyspark.sql import SparkSession

In [17]:
# configura os nós de um cluster (nesse caso estamos rodando standalone)
spark = SparkSession.builder.appName("corona-analisys").getOrCreate();

In [18]:
# lendo os dados de um CSV e permitindo que o Spark infira o tipo de dados. Também informamos que o arquivo contém um header
df = spark.read.csv('covid_19_data.csv', inferSchema=True, header=True)

In [19]:
# listando 3 linhas para ver se deu certo a importação do CSV
df.show(3)

+---+---------------+--------------+--------------+---------------+---------+------+---------+
|SNo|ObservationDate|Province/State|Country/Region|    Last Update|Confirmed|Deaths|Recovered|
+---+---------------+--------------+--------------+---------------+---------+------+---------+
|  1|     01/22/2020|         Anhui|Mainland China|1/22/2020 17:00|      1.0|   0.0|      0.0|
|  2|     01/22/2020|       Beijing|Mainland China|1/22/2020 17:00|     14.0|   0.0|      0.0|
|  3|     01/22/2020|     Chongqing|Mainland China|1/22/2020 17:00|      6.0|   0.0|      0.0|
+---+---------------+--------------+--------------+---------------+---------+------+---------+
only showing top 3 rows



In [5]:
# importando SQLContect para poder manipular o Dataframe como se fosse uma Tabela SQL
from pyspark.sql import SQLContext

In [20]:
# createOrReplaceTempView -> criar uma visão temporária com o nome coronaTable (visão permite acessar os dados como uma tabela SQL)
df.createOrReplaceTempView("coronaTable")

In [21]:
#executa uma consulta SQL no dataframe (na verdade visão criada na linha acima)
sql = spark.sql("select * from coronaTable where `Province/State` is null")

In [22]:
#importando when e col para processar os dados da coluna flag
from pyspark.sql.functions import when,col

In [23]:
# criando outro DataFrame (sobrescrevendo ) o DataFrame anterior, alteramos a coluna 'Province/State' quando ela possuísse valor null para 'N/A', caso contrário, retornamos o valor já existente
df = df.withColumn( 'Province/State', when ( col ('Province/State').isNull() , 'N/A' ).otherwise(df['Province/State'])   )

In [24]:
# filtrando a coluna 'Province/State' para verificar se ainda possui valores nulos
df.filter ( df['Province/State'].isNull() ).show(1)

+---+---------------+--------------+--------------+-----------+---------+------+---------+
|SNo|ObservationDate|Province/State|Country/Region|Last Update|Confirmed|Deaths|Recovered|
+---+---------------+--------------+--------------+-----------+---------+------+---------+
+---+---------------+--------------+--------------+-----------+---------+------+---------+



In [25]:
#executa uma consulta SQL no dataframe (na verdade visão criada na linha acima)
sql = spark.sql("select * from coronaTable where `ObservationDate` is null").show(1)


+---+---------------+--------------+--------------+-----------+---------+------+---------+
|SNo|ObservationDate|Province/State|Country/Region|Last Update|Confirmed|Deaths|Recovered|
+---+---------------+--------------+--------------+-----------+---------+------+---------+
+---+---------------+--------------+--------------+-----------+---------+------+---------+



In [26]:
#executa uma consulta SQL no dataframe (na verdade visão criada na linha acima)
sql = spark.sql("select * from coronaTable where `Country/Region` is null").show(3)


+---+---------------+--------------+--------------+-----------+---------+------+---------+
|SNo|ObservationDate|Province/State|Country/Region|Last Update|Confirmed|Deaths|Recovered|
+---+---------------+--------------+--------------+-----------+---------+------+---------+
+---+---------------+--------------+--------------+-----------+---------+------+---------+



In [27]:
#executa uma consulta SQL no dataframe (na verdade visão criada na linha acima)
sql = spark.sql("select * from coronaTable where `Last Update` is null").show(3)


+---+---------------+--------------+--------------+-----------+---------+------+---------+
|SNo|ObservationDate|Province/State|Country/Region|Last Update|Confirmed|Deaths|Recovered|
+---+---------------+--------------+--------------+-----------+---------+------+---------+
+---+---------------+--------------+--------------+-----------+---------+------+---------+



In [28]:
#executa uma consulta SQL no dataframe (na verdade visão criada na linha acima)
sql = spark.sql("select * from coronaTable where `Confirmed` is null").show(3)


+---+---------------+--------------+--------------+-----------+---------+------+---------+
|SNo|ObservationDate|Province/State|Country/Region|Last Update|Confirmed|Deaths|Recovered|
+---+---------------+--------------+--------------+-----------+---------+------+---------+
+---+---------------+--------------+--------------+-----------+---------+------+---------+



In [29]:
#executa uma consulta SQL no dataframe (na verdade visão criada na linha acima)
sql = spark.sql("select * from coronaTable where `Deaths` is null").show(3)


+---+---------------+--------------+--------------+-----------+---------+------+---------+
|SNo|ObservationDate|Province/State|Country/Region|Last Update|Confirmed|Deaths|Recovered|
+---+---------------+--------------+--------------+-----------+---------+------+---------+
+---+---------------+--------------+--------------+-----------+---------+------+---------+



In [30]:
#executa uma consulta SQL no dataframe (na verdade visão criada na linha acima)
sql = spark.sql("select * from coronaTable where `Recovered` is null").show(3)


+---+---------------+--------------+--------------+-----------+---------+------+---------+
|SNo|ObservationDate|Province/State|Country/Region|Last Update|Confirmed|Deaths|Recovered|
+---+---------------+--------------+--------------+-----------+---------+------+---------+
+---+---------------+--------------+--------------+-----------+---------+------+---------+



In [31]:
#executa uma consulta SQL no dataframe (na verdade visão criada na linha acima)
sql = spark.sql("select * from coronaTable where `Deaths` is null").show(3)


+---+---------------+--------------+--------------+-----------+---------+------+---------+
|SNo|ObservationDate|Province/State|Country/Region|Last Update|Confirmed|Deaths|Recovered|
+---+---------------+--------------+--------------+-----------+---------+------+---------+
+---+---------------+--------------+--------------+-----------+---------+------+---------+



In [None]:
#  preencher com valores 0 a coluna 'Deaths' quando houver valor null
df.na.fill(0,subset=['Deaths']).show(2)


In [32]:
#Aleph - dúvida de agregação do novo campo
dfGroupByMax = df.groupBy("Province/State").max()

In [33]:
# ordenar os dados 
from pyspark.sql.functions import col, desc

In [34]:
# dúvida Gabriel
dfGroupByMax.sort( col("max(Deaths)").desc() ).show(10)

+--------------+--------+--------------+-----------+--------------+
|Province/State|max(SNo)|max(Confirmed)|max(Deaths)|max(Recovered)|
+--------------+--------+--------------+-----------+--------------+
|       England|  284851|     3861901.0|   112182.0|           0.0|
|           N/A|  284714|     5605532.0|   104093.0|     4480381.0|
|     Sao Paulo|  285171|     2923367.0|    97058.0|     2588973.0|
|   Maharashtra|  285006|     4665754.0|    69615.0|     3930302.0|
|    California|  284792|     3744830.0|    62078.0|           6.0|
|      New York|  285064|     2054848.0|    52358.0|           0.0|
|         Texas|  285216|     2897110.0|    50290.0|           0.0|
|Rio de Janeiro|  285147|      747449.0|    44835.0|      694160.0|
|       Florida|  284858|     2242778.0|    35268.0|           0.0|
|  Minas Gerais|  285029|     1370202.0|    34289.0|     1256330.0|
+--------------+--------+--------------+-----------+--------------+
only showing top 10 rows



In [35]:
# import format number para formatar a quantidade de casas decimais uma coluna
from pyspark.sql.functions import format_number

In [36]:
# dúvida Gabriel
dfGroupByMax.sort( col("max(Deaths)").desc() ).select(format_number('max(Recovered)', 0).alias('gabriel') , 'Province/State').show(10)

+---------+--------------+
|  gabriel|Province/State|
+---------+--------------+
|        0|       England|
|4,480,381|           N/A|
|2,588,973|     Sao Paulo|
|3,930,302|   Maharashtra|
|        6|    California|
|        0|      New York|
|        0|         Texas|
|  694,160|Rio de Janeiro|
|        0|       Florida|
|1,256,330|  Minas Gerais|
+---------+--------------+
only showing top 10 rows



In [37]:
#Vitor - dúvida de agregação do novo campo
dfGroupByCount = df.groupBy("Country/Region").count()

In [38]:
dfGroupByCount.show(3)

+--------------+-----+
|Country/Region|count|
+--------------+-----+
|          Chad|  410|
|        Russia|28010|
|      Paraguay|  421|
+--------------+-----+
only showing top 3 rows



In [42]:
#importando a função Year
from pyspark.sql.functions import year

In [40]:
dfGroupByMax.show(3)

+--------------+--------+--------------+-----------+--------------+
|Province/State|max(SNo)|max(Confirmed)|max(Deaths)|max(Recovered)|
+--------------+--------+--------------+-----------+--------------+
|          Utah|  285254|      398012.0|     2204.0|           0.0|
|     Cajamarca|  284789|       52187.0|     1255.0|           0.0|
|       Antwerp|  284740|      127330.0|        0.0|           0.0|
+--------------+--------+--------------+-----------+--------------+
only showing top 3 rows



In [41]:
df.show(3)

+---+---------------+--------------+--------------+---------------+---------+------+---------+
|SNo|ObservationDate|Province/State|Country/Region|    Last Update|Confirmed|Deaths|Recovered|
+---+---------------+--------------+--------------+---------------+---------+------+---------+
|  1|     01/22/2020|         Anhui|Mainland China|1/22/2020 17:00|      1.0|   0.0|      0.0|
|  2|     01/22/2020|       Beijing|Mainland China|1/22/2020 17:00|     14.0|   0.0|      0.0|
|  3|     01/22/2020|     Chongqing|Mainland China|1/22/2020 17:00|      6.0|   0.0|      0.0|
+---+---------------+--------------+--------------+---------------+---------+------+---------+
only showing top 3 rows



In [45]:
df.select ( year ("ObservationDate" ) ).show(3)

+---------------------+
|year(ObservationDate)|
+---------------------+
|                 null|
|                 null|
|                 null|
+---------------------+
only showing top 3 rows



In [46]:
#Solução da Nilane
from pyspark.sql.functions import unix_timestamp, to_date

In [64]:
# funçao que converte a data do formato 'MM/dd/yyyy' para timestamp
#strDate é um parâmetro com a data no formato 'MM/dd/yyyy' -> Ex: '01/22/2020'
# convertendo a data no formato 'MM/dd/yyyy' para unix timestamp (quantos segundos passaram do dia 1 de janeiro de 1970 até a data alvo)
# unix_timestamp( col("ObservationDate"), 'MM/dd/yyyy' ) -> retorna um objeto do tipo unix_timestamp
# to_date() espera um objeto do tipo timestamp, então devemos 'converter' de unix_timestamp para timestamp -> cast('timestamp')
#retornat a data no formato Timestamp - Ex : 2020-01-22
def convertStrToTimestamp (strDate, formatDate = 'MM/dd/yyyy' ):
    ut = unix_timestamp( strDate, formatDate )
    ts = ut.cast('timestamp')
    return to_date ( ts )
# convertStrToTimestamp( '01/01/2020', 'yyyy-dd-MM')   

In [63]:
# mostrando os dados da coluna 'ObservationDate', já convertidos em ano
df.select ( year( convertStrToTimestamp( col('ObservationDate')  ) ) ).show(3)

+-----------------------------------------------------------------------------+
|year(to_date(CAST(unix_timestamp(ObservationDate, MM/dd/yyyy) AS TIMESTAMP)))|
+-----------------------------------------------------------------------------+
|                                                                         2020|
|                                                                         2020|
|                                                                         2020|
+-----------------------------------------------------------------------------+
only showing top 3 rows



In [68]:
# usando o dataFrame *df*, iremos adicionar uma nova coluna apenas com o valor de Ano(Year)
df = df.withColumn ( 'Year', year( convertStrToTimestamp( col('ObservationDate')  ) ) )

In [73]:
df.show(3)

+---+---------------+--------------+--------------+---------------+---------+------+---------+----+
|SNo|ObservationDate|Province/State|Country/Region|    Last Update|Confirmed|Deaths|Recovered|Year|
+---+---------------+--------------+--------------+---------------+---------+------+---------+----+
|  1|     01/22/2020|         Anhui|Mainland China|1/22/2020 17:00|      1.0|   0.0|      0.0|2020|
|  2|     01/22/2020|       Beijing|Mainland China|1/22/2020 17:00|     14.0|   0.0|      0.0|2020|
|  3|     01/22/2020|     Chongqing|Mainland China|1/22/2020 17:00|      6.0|   0.0|      0.0|2020|
+---+---------------+--------------+--------------+---------------+---------+------+---------+----+
only showing top 3 rows



In [74]:
#importando a função Month
from pyspark.sql.functions import month

In [75]:
# usando o dataFrame *df*, iremos adicionar uma nova coluna apenas com o valor de Mês(Month)
df = df.withColumn ( 'Month', month( convertStrToTimestamp( col('ObservationDate')  ) ) )

In [91]:
#agrupando por ano, mês -> retorando os valores máximos das outras colunas
dfGroupBy = df.groupBy('Year', 'Month', 'Country/Region', 'Province/State').sum()

In [94]:
#ordenando dataframe pela coluna 'max(Deaths)' de forma descendente
dfGroupBy.sort(col('Year').desc(),col('Month').desc(),  col('sum(Deaths)').desc(), col('Province/State') ).show()

+----+-----+--------------+----------------+--------+--------------+-----------+--------------+---------+----------+
|Year|Month|Country/Region|  Province/State|sum(SNo)|sum(Confirmed)|sum(Deaths)|sum(Recovered)|sum(Year)|sum(Month)|
+----+-----+--------------+----------------+--------+--------------+-----------+--------------+---------+----------+
|2021|    5|            UK|         England|  568938|     7722425.0|   224351.0|           0.0|     4042|        10|
|2021|    5|        France|             N/A|  568432|   1.1202827E7|   208100.0|      614539.0|     4042|        10|
|2021|    5|        Brazil|       Sao Paulo|  569578|     5841411.0|   193999.0|     5141626.0|     4042|        10|
|2021|    5|          Iran|             N/A|  568466|     5051012.0|   144574.0|     3959229.0|     4042|        10|
|2021|    5|         India|     Maharashtra|  569248|     9331508.0|   139230.0|     7860604.0|     4042|        10|
|2021|    5|        Poland|             N/A|  568568|     560185