In [23]:
#create sparksession object
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('log_reg').getOrCreate()

#Load the dataset
df=spark.read.csv('covid.csv',inferSchema=True,header=True)

In [24]:
df.show()

+----------+---+-----+----+-----+------+-----------------------+-----+--------------------+-----------+------------+----------------------------------------------------------+
|   dateRep|day|month|year|cases|deaths|countriesAndTerritories|geoId|countryterritoryCode|popData2019|continentExp|Cumulative_number_for_14_days_of_COVID-19_cases_per_100000|
+----------+---+-----+----+-----+------+-----------------------+-----+--------------------+-----------+------------+----------------------------------------------------------+
|2020-08-22| 22|    8|2020|   38|     0|            Afghanistan|   AF|                 AFG|3.8041757E7|        Asia|                                                2.31061883|
|2020-08-21| 21|    8|2020|   97|     2|            Afghanistan|   AF|                 AFG|3.8041757E7|        Asia|                                                2.41576644|
|2020-08-20| 20|    8|2020|  160|     8|            Afghanistan|   AF|                 AFG|3.8041757E7|        Asia|    

In [32]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer()\
  .setInputCol("dateRep")\
  .setOutputCol("day_of_week_index").fit(df)

df2 = indexer.transform(df)


In [35]:

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder()\
  .setInputCol("day_of_week_index")\
  .setOutputCol("day_of_week_encoded")

df3 = encoder.fit(df2)
df3=df3.transform(df2)
df4=df3.select('dateRep','day_of_week_index','day_of_week_encoded')
df4.show()



+----------+-----------------+-------------------+
|   dateRep|day_of_week_index|day_of_week_encoded|
+----------+-----------------+-------------------+
|2020-08-22|            112.0|  (235,[112],[1.0])|
|2020-08-21|             96.0|   (235,[96],[1.0])|
|2020-08-20|             95.0|   (235,[95],[1.0])|
|2020-08-19|             94.0|   (235,[94],[1.0])|
|2020-08-18|             93.0|   (235,[93],[1.0])|
|2020-08-17|            111.0|  (235,[111],[1.0])|
|2020-08-16|             92.0|   (235,[92],[1.0])|
|2020-08-15|             91.0|   (235,[91],[1.0])|
|2020-08-14|             90.0|   (235,[90],[1.0])|
|2020-08-13|             89.0|   (235,[89],[1.0])|
|2020-08-12|             88.0|   (235,[88],[1.0])|
|2020-08-11|             87.0|   (235,[87],[1.0])|
|2020-08-10|             86.0|   (235,[86],[1.0])|
|2020-08-09|             85.0|   (235,[85],[1.0])|
|2020-08-08|             84.0|   (235,[84],[1.0])|
|2020-08-07|             83.0|   (235,[83],[1.0])|
|2020-08-06|             82.0| 

In [7]:
import pyspark.sql.functions as f
df=df.where(f.col("geoId").isin({"FR", "IT", "ES"})).select('geoId','month','cases','deaths')
df=df.orderBy("geoId", ascending=True)
df.show(5)

+-----+-----+-----+------+
|geoId|month|cases|deaths|
+-----+-----+-----+------+
|   ES|    8| 8148|    25|
|   ES|    8|    0|     0|
|   ES|    8| 7039|    16|
|   ES|    8| 6671|   127|
|   ES|    8| 5114|    24|
+-----+-----+-----+------+
only showing top 5 rows



In [22]:
from pyspark.sql.functions import desc

df.groupBy(["geoId","month"]).sum("deaths").withColumnRenamed("sum(deaths)", "deaths_total")\
    .sort(("month")).limit(5).show()

+-----+-----+------------+
|geoId|month|deaths_total|
+-----+-----+------------+
|   ES|    1|           0|
|   FR|    1|           0|
|   IT|    1|           0|
|   FR|    2|           2|
|   ES|    2|           0|
+-----+-----+------------+

