In [None]:
import $file.common
import spark._
import common._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import spark.implicits._
import spark.sqlContext.implicits._
import org.apache.spark.sql.functions.{col, to_date}
import org.apache.spark.sql.types.DateType

In [None]:
import plotly._
import plotly.element._
import plotly.layout._
import plotly.Almond._

In [None]:
// Set the Spark SQL shuffle partition count to 8 for this session.
spark.conf.set("spark.sql.shuffle.partitions", 8)

# Query 5: Tasa de infecciones frente a vacunaciones

Para obtener esta consulta se van a utilizar 3 datasets diferentes: en primer lugar, un dataset de infecciones de Covid-19 actualizado a las fechas de vacunaciones; en segundo lugar, el dataset de datos demográficos con el que ya veníamos trabajando; y, por último, un dataset con el número de vacunaciones diarias por país.

El objetivo de esta query es limpiar los datos y ajustar los tipos fecha para:
    - Obtener la tasa de infecciones por densidad de población (como hemos hecho anteriormente).
    - Obtener la tasa de vacunaciones por densidad de población.
    - Obtener la tasa de vacunaciones frente a infecciones.
    
De esta forma podremos observar qué crece más rápido, si las vacunas o las infecciones, y qué porcentaje de la población está pendiente de ser vacunada.

## 1. Consulta con DataSet

Modifico los datos de entrada para ajustar el formato de la fecha.

In [None]:
/**
 * Typed view over the COVID-19 infections CSV.
 *
 * `ObservationDate` has its "/" separators replaced by "-", is parsed with the
 * "MM-dd-yyyy" pattern and is finally exposed under the name `date`.
 * Declared as a `def`, so the reader pipeline is rebuilt on every reference.
 */
def infectionsDS = {
    val csvOptions = Map(
        "header" -> "true",
        "charset" -> "UTF8",
        "delimiter" -> ",",
        "inferSchema" -> "true")

    val rawInfections = spark.read
        .options(csvOptions)
        .csv("../datasets/covid_19_data.csv")

    // Normalise the separator first so a single date pattern can be applied.
    val dashedDate = translate($"ObservationDate", "/", "-")

    rawInfections
        .withColumn("ObservationDate", dashedDate)
        .withColumn("ObservationDate", to_date(col("ObservationDate"), "MM-dd-yyyy"))
        .withColumnRenamed("ObservationDate", "date")
        .as[(Int, String, String, String, String, Double, Double, Double)]
}

In [None]:
/**
 * Typed view over the 2020 population-by-country CSV.
 *
 * Renames "Country (or dependency)" to `Country` and "Population (2020)" to
 * `Population` so they can be referenced without special characters.
 * Declared as a `def`, so the reader pipeline is rebuilt on every reference.
 */
def populationDS = {
    val csvOptions = Map(
        "header" -> "true",
        "charset" -> "UTF8",
        "delimiter" -> ",",
        "inferSchema" -> "true")

    val rawPopulation = spark.read
        .options(csvOptions)
        .csv("../datasets/population_by_country_2020.csv")

    rawPopulation
        .withColumnRenamed("Country (or dependency)", "Country")
        .withColumnRenamed("Population (2020)", "Population")
        .as[(String, Float, String, Float, Float, Float, Double, String, String, String, String)]
}

In [None]:
/**
 * Typed view over the per-country vaccinations CSV, with `date` converted to a
 * date value and exposed as `dateVaccinated`.
 *
 * Date parsing: the default conversion (`to_date` without a pattern) is tried
 * first, and the original "MM-dd-yyyy" pattern is kept only as a fallback.
 * NOTE(review): the untyped twin `vaccinationsDF` joins the raw `date` text
 * directly against a date column, which can only match ISO `yyyy-MM-dd` text,
 * so a hard-coded "MM-dd-yyyy" pattern looks wrong for this file. If schema
 * inference already yields a timestamp, the pattern-less conversion handles
 * that as well. Confirm against the actual CSV.
 *
 * Declared as a `def`, so the reader pipeline is rebuilt on every reference.
 */
def vaccinationsDS = spark.read
    .option("header", "true")
    .option("charset", "UTF8")
    .option("delimiter",",")
    .option("inferSchema", "true")
    .csv("../datasets/country_vaccinations.csv")
    // coalesce stops at the first non-null result, so the pattern-based parse
    // only runs for values the default conversion could not handle.
    .withColumn("date", coalesce(to_date(col("date")), to_date(col("date"),"MM-dd-yyyy")))
    .withColumnRenamed("date","dateVaccinated")
    .as[(String,String,java.sql.Timestamp,Double,Double,Double,Double,Double,Double,Double,Double,Double,String,String,String)]

In [None]:
/**
 * Typed query: infections joined with vaccinations (same day, same country)
 * and with population (by `Country`), producing per-country, per-day ratios:
 *   - "infection Per Population"   = confirmed / Population
 *   - "vaccination Per Population" = people_vaccinated / Population (8 decimals)
 *   - "infection-vaccination rate" = people_vaccinated / confirmed  (8 decimals)
 * Rows are ordered by ascending `date`. The last ratio is typed as
 * `Option[Double]`, so a null result there is representable.
 */
def megaDS = {
    // An infection row matches a vaccination row for the same day and country.
    val sameDayAndCountry =
        $"date" === $"dateVaccinated" && $"Country/Region" === $"country"

    val infectionPerPopulation =
        ($"confirmed" / $"Population").as("infection Per Population")
    val vaccinationPerPopulation =
        ($"people_vaccinated" / $"Population").as("vaccination Per Population")
    val vaccinationPerInfection =
        ($"people_vaccinated" / $"confirmed").as("infection-vaccination rate")

    infectionsDS
        .join(vaccinationsDS, sameDayAndCountry)
        .join(populationDS, "Country")
        .na.fill(0)
        .select(
            $"country",
            $"date",
            $"confirmed",
            $"people_vaccinated",
            $"Population",
            infectionPerPopulation,
            vaccinationPerPopulation,
            vaccinationPerInfection)
        .orderBy($"date".asc)
        .withColumn("infection-vaccination rate", round($"infection-vaccination rate", 8))
        .withColumn("vaccination Per Population", round($"vaccination Per Population", 8))
        .as[(String, java.sql.Timestamp, Double, Double, Int, Double, Double, Option[Double])]
}

## 2. Consulta con DataFrame

In [None]:
/**
 * Untyped (DataFrame) view over the COVID-19 infections CSV.
 *
 * No schema inference is requested, so columns keep the reader's default
 * types. `ObservationDate` is converted to a date and renamed to `date`;
 * "Country/Region" is renamed to `CountryInfection`.
 */
def infectionsDF = {
    val rawInfections = spark.read
        .option("header", "true")
        .csv("../datasets/covid_19_data.csv")

    // Swap "/" for "-" and parse in one expression.
    val observationDay =
        to_date(translate($"ObservationDate", "/", "-"), "MM-dd-yyyy")

    rawInfections
        .withColumn("ObservationDate", observationDay)
        .withColumnRenamed("ObservationDate", "date")
        .withColumnRenamed("Country/Region", "CountryInfection")
}

In [None]:
/**
 * Untyped (DataFrame) view over the 2020 population-by-country CSV, with
 * "Country (or dependency)" renamed to `Country` and "Population (2020)"
 * renamed to `Population`.
 */
def populationDF = {
    val rawPopulation = spark.read
        .option("header", "true")
        .csv("../datasets/population_by_country_2020.csv")

    rawPopulation
        .withColumnRenamed("Country (or dependency)", "Country")
        .withColumnRenamed("Population (2020)", "Population")
}

In [None]:
/**
 * Untyped (DataFrame) view over the per-country vaccinations CSV, with `date`
 * renamed to `dateVaccinated` and `country` renamed to `CountryVaccination`.
 * The date column is not converted here; it keeps whatever type the reader
 * assigns.
 */
def vaccinationsDF = {
    val rawVaccinations = spark.read
        .option("header", "true")
        .csv("../datasets/country_vaccinations.csv")

    rawVaccinations
        .withColumnRenamed("date", "dateVaccinated")
        .withColumnRenamed("country", "CountryVaccination")
}

In [None]:
/**
 * Untyped (DataFrame) counterpart of `megaDS`: infections joined with
 * vaccinations (same day, same country) and with population, producing
 * per-country, per-day ratios ordered by ascending `date`:
 *   - "infection Per Population"   = confirmed / Population
 *   - "vaccination Per Population" = people_vaccinated / Population (8 decimals)
 *   - "infection-vaccination rate" = people_vaccinated / confirmed  (8 decimals)
 *
 * Null handling: the source DataFrames are read without schema inference, so
 * the measure columns are strings. The numeric `na.fill(0)` only touches
 * numeric columns and therefore left every null in place. The three measure
 * columns are now filled with the string "0", which keeps their type and makes
 * the ratios behave as in `megaDS`.
 */
def megaDF = infectionsDF
    .join(
        vaccinationsDF,$"date" === $"dateVaccinated"
        && $"CountryInfection" === $"CountryVaccination"
    ).join(populationDF,$"Country" === $"CountryInfection")
        // String fill restricted to the columns used in the arithmetic below.
        .na.fill("0", Seq("confirmed", "people_vaccinated", "Population"))
        .select($"CountryInfection",
                $"date",
                $"confirmed",
                $"people_vaccinated",
                $"Population",
                $"confirmed" / $"Population" as "infection Per Population",
                $"people_vaccinated"/ $"Population" as "vaccination Per Population",
                $"people_vaccinated" / $"confirmed" as "infection-vaccination rate")
        .orderBy($"date".asc)
        .withColumn("infection-vaccination rate", round($"infection-vaccination rate",8))
        .withColumn("vaccination Per Population", round($"vaccination Per Population",8))

## 3. Visualización de rendimiento

In [None]:
// Run a full collect of each variant through `runWithOutput` (defined in the
// `common` script; presumably it returns a measurement of the run — confirm)
// and plot one bar per variant. The DataSet run is evaluated first.
val (x, y) = {
    val dataSetRun = runWithOutput(megaDS.collect)
    val dataFrameRun = runWithOutput(megaDF.collect)

    Seq(
        ("DataSet", dataSetRun),
        ("DataFrame", dataFrameRun)
    ).unzip
}

Bar(x, y).plot()

## 4. Comparativas de rendimiento

In [None]:
// Collect the typed (Dataset) query under sparkMeasure's stage-level metrics.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure {
    megaDS.collect
}

In [None]:
// Collect the untyped (DataFrame) query under sparkMeasure's stage-level metrics.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure {
    megaDF.collect
}

## 5. Visualización de datos con plotly

### Crecimiento de la vacunación con respecto a la densidad de población

In [None]:
// Stacked area chart for Chile: scaled vaccination-per-population ratio and
// absolute number of people vaccinated, both over time.
//
// All three series come from ONE collect, so every x value is paired with the
// y values of the same row. Separate collects of a query sorted only by `date`
// give no such guarantee when several rows share a date.
val chileRows = megaDF
    // `CountryInfection` is the country column that megaDF actually selects.
    .filter($"CountryInfection" === "Chile")
    .select(
        // Plain date text (e.g. 2021-01-31) instead of Row.toString's "[...]".
        $"date".cast("string"),
        // Null ratios/counts are plotted as 0 rather than failing on `.toString`.
        coalesce($"vaccination Per Population" * 10000000, lit(0.0)),
        coalesce($"people_vaccinated".cast("double"), lit(0.0)))
    .collect
    .toList

val x = chileRows.map(_.getString(0))

val y = chileRows.map(_.getDouble(1))

// Both traces share the same time axis.
val x1 = x

val y1 = chileRows.map(_.getDouble(2))

val data = Seq(
        Scatter(x,y).withName("Population vaccinated"),
        Scatter(x1,y1).withName("Vaccines administrated"))
    .map(_.withFill(Fill.ToNextY)
    .withStackgroup("A"))

val myLayout =
    Layout()
    .withTitle("CHILE")

plot(data,myLayout)