In [None]:
import $file.common
import spark._
import common._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import spark.implicits._
import spark.sqlContext.implicits._
import org.apache.spark.sql.functions.{col, to_date}
import org.apache.spark.sql.types.DateType

In [None]:
import plotly._
import plotly.element._
import plotly.layout._
import plotly.Almond._

## Consulta con DataSet

In [None]:
// Loads ../datasets/covid_19_data.csv (comma-delimited, with a header row and
// an inferred schema) and views it as a Dataset of 8-field tuples.
// Declared as a def, so each reference creates a new reader over the file.
def infectionsDS = spark.read
    .options(Map(
        "header" -> "true",
        "charset" -> "UTF8",
        "delimiter" -> ",",
        "inferSchema" -> "true"
    ))
    .csv("../datasets/covid_19_data.csv")
    .as[(Int,String,String,String,String,Double,Double,Double)]

In [None]:
// Loads ../datasets/population_by_country_2020.csv (comma-delimited, with a
// header row and an inferred schema), shortens the two verbose column names to
// "Country" and "Population", and views the result as a Dataset of 11-field tuples.
// Declared as a def, so each reference creates a new reader over the file.
def populationDS = spark.read
    .options(Map(
        "header" -> "true",
        "charset" -> "UTF8",
        "delimiter" -> ",",
        "inferSchema" -> "true"
    ))
    .csv("../datasets/population_by_country_2020.csv")
    .withColumnRenamed("Country (or dependency)", "Country")
    .withColumnRenamed("Population (2020)", "Population")
    .as[(String,Float,String,Float,Float,Float,Double,String,String,String,String)]

In [None]:
// Loads ../datasets/country_vaccinations.csv (comma-delimited, with a header
// row and an inferred schema) and views it as a Dataset of 15-field tuples.
// Declared as a def, so each reference creates a new reader over the file.
def vaccinationsDS = spark.read
    .options(Map(
        "header" -> "true",
        "charset" -> "UTF8",
        "delimiter" -> ",",
        "inferSchema" -> "true"
    ))
    .csv("../datasets/country_vaccinations.csv")
    .as[(String,String,java.sql.Timestamp,Double,Double,Double,Double,Double,Double,Double,Double,Double,String,String,String)]

Modifico los datos de entrada para convertir las columnas de fecha a un formato común.

In [None]:
// Vaccination records with the original "date" column replaced by a
// "dateVaccinated" column, obtained by passing "date" through to_date with the
// "MM-dd-yyyy" pattern. The new column ends up last in the schema.
// NOTE(review): vaccinationsDS types its third field as java.sql.Timestamp,
// presumably this same column — confirm the pattern is appropriate for it.
val vaccinationsClean = vaccinationsDS
    .withColumn("dateVaccinated", to_date(col("date"), "MM-dd-yyyy"))
    .drop("date")

In [None]:
// Infection records with "ObservationDate" replaced by a trailing "date"
// column: the slashes in the original value are turned into dashes and the
// result is parsed with the "MM-dd-yyyy" pattern. The outcome is viewed as a
// Dataset of 8-field tuples whose last field is the parsed date.
val dateInfectionsDS = infectionsDS
    .withColumn("date", to_date(translate($"ObservationDate", "/", "-"), "MM-dd-yyyy"))
    .drop("ObservationDate")
    .as[(Int,String,String,String,Double,Double,Double,java.sql.Timestamp)]

In [None]:
// Joins infections with vaccinations where the day matches and the country
// names are equal (null-safe comparison), then joins the population table on
// the "country" column.
def tripleJoin = {
    val sameDay = $"date" === $"dateVaccinated"
    val sameCountry = dateInfectionsDS("Country/Region") <=> vaccinationsClean("country")

    dateInfectionsDS
        .join(vaccinationsClean, sameDay && sameCountry)
        .join(populationDS, "country")
}

In [None]:
// Dataset-based pipeline: infections joined with vaccinations (same day, same
// country) and with population (by country), reduced to the columns needed for
// the analysis plus three derived ratios, ordered by date.
val megaDS = {
    val sameDay = $"date" === $"dateVaccinated"
    val sameCountry = dateInfectionsDS("Country/Region") <=> vaccinationsClean("country")

    dateInfectionsDS
        .join(vaccinationsClean, sameDay && sameCountry)
        .join(populationDS, "country")
        // Nulls in numeric columns are replaced by 0 before the ratios are computed.
        .na.fill(0)
        .select(
            $"country",
            $"date",
            $"confirmed",
            $"people_vaccinated",
            $"Population",
            ($"confirmed" / $"Population").as("infection Per Population"),
            ($"people_vaccinated" / $"Population").as("vaccination Per Population"),
            ($"people_vaccinated" / $"confirmed").as("infection-vaccination rate")
        )
        .orderBy($"date".asc)
        // Two of the derived ratios are rounded to 8 decimal places.
        .withColumn("infection-vaccination rate", round($"infection-vaccination rate", 8))
        .withColumn("vaccination Per Population", round($"vaccination Per Population", 8))
    // Typed view intentionally left disabled:
    //   .as[(String,java.sql.Timestamp,Double,Double,Int,Double,Double,Double)]
}

## Consulta con DataFrame

In [None]:
// Loads ../datasets/covid_19_data.csv (comma-delimited, with a header row and
// an inferred schema) as an untyped DataFrame.
// Declared as a def, so each reference creates a new reader over the file.
def infectionsDF = spark.read
    .options(Map(
        "header" -> "true",
        "charset" -> "UTF8",
        "delimiter" -> ",",
        "inferSchema" -> "true"
    ))
    .csv("../datasets/covid_19_data.csv")

In [None]:
// Loads ../datasets/population_by_country_2020.csv (comma-delimited, with a
// header row and an inferred schema) as an untyped DataFrame and shortens the
// two verbose column names to "Country" and "Population".
// Declared as a def, so each reference creates a new reader over the file.
def populationDF = spark.read
    .options(Map(
        "header" -> "true",
        "charset" -> "UTF8",
        "delimiter" -> ",",
        "inferSchema" -> "true"
    ))
    .csv("../datasets/population_by_country_2020.csv")
    .withColumnRenamed("Country (or dependency)", "Country")
    .withColumnRenamed("Population (2020)", "Population")

In [None]:
// Loads ../datasets/country_vaccinations.csv (comma-delimited, with a header
// row and an inferred schema) as an untyped DataFrame.
// Declared as a def, so each reference creates a new reader over the file.
def vaccinationsDF = spark.read
    .options(Map(
        "header" -> "true",
        "charset" -> "UTF8",
        "delimiter" -> ",",
        "inferSchema" -> "true"
    ))
    .csv("../datasets/country_vaccinations.csv")

In [None]:
// DataFrame variant of the vaccination clean-up: the original "date" column is
// replaced by a trailing "dateVaccinated" column, obtained by passing "date"
// through to_date with the "MM-dd-yyyy" pattern.
val vaccinationsClean = vaccinationsDF
    .withColumn("dateVaccinated", to_date(col("date"), "MM-dd-yyyy"))
    .drop("date")

In [None]:
// DataFrame variant of the infection date clean-up: "ObservationDate" is
// replaced by a trailing "date" column, produced by turning the slashes into
// dashes and parsing the result with the "MM-dd-yyyy" pattern.
//
// The source is infectionsDF (the untyped loader of this section). The previous
// version read from infectionsDS, the typed Dataset loader, which left
// infectionsDF unused and made the DataFrame pipeline depend on the Dataset one.
// The name is kept as-is because megaDF refers to it.
val dateInfectionsDS = infectionsDF
    .withColumn("date", to_date(translate($"ObservationDate", "/", "-"), "MM-dd-yyyy"))
    .drop("ObservationDate")

In [None]:
// DataFrame-based pipeline: infections joined with vaccinations (same day,
// same country, null-safe) and with population (by country), reduced to the
// columns needed for the analysis plus three derived ratios, ordered by date.
//
// Declared as a val, like its counterpart megaDS, so the join is assembled once.
// As a def, every reference rebuilt the plan and called populationDF again
// (a new CSV reader with inferSchema), which the Dataset pipeline did not do.
//
// NOTE(review): megaDS applies na.fill(0) before the select, whereas here it is
// applied after it; the two pipelines can therefore differ where inputs are
// null — confirm which placement is intended.
val megaDF = {
    val sameDay = $"date" === $"dateVaccinated"
    val sameCountry = dateInfectionsDS("Country/Region") <=> vaccinationsClean("country")

    dateInfectionsDS
        .join(vaccinationsClean, sameDay && sameCountry)
        .join(populationDF, "country")
        .select(
            $"country",
            $"date",
            $"confirmed",
            $"people_vaccinated",
            $"Population",
            ($"confirmed" / $"Population").as("infection Per Population"),
            ($"people_vaccinated" / $"Population").as("vaccination Per Population"),
            ($"people_vaccinated" / $"confirmed").as("infection-vaccination rate")
        )
        .orderBy($"date".asc)
        // Nulls in numeric columns (including the derived ratios) become 0.
        .na.fill(0)
        // Two of the derived ratios are rounded to 8 decimal places.
        .withColumn("infection-vaccination rate", round($"infection-vaccination rate", 8))
        .withColumn("vaccination Per Population", round($"vaccination Per Population", 8))
}

# Visualización de eficiencia

In [None]:
// Collects both pipelines through runWithOutput (imported from the common
// script; presumably it returns a plottable measurement — confirm there) and
// shows the two results side by side as a bar chart.
val (x, y) = Seq(
    ("DataSet", runWithOutput(megaDS.collect)),
    ("DataFrame", runWithOutput(megaDF.collect))
).unzip

Bar(x, y).plot()

In [None]:
// Stage-level metrics (sparkMeasure) for collecting the Dataset pipeline.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure { megaDS.collect }

In [None]:
// Stage-level metrics (sparkMeasure) for collecting the DataFrame pipeline.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure { megaDF.collect }

# Visualización de datos con plotly

## Crecimiento de la vacunación con respecto a la densidad de población

In [None]:
// Stacked area chart for Chile: scaled vaccination-per-population ratio and
// number of people vaccinated, both over time.
//
// The Chile rows are collected once and every series is derived from that
// single result. Collecting each series separately ran the whole pipeline four
// times and gave no guarantee that rows sharing a date came back in the same
// order for every series.
val chileRows = megaDF
    .filter($"country" === "Chile")
    .select($"date", $"vaccination Per Population" * 10000000, $"people_vaccinated")
    .collect
    .toList

// Use the date value itself; Row.toString would wrap it in brackets
// (e.g. "[2021-01-10]"), which is not a usable axis label.
val x = chileRows.map(row => String.valueOf(row.get(0)))
val y = chileRows.map(row => row(1).toString.toDouble)

// Both series share the same dates.
val x1 = x
val y1 = chileRows.map(row => row(2).toString.toDouble)

val data = Seq(
    Scatter(x, y).withName("% population"),
    Scatter(x1, y1).withName("Vaccines administrated")
).map(_.withFill(Fill.ToNextY).withStackgroup("A"))

val myLayout =
  Layout()
    .withTitle("CHILE")

plot(data, myLayout)