In [None]:
import $file.common
import spark._
import common._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

In [None]:
import spark.implicits._
import spark.sqlContext.implicits._
import org.apache.spark.sql._
import org.apache.spark.sql.{functions => func, _}
import org.apache.spark.sql.types._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark._
import org.apache.spark.sql.types._, func._
import org.apache.spark.sql.functions.{col, to_date}
import plotly._
import plotly.element._
import plotly.layout._
import plotly.Almond._

## Consulta con DataSet

In [None]:
// COVID-19 case records read from CSV into a typed Dataset.
// The first line of the file is the header and column types are inferred by Spark;
// "countriesAndTerritories" is renamed to "Country" and rows are viewed as a 12-field tuple.
val infectionDS = spark.read
  .options(Map(
    "header" -> "true",
    "charset" -> "UTF8",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  ))
  .csv("../datasets/covidworldwide.csv")
  .withColumnRenamed("countriesAndTerritories", "Country")
  .as[(String, String, String, String, Double, Double, String, String, String, String, String, String)]

In [None]:
// Population-by-country records read from CSV into a typed Dataset.
// The two headers containing parentheses are renamed to the plain names "Country" and
// "Population"; rows are viewed as an 11-field tuple.
val populationDS = spark.read
  .options(Map(
    "header" -> "true",
    "charset" -> "UTF8",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  ))
  .csv("../datasets/population_by_country_2020.csv")
  .withColumnRenamed("Country (or dependency)", "Country")
  .withColumnRenamed("Population (2020)", "Population")
  .as[(String, Float, String, Float, Float, Float, Double, String, String, String, String)]

### Media diaria de infecciones por número de habitante

In [None]:
// Per-country average of the ratio cases / Population over the joined rows,
// sorted from the highest average to the lowest and typed as (country, average).
val meanInfectionPerPopulationDS =
  infectionDS
    .join(populationDS, "Country")
    .select(
      $"Country",
      $"dateRep".as("date"),
      $"cases",
      $"Land Area (Km\u00b2)",
      ($"cases" / $"Population").as("infection Per Population")
    )
    .groupBy("country")
    .avg("infection Per Population")
    .orderBy($"avg(infection Per Population)".desc)
    .as[(String, Double)]

In [None]:
// Collects the per-country average Dataset inside sparkMeasure's StageMetrics.runAndMeasure.
ch.cern.sparkmeasure.StageMetrics(spark)
  .runAndMeasure(meanInfectionPerPopulationDS.collect)

### Porcentaje diario de infectados

In [None]:
// Daily ratio cases / Population for every country, in chronological order.
// "date" keeps the original dd/MM/yyyy text of dateRep, so sorting on the text itself
// would order rows by day-of-month first (01/02/2020 before 31/12/2019). The sort key is
// therefore the parsed date; the output columns and their types are unchanged.
val diaryInfectionPerPopulationDS =
  infectionDS
    .join(populationDS, "Country")
    .select(
      $"Country",
      $"dateRep".as("date"),
      $"cases",
      $"Land Area (Km\u00b2)",
      ($"cases" / $"Population").as("infection Per Population")
    )
    .orderBy(to_date($"date", "dd/MM/yyyy"))
    .as[(String, String, Int, String, Double)]

In [None]:
// Collects the daily-ratio Dataset inside sparkMeasure's StageMetrics.runAndMeasure.
ch.cern.sparkmeasure.StageMetrics(spark)
  .runAndMeasure(diaryInfectionPerPopulationDS.collect)

## Consulta con DataFrame

In [None]:
// COVID-19 case records read from CSV as an untyped DataFrame.
// The first line of the file is the header and column types are inferred by Spark.
val dfCovid = spark.read
  .options(Map(
    "header" -> "true",
    "charset" -> "UTF8",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  ))
  .csv("../datasets/covidworldwide.csv")

In [None]:
// Population-by-country records read from CSV as an untyped DataFrame, with the two
// headers containing parentheses renamed to "Country" and "Population".
val dfPopulation = spark.read
  .options(Map(
    "header" -> "true",
    "charset" -> "UTF8",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  ))
  .csv("../datasets/population_by_country_2020.csv")
  .withColumnRenamed("Country (or dependency)", "Country")
  .withColumnRenamed("Population (2020)", "Population")

// Render the rows, then leave the schema as the value of the cell.
dfPopulation.showHTML()
dfPopulation.schema

Modifico los datos de entrada para que el formato de fecha se adecúe al que espera `to_date` de Spark (`dd-MM-yyyy`).

In [None]:
// Normalises the report date: the "/" separators of dateRep become "-" in a new "date"
// column, and the original dateRep column is removed.
// withColumn appends the derived column directly, instead of selecting dateRep a second
// time next to "*" and relying on drop to remove both copies.
val dfCovidClean = dfCovid
    .withColumn("date", translate($"dateRep", "/", "-"))
    .drop("dateRep")

In [None]:
// Parses the "dd-MM-yyyy" text held in "date" into a date column named "to_date".
// withColumn keeps a single "date" column; selecting "*" together with col("date")
// duplicated it, which makes any later reference to "date" ambiguous.
val dfCovidDate = dfCovidClean
    .withColumn("to_date", to_date(col("date"), "dd-MM-yyyy"))

### Media diaria de infecciones por número de habitante

In [None]:
// Per-country average of the ratio cases / Population, computed on DataFrames.
// Rows are matched where the population table's country equals countriesAndTerritories,
// and the result is sorted from the highest average to the lowest.
val infectionsPerPopulation =
  dfCovid
    .join(dfPopulation, $"country" === $"countriesAndTerritories")
    .select(
      $"country",
      $"dateRep".as("date"),
      $"cases",
      $"Population",
      ($"cases" / $"Population").as("infection Per Population")
    )
    .groupBy("country")
    .avg("infection Per Population")
    .orderBy($"avg(infection Per Population)".desc)

In [None]:
// Collects the per-country average DataFrame inside sparkMeasure's StageMetrics.runAndMeasure.
ch.cern.sparkmeasure.StageMetrics(spark)
  .runAndMeasure(infectionsPerPopulation.collect)

### Porcentaje diario de infectados

In [None]:
// Daily ratio cases / Population per country, computed on DataFrames and sorted by the
// parsed report date in ascending order.
val diaryInfectionsDF =
  dfCovidDate
    .join(dfPopulation, $"country" === $"countriesAndTerritories")
    .select(
      $"country",
      $"to_date",
      $"day",
      $"month",
      $"cases",
      $"Population",
      ($"cases" / $"Population").as("infection Per Population")
    )
    .orderBy(asc("to_date"))

In [None]:
// Collects the daily-ratio DataFrame inside sparkMeasure's StageMetrics.runAndMeasure.
ch.cern.sparkmeasure.StageMetrics(spark)
  .runAndMeasure(diaryInfectionsDF.collect)

# Visualización de datos con plotly

## Media de infecciones por habitante

In [None]:
// Bar chart of the per-country averages: column 0 (country) on x, column 1 (average) on y,
// both passed to plotly as their string form.
val (x, y) = infectionsPerPopulation
  .collect
  .toList
  .map(row => (row(0).toString, row(1).toString))
  .unzip

Bar(x, y).plot()

## porcentaje diario de infectados en España

In [None]:
// Bar chart of Spain's daily ratio: column 1 (to_date) on x and column 6
// (infection Per Population) on y, both passed to plotly as their string form.
val (x, y) = diaryInfectionsDF
  .filter($"country" === "Spain")
  .collect
  .toList
  .map(row => (row(1).toString, row(6).toString))
  .unzip

Bar(x, y).plot()

## Comparación entre países del crecimiento de la enfermedad

In [None]:
// Stacked area comparison of the daily infected ratio for Spain and Italy.
//
// For each country the date and the ratio are read from the same collected row, so every
// x value stays paired with its y value. The date label is taken from the cell itself:
// calling toString on the whole Row yields a bracketed label such as "[2020-03-01]".

// Spain: (date label, ratio) per day.
val (x, y) = diaryInfectionsDF
    .filter($"country" === "Spain")
    .select($"to_date", $"infection Per Population")
    .collect
    .toList
    .map(row => (row(0).toString, row(1).toString.toDouble))
    .unzip

// Italy: (date label, ratio) per day.
val (x1, y1) = diaryInfectionsDF
    .filter($"country" === "Italy")
    .select($"to_date", $"infection Per Population")
    .collect
    .toList
    .map(row => (row(0).toString, row(1).toString.toDouble))
    .unzip

// Both traces are filled to the next trace and placed in the same stack group.
val data = Seq(
    Scatter(x, y).withName("Spain"),
    Scatter(
        x1,
        y1,
        mode = ScatterMode(ScatterMode.Lines),
        line = Line(color = Color.StringColor("#7F7F7F"))
    ).withName("Italy")
).map(_.withFill(Fill.ToNextY).withStackgroup("A"))

plot(data)

# Visualización de eficiencia

In [None]:
// Passes the collect of each of the four queries to runWithOutput, in the order listed,
// and draws one bar per label with the value runWithOutput returned.
val (x, y) = Seq(
  ("DataSet", runWithOutput(meanInfectionPerPopulationDS.collect)),
  ("DataSet2", runWithOutput(diaryInfectionPerPopulationDS.collect)),
  ("DataFrame", runWithOutput(infectionsPerPopulation.collect)),
  ("DataFrame2", runWithOutput(diaryInfectionsDF.collect))
).unzip

Bar(x, y).plot()