In [3]:
import $file.common
import spark._
import common._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import spark.implicits._
import spark.sqlContext.implicits._

[32mimport [39m[36m$file.$     
[39m
[32mimport [39m[36mspark._
[39m
[32mimport [39m[36mcommon._
[39m
[32mimport [39m[36morg.apache.spark.sql.functions._
[39m
[32mimport [39m[36morg.apache.spark.sql.types.{IntegerType, StringType, StructType}
[39m
[32mimport [39m[36mspark.implicits._
[39m
[32mimport [39m[36mspark.sqlContext.implicits._[39m

In [4]:
import plotly._
import plotly.element._
import plotly.layout._
import plotly.Almond._

[32mimport [39m[36mplotly._
[39m
[32mimport [39m[36mplotly.element._
[39m
[32mimport [39m[36mplotly.layout._
[39m
[32mimport [39m[36mplotly.Almond._[39m

## Consulta con DataSet

In [5]:
/**
 * Typed view of the worldwide COVID CSV as a Dataset of 12-field tuples.
 *
 * Declared as a `def`, so every call builds a fresh read of
 * "../datasets/covidworldwide.csv" (header row, schema inference enabled).
 * The "countriesAndTerritories" column is renamed to "Country" before the
 * tuple encoder is applied.
 */
def infectionDS = {
  // Same reader settings as the chained .option(...) form, gathered in one map.
  val csvOptions = Map(
    "header" -> "true",
    "charset" -> "UTF8",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  )
  val rawCovid = spark.read.options(csvOptions).csv("../datasets/covidworldwide.csv")

  rawCovid
    .withColumnRenamed("countriesAndTerritories", "Country")
    .as[(String,String,String,String,Double,Double,String,String,String,String,String,String)]
}

defined [32mfunction[39m [36minfectionDS[39m

In [6]:
/**
 * Typed view of the 2020 population CSV as a Dataset of 11-field tuples.
 *
 * Declared as a `def`, so every call builds a fresh read of
 * "../datasets/population_by_country_2020.csv". Two columns are renamed to
 * shorter names ("Country", "Population") before the tuple encoder is applied.
 */
def populationDS = {
  val reader = spark.read
    .format("csv")
    .option("header", "true")
    .option("charset", "UTF8")
    .option("delimiter",",")
    .option("inferSchema", "true")
  val rawPopulation = reader.load("../datasets/population_by_country_2020.csv")

  // Rename the verbose source headers so later queries can use short names.
  val renamed = rawPopulation
    .withColumnRenamed("Country (or dependency)","Country")
    .withColumnRenamed("Population (2020)","Population")

  renamed.as[(String,Float,String,Float,Float,Float,Double,String,String,String,String)]
}

defined [32mfunction[39m [36mpopulationDS[39m

### Media diaria de infecciones por habitante

In [7]:
/**
 * Average of the daily "cases / Population" ratio per country, highest first.
 *
 * Joins the infection and population Datasets on "Country", computes the
 * per-row ratio, averages it per country and returns (country, average) pairs.
 */
def meanInfectionPerPopulationDS = {
  val joined = infectionDS.join(populationDS, "Country")

  // Per-row projection; only "infection Per Population" feeds the aggregation below.
  val perRow = joined.select(
    col("Country"),
    col("dateRep").as("date"),
    col("cases"),
    col("Land Area (Km\u00b2)"),
    (col("cases") / col("Population")).as("infection Per Population")
  )

  val averaged = perRow.groupBy("country").avg("infection Per Population")

  averaged
    .orderBy(desc("avg(infection Per Population)"))
    .as[(String,Double)]
}

defined [32mfunction[39m [36mmeanInfectionPerPopulationDS[39m

### Porcentaje diario de infectados

In [8]:
/**
 * Daily "cases / Population" ratio per country, in chronological order.
 *
 * Returns tuples of (country, date string, cases, land area, ratio). The
 * "date" column keeps the original "dateRep" text; only the sort key is parsed.
 */
def diaryInfectionPerPopulationDS = 
infectionDS.join(populationDS, "Country")
        .select($"Country",
                $"dateRep" as "date",
                $"cases",
                $"Land Area (Km\u00b2)",
               $"cases" / $"Population" as "infection Per Population")
        // "date" is day-first text (the DataFrame cells of this notebook parse the same
        // column as dd-MM-yyyy after replacing "/"), so sorting the raw string would
        // order by day-of-month rather than by calendar date. Sort on the parsed date.
        .orderBy(to_date($"date", "dd/MM/yyyy"))
        .as[(String,String,Int,String,Double)]

defined [32mfunction[39m [36mdiaryInfectionPerPopulationDS[39m

## Consulta con DataFrame

In [9]:
/**
 * Untyped DataFrame over "../datasets/covidworldwide.csv".
 *
 * Declared as a `def`, so each call builds a fresh read with a header row
 * and schema inference enabled.
 */
def dfCovid = {
  val csvOptions = Map(
    "header" -> "true",
    "charset" -> "UTF8",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  )
  spark.read.options(csvOptions).csv("../datasets/covidworldwide.csv")
}

defined [32mfunction[39m [36mdfCovid[39m

In [10]:
/**
 * Untyped DataFrame over "../datasets/population_by_country_2020.csv".
 *
 * Declared as a `def`, so each call builds a fresh read. The country and
 * population columns are renamed to "Country" and "Population".
 */
def dfPopulation = {
  val rawPopulation = spark.read
    .format("csv")
    .option("header", "true")
    .option("charset", "UTF8")
    .option("delimiter",",")
    .option("inferSchema", "true")
    .load("../datasets/population_by_country_2020.csv")

  // Rename the verbose source headers so later queries can use short names.
  rawPopulation
    .withColumnRenamed("Country (or dependency)","Country")
    .withColumnRenamed("Population (2020)","Population")
}

defined [32mfunction[39m [36mdfPopulation[39m

Modifico los datos de entrada para que el formato de fecha se adecúe al tipo fecha (DateType) de Spark.

In [11]:
/**
 * COVID DataFrame with "dateRep" replaced by a "date" column in which every
 * "/" is turned into "-".
 *
 * The new "date" column is appended after the existing columns and the
 * original "dateRep" column is removed.
 */
def dfCovidClean = dfCovid
    // withColumn avoids projecting "dateRep" a second time only to drop it again.
    .withColumn("date", translate($"dateRep","/","-"))
    .drop("dateRep")

defined [32mfunction[39m [36mdfCovidClean[39m

In [12]:
/**
 * `dfCovidClean` plus a "to_date" column holding "date" parsed with the
 * pattern "dd-MM-yyyy".
 */
def dfCovidDate = dfCovidClean
    // Appending with withColumn keeps a single "date" column; selecting "*" together
    // with col("date") would leave two columns with that name, making later
    // references to "date" ambiguous.
    .withColumn("to_date", to_date(col("date"),"dd-MM-yyyy"))

defined [32mfunction[39m [36mdfCovidDate[39m

### Media diaria de infecciones por habitante

In [13]:
/**
 * Average of the daily "cases / Population" ratio per country, highest first
 * (DataFrame variant).
 *
 * Joins the COVID and population DataFrames on the country name, computes the
 * per-row ratio and averages it per country.
 */
def infectionsPerPopulation = {
  val joined = dfCovid.join(dfPopulation, col("country") === col("countriesAndTerritories"))

  // Per-row projection; only "infection Per Population" feeds the aggregation below.
  val perRow = joined.select(
    col("country"),
    col("dateRep").as("date"),
    col("cases"),
    col("Population"),
    (col("cases") / col("Population")).as("infection Per Population")
  )

  perRow
    .groupBy("country")
    .avg("infection Per Population")
    .orderBy(desc("avg(infection Per Population)"))
}

defined [32mfunction[39m [36minfectionsPerPopulation[39m

### Porcentaje diario de infectados

In [14]:
/**
 * Daily "cases / Population" ratio per country, ordered by the parsed
 * "to_date" column ascending (DataFrame variant).
 */
def diaryInfectionsDF = {
  val joined = dfCovidDate.join(dfPopulation, col("country") === col("countriesAndTerritories"))

  val daily = joined.select(
    col("country"),
    col("to_date"),
    col("day"),
    col("month"),
    col("cases"),
    col("Population"),
    (col("cases") / col("Population")).as("infection Per Population")
  )

  daily.orderBy(col("to_date").asc)
}

defined [32mfunction[39m [36mdiaryInfectionsDF[39m

# Visualización de eficiencia

In [15]:
// Runs the four queries in a fixed order and pairs each label with the value
// returned by runWithOutput (defined outside this cell; presumably the elapsed
// time, since the recorded output prints "Took N" and y is a Seq[Int]).
// NOTE(review): the first entry runs before anything else has touched Spark, so
// its figure may include one-off start-up cost — confirm before comparing.
val (x, y) = Seq(
    ("DataSet ", runWithOutput(meanInfectionPerPopulationDS.collect())),
    ("DataSet diary", runWithOutput(diaryInfectionPerPopulationDS.collect())),
    ("DataFrame", runWithOutput(infectionsPerPopulation.collect())),
    ("DataFrame diary", runWithOutput(diaryInfectionsDF.collect()))
).unzip

// One bar per query label.
Bar(x, y).plot()

Took 10231


Took 2886


Took 3236


Took 2788


[36mx[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m(
  [32m"DataSet "[39m,
  [32m"DataSet diary"[39m,
  [32m"DataFrame"[39m,
  [32m"DataFrame diary"[39m
)
[36my[39m: [32mSeq[39m[[32mInt[39m] = [33mList[39m([32m10230[39m, [32m2886[39m, [32m3236[39m, [32m2788[39m)
[36mres14_1[39m: [32mString[39m = [32m"plot-a1560895-edec-498d-a775-c83e03a31963"[39m

In [None]:
// Stage-level sparkMeasure metrics for the Dataset per-country average query.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure {
    meanInfectionPerPopulationDS.collect()
}

In [None]:
// Stage-level sparkMeasure metrics for the Dataset daily-ratio query.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure {
    diaryInfectionPerPopulationDS.collect()
}

In [None]:
// Stage-level sparkMeasure metrics for the DataFrame per-country average query.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure {
    infectionsPerPopulation.collect()
}

In [None]:
// Stage-level sparkMeasure metrics for the DataFrame daily-ratio query.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure {
    diaryInfectionsDF.collect()
}

# Visualización de datos con plotly

## Media de infecciones por habitante, por país

In [None]:
// Bar chart of the average "cases / Population" ratio per country.
// A tuple pattern must be bound with `val`; `def (x, y) = ...` does not parse.
// Column 0 is the country name and column 1 the average ratio, which is kept
// numeric (as in the country-comparison cell) rather than converted to text.
val (x, y) = infectionsPerPopulation
    .collect()
    .map(r => (r(0).toString, r.getDouble(1)))
    .toList
    .unzip
Bar(x, y).plot()

## porcentaje diario de infectados en España

In [None]:
// Bar chart of the daily "cases / Population" ratio for Spain.
// A tuple pattern must be bound with `val`; `def (x, y) = ...` does not parse.
// Column 1 is "to_date" (used as the x label) and column 6 is
// "infection Per Population", which is kept numeric rather than converted to text.
val (x, y) = diaryInfectionsDF
    .filter($"country" === "Spain")
    .collect()
    .map(r => (r(1).toString, r.getDouble(6)))
    .toList
    .unzip
Bar(x, y).plot()

## Comparación entre países del crecimiento de la enfermedad

In [None]:
// Collects the (date, infection ratio) series of one country with a single Spark
// action, so every x value and its y value come from the same row.
// The date is read from the column value itself: calling toString on a whole Row
// would wrap the label in brackets (e.g. "[2020-03-01]").
def countrySeries(country: String): (List[String], List[Double]) =
    diaryInfectionsDF
        .filter($"country" === country)
        .select($"to_date", $"infection Per Population")
        .collect()
        .map(r => (r(0).toString, r.getDouble(1)))
        .toList
        .unzip

val (x, y) = countrySeries("Spain")
val (x1, y1) = countrySeries("Italy")

// Both traces are filled to the next trace and placed in the same stack group "A".
val data = Seq(
    Scatter(x,y).withName("Spain"),
    Scatter(x1,y1,mode = ScatterMode(ScatterMode.Lines),
  line = Line(color = Color.StringColor("#7F7F7F"))).withName("Italy")
).map(_.withFill(Fill.ToNextY).withStackgroup("A"))

plot(data)