In [0]:
import $file.common
import spark._
import common._
import org.apache.spark.sql.functions._
import org.apache.spark.rdd._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.apache.spark.sql.functions.{col, to_date}
import spark.implicits._
import spark.sqlContext.implicits._

Cannot resolve $file import: /home/jovyan/work/common.sc

: 

In [None]:
import plotly._
import plotly.element._
import plotly.layout._
import plotly.Almond._

# Media de infecciones por Km2 

### Utilizando RDDs

In [None]:
/** Turns raw comma-separated lines into [[Infection]] records.
  *
  * Fields are picked by position after splitting on ",":
  * 1, 2, 3 = day, month, year; 4 = cases; 5 = deaths; 6 = country; 10 = continent.
  * A line with a non-numeric value in positions 1-5 makes `toInt` throw.
  *
  * @param lines text lines, one record per line
  * @return one Infection per input line
  */
def infections(lines : RDD[String]) : RDD[Infection] =
  lines.map { row =>
    val fields = row.split(",")
    Infection(
      day = fields(1).toInt,
      month = fields(2).toInt,
      year = fields(3).toInt,
      nCases = fields(4).toInt,
      nDeaths = fields(5).toInt,
      country = fields(6),
      continent = fields(10)
    )
  }

In [None]:
// Infection records parsed from the raw CSV file.
// Being a `def`, the RDD is rebuilt each time the name is referenced.
def infectionRDD: RDD[Infection] = {
  val rawLines = spark.sparkContext.textFile("../datasets/data.csv")
  infections(rawLines)
}

In [None]:
// Register this notebook cell's wrapper instance as an outer scope for Spark encoders.
// NOTE(review): presumably needed so classes declared in a cell (such as Population below)
// can be instantiated by Spark when decoding — confirm against the Spark OuterScopes docs.
org.apache.spark.sql.catalyst.encoders.OuterScopes.addOuterScope(this)
/** One row of the country population dataset.
  *
  * Case classes are already serializable, so no explicit `Serializable` parent is needed.
  *
  * @param country    country name
  * @param population number of inhabitants
  * @param density    population density (presumably inhabitants per km2 — verify against the CSV)
  * @param land_area  land area (presumably in km2 — verify against the CSV)
  */
case class Population(
  country: String,
  population: Int,
  density: Int,
  land_area: Int
)

In [None]:
// Raw text lines of the population CSV (header line included).
def populationData: RDD[String] =
  spark.sparkContext.textFile("../datasets/population_by_country_2020.csv")

In [None]:
/** Parses the population CSV lines into [[Population]] records.
  *
  * The first element of partition 0 is discarded (the header line, assuming the
  * file starts with one). Remaining lines are split on "," and read by position:
  * 0 = country, 1 = population, 4 = density, 5 = land area.
  *
  * @param lines raw CSV lines
  * @return one Population per data line
  */
def population(lines : RDD[String]) : RDD[Population] = {
  val withoutHeader = lines.mapPartitionsWithIndex(
    (partitionId, rows) => if (partitionId == 0) rows.drop(1) else rows,
    preservesPartitioning = true
  )

  withoutHeader.map { row =>
    val fields = row.split(",")
    Population(
      country = fields(0),
      population = fields(1).toInt,
      density = fields(4).toInt,
      land_area = fields(5).toInt
    )
  }
}

In [None]:
// Population records parsed from the population CSV; rebuilt on every reference.
def populationRDD: RDD[Population] = {
  val rawLines = populationData
  population(rawLines)
}

### Un join computacionalmente pesado desde el principio, ya que cruza todos los datos sin quedarse solo con los que nos interesan

Spark no permite hacer un join entre RDDs que no sean Pair RDDs, así que primero tenemos que construirlos

In [None]:
// Kept commented out on purpose: neither RDD here is a key/value (pair) RDD.
// NOTE(review): presumably this line does not compile as written, since `join` is only
// offered on pair RDDs — confirm.
// populationRDD.join(infectionRDD)

Construyo Pair RDDs conservando todos los datos

In [None]:
// Pair RDD of (country, full Population record), ready to be joined by country.
def populationByCountry: RDD[(String, Population)] =
  populationRDD.keyBy(_.country)

// Pair RDD of (country, full Infection record), ready to be joined by country.
def infectionByCountry =
  infectionRDD.keyBy(_.country)

Hago el join y agrupo por países

In [None]:
// Every (Infection, Population) pair sharing a country, gathered per country.
def joinedRDD = {
  val infectionWithPopulation = infectionByCountry.join(populationByCountry)
  infectionWithPopulation.groupByKey()
}

Finalmente calculo la media

In [None]:
// Per country: cases / land area for each joined record, then the arithmetic mean.
joinedRDD
  .mapValues { pairs =>
    pairs.map { case (infection, pop) => infection.nCases.toFloat / pop.land_area.toFloat }
  }
  .mapValues(ratios => ratios.sum / ratios.size)

Lo hago todo en una única operación para calcular el tiempo de ejecución

In [None]:
// Baseline ("not optimized") pipeline: joins the full Infection and Population records
// by country, groups the joined pairs per country and averages cases / land area.
//
// Declared as a `def`, like the other benchmarked pipelines in this notebook, so that
// every timing run builds a fresh RDD lineage. With a `val`, repeated collects hit the
// same RDD instance and Spark can reuse its shuffle output, which skews the comparison.
def notOptimizedRDD =
  infectionByCountry
    .join(populationByCountry)
    .groupByKey()
    .mapValues { pairs =>
      // One ratio per joined record (i.e. per infection row of that country)
      pairs.map { case (infection, pop) => infection.nCases.toFloat / pop.land_area.toFloat }
    }
    .mapValues(ratios => ratios.sum / ratios.size)

#### Para optimizar un poco esta consulta:

Despejo solo los datos que me interesan para trabajar con Pair RDDs y optimizar la consulta

In [None]:
// Pair RDD keeping only what the query needs from the population data: (country, land area).
def countriesAndLandArea: RDD[(String, Int)] =
  populationRDD.map(pop => pop.country -> pop.land_area)

In [None]:
// Pair RDD of (country, all reported case counts for that country).
def countriesAndCases = {
  val casesByCountry = infectionRDD.map(infection => infection.country -> infection.nCases)
  casesByCountry.groupByKey()
}

Ejecuto un join y calculo primero las infecciones diarias por km² de cada país,
para luego calcular la media total

In [None]:
// (country, (case counts, land area)): grouped cases joined with the country's land area.
def average = {
  val landAreaByCountry = countriesAndLandArea
  countriesAndCases.join(landAreaByCountry)
}

In [None]:
// Per country: divide every case count by the land area, then take the arithmetic mean.
average
  .mapValues { case (caseCounts, landArea) =>
    caseCounts.map(cases => cases.toFloat / landArea.toFloat)
  }
  .mapValues(ratios => ratios.sum / ratios.size)

Lo hago todo en una única operación para calcular el tiempo de ejecución

In [None]:
// Optimized RDD pipeline in one expression (used for timing): only (country, cases) and
// (country, land area) pairs are joined, then cases / land area is averaged per country.
def meanInfectionsRDD =
  countriesAndCases
    .join(countriesAndLandArea)
    .mapValues { case (caseCounts, landArea) =>
      caseCounts.map(cases => cases.toDouble / landArea.toDouble)
    }
    .mapValues(ratios => ratios.sum / ratios.size)

## Consulta con DataSet

In [None]:
// Typed Dataset over the COVID CSV: header row used for column names, schema inferred,
// and the country column renamed to "Country" so it can be joined by name.
def infectionDS = {
  val readerOptions = Map(
    "header" -> "true",
    "charset" -> "UTF8",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  )

  spark.read
    .options(readerOptions)
    .csv("../datasets/covidworldwide.csv")
    .withColumnRenamed("countriesAndTerritories", "Country")
    .as[(String,String,String,String,Double,Double,String,String,String,String,String,String)]
}

In [19]:
// Typed Dataset over the population CSV: header row used for column names, schema inferred,
// with the country and population columns renamed to short names.
def populationDS = {
  val rawPopulation = spark.read
    .options(Map(
      "header" -> "true",
      "charset" -> "UTF8",
      "delimiter" -> ",",
      "inferSchema" -> "true"
    ))
    .csv("../datasets/population_by_country_2020.csv")

  rawPopulation
    .withColumnRenamed("Country (or dependency)", "Country")
    .withColumnRenamed("Population (2020)", "Population")
    .as[(String,Float,String,Float,Float,Float,Double,String,String,String,String)]
}

defined [32mfunction[39m [36mpopulationDS[39m

In [20]:
// Dataset version of the query: mean of (cases / land area) per country,
// rounded to 10 decimals and sorted from highest to lowest.
def meanInfectionsperKM2DS = {
  val casesWithLandArea = infectionDS.join(populationDS, "Country")

  // One row per report, with the ratio already computed
  val ratioPerReport = casesWithLandArea.select(
    $"Country",
    $"dateRep".as("date"),
    $"cases",
    $"Land Area (Km\u00b2)",
    ($"cases" / $"Land Area (Km\u00b2)").as("infection Per Km\u00b2")
  )

  // The aggregate is not aliased, so the sort refers to Spark's generated column name
  ratioPerReport
    .groupBy("Country")
    .agg(round(avg("infection Per Km\u00b2"), 10).as[Float])
    .orderBy(desc("round(avg(infection Per Km²), 10)"))
    .as[(String,Double)]
}

defined [32mfunction[39m [36mmeanInfectionsperKM2DS[39m

## Consulta con DataFrame

In [21]:
// Untyped DataFrame over the COVID CSV: header row used for column names, schema inferred.
def dfCovid = {
  val csvReader = spark.read.options(Map(
    "header" -> "true",
    "charset" -> "UTF8",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  ))
  csvReader.csv("../datasets/covidworldwide.csv")
}

defined [32mfunction[39m [36mdfCovid[39m

In [22]:
// Untyped DataFrame over the population CSV, with the country and population
// columns renamed to "Country" and "Population".
def dfPopulation = {
  val rawPopulation = spark.read
    .options(Map(
      "header" -> "true",
      "charset" -> "UTF8",
      "delimiter" -> ",",
      "inferSchema" -> "true"
    ))
    .csv("../datasets/population_by_country_2020.csv")

  rawPopulation
    .withColumnRenamed("Country (or dependency)", "Country")
    .withColumnRenamed("Population (2020)", "Population")
}

defined [32mfunction[39m [36mdfPopulation[39m

Modifico los datos de entrada para que el formato de fecha se adecúe al TimeStamp de Spark

In [23]:
// COVID DataFrame where `dateRep` is replaced by a `date` column holding the same
// text with every "/" turned into "-".
//
// `$"*"` already contains `dateRep`, so it is not selected a second time: doing so
// only creates a duplicate column that the `drop` below has to remove again.
def dfCovidClean = dfCovid
    .select($"*", translate($"dateRep","/","-").as("date"))
    .drop("dateRep")

defined [32mfunction[39m [36mdfCovidClean[39m

In [24]:
// Adds a `to_date` column with the textual `date` column parsed as "dd-MM-yyyy".
//
// `$"*"` already contains `date`; selecting it again would leave two columns named
// `date` in the result and make any later reference to `date` ambiguous.
def dfCovidDate = dfCovidClean
    .select($"*", to_date(col("date"),"dd-MM-yyyy").as("to_date"))

defined [32mfunction[39m [36mdfCovidDate[39m

Hago una consulta de prueba para obtener la media solo de los casos en España

In [25]:
// Trial query: average of the `cases` column over the rows reported for Spain.
def spainCovid = {
  val dateAndCases = dfCovid.select("dateRep", "cases")
  val spainOnly = dateAndCases.where("countriesAndTerritories == 'Spain'").toDF
  spainOnly.agg(avg("cases"))
}

defined [32mfunction[39m [36mspainCovid[39m

### Finalmente ejecuto la consulta de nuestro caso de uso, infecciones por Km2

In [26]:
// DataFrame version of the query: mean of (cases / land area) per country,
// sorted from highest to lowest.
def meanInfectionsperKM2DF = {
  val casesWithLandArea =
    dfCovid.join(dfPopulation, $"country" === $"countriesAndTerritories")

  // One row per report, with the ratio already computed
  val ratioPerReport = casesWithLandArea.select(
    $"country",
    $"dateRep".as("date"),
    $"cases",
    $"Land Area (Km\u00b2)",
    ($"cases" / $"Land Area (Km\u00b2)").as("infection Per Km\u00b2")
  )

  // The aggregate is not aliased, so the sort refers to Spark's generated column name
  ratioPerReport
    .groupBy("country")
    .avg("infection Per Km\u00b2")
    .orderBy(desc("avg(infection Per Km²)"))
}

defined [32mfunction[39m [36mmeanInfectionsperKM2DF[39m

# Visualización de eficiencia

In [29]:
// Run one full collect of each implementation, in this order, through `runWithOutput`
// and plot the returned values as a bar chart (labels on x, measurements on y).
val (x, y) = {
  val measurements = Seq(
    ("Not Optimized RDD", runWithOutput(notOptimizedRDD.collect())),
    ("RDD", runWithOutput(meanInfectionsRDD.collect())),
    ("DataSet", runWithOutput(meanInfectionsperKM2DS.collect)),
    ("DataFrame", runWithOutput(meanInfectionsperKM2DF.collect))
  )
  measurements.unzip
}

Bar(x, y).plot()

Took 1493


Took 637


Took 2574


Took 2468


[36mx[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m([32m"Not Optimized RDD"[39m, [32m"RDD"[39m, [32m"DataSet"[39m, [32m"DataFrame"[39m)
[36my[39m: [32mSeq[39m[[32mInt[39m] = [33mList[39m([32m1493[39m, [32m637[39m, [32m2574[39m, [32m2468[39m)
[36mres28_1[39m: [32mString[39m = [32m"plot-d81ee546-c799-481c-8bb1-82f4fd1d5cfd"[39m

In [None]:
// Time a full collect() of the non-optimized RDD pipeline with spark.time.
spark.time(notOptimizedRDD.collect())

In [None]:
// Time a full collect() of the optimized RDD pipeline with spark.time.
spark.time(meanInfectionsRDD.collect())

In [None]:
// Time a full collect() of the Dataset query with spark.time.
spark.time(meanInfectionsperKM2DS.collect())

In [None]:
// Time a full collect() of the DataFrame query with spark.time.
spark.time(meanInfectionsperKM2DF.collect())

In [None]:
// Collect the non-optimized RDD pipeline under sparkMeasure's StageMetrics.runAndMeasure.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(notOptimizedRDD.collect)

In [None]:
// Collect the optimized RDD pipeline under sparkMeasure's StageMetrics.runAndMeasure.
// The pipeline is named `meanInfectionsRDD` (with an "s"); `meanInfectionRDD` is not defined.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(meanInfectionsRDD.collect)

In [None]:
// Collect the Dataset query under sparkMeasure's StageMetrics.runAndMeasure.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(meanInfectionsperKM2DS.collect)

In [None]:
// Collect the DataFrame query under sparkMeasure's StageMetrics.runAndMeasure.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(
    meanInfectionsperKM2DF.collect
    )

# Visualización de datos con plotly

In [30]:
// Collect the DataFrame result and plot it: country names on x, mean ratio on y.
// Both columns are read through their string form; the second is parsed as Float.
val (x, y) = {
  val rows = meanInfectionsperKM2DF.collect
  rows.toList.map { row =>
    val country = row(0).toString
    val meanRatio = row(1).toString.toFloat
    (country, meanRatio)
  }.unzip
}
Bar(x, y).plot()

[36mx[39m: [32mList[39m[[32mString[39m] = [33mList[39m(
  [32m"Monaco"[39m,
  [32m"Gibraltar"[39m,
  [32m"Bahrain"[39m,
  [32m"Singapore"[39m,
  [32m"Maldives"[39m,
  [32m"Malta"[39m,
  [32m"Aruba"[39m,
  [32m"Belgium"[39m,
  [32m"Andorra"[39m,
  [32m"Netherlands"[39m,
  [32m"Guam"[39m,
  [32m"Israel"[39m,
  [32m"Luxembourg"[39m,
  [32m"Lebanon"[39m,
  [32m"Qatar"[39m,
  [32m"Liechtenstein"[39m,
  [32m"Bermuda"[39m,
  [32m"Cura\u00e7ao"[39m,
  [32m"Switzerland"[39m,
  [32m"Kuwait"[39m,
  [32m"Italy"[39m,
  [32m"Slovenia"[39m,
  [32m"Armenia"[39m,
  [32m"Moldova"[39m,
  [32m"Bangladesh"[39m,
  [32m"Portugal"[39m,
  [32m"Poland"[39m,
  [32m"France"[39m,
  [32m"Montenegro"[39m,
  [32m"Austria"[39m,
  [32m"Hungary"[39m,
  [32m"Germany"[39m,
  [32m"Serbia"[39m,
  [32m"Jordan"[39m,
  [32m"Spain"[39m,
  [32m"Slovakia"[39m,
  [32m"India"[39m,
  [32m"Panama"[39m,
...
[36my[39m: [32mList[39m[[32mFloat[39m] 