In [77]:
import $file.common
import spark._
import common._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.apache.spark.sql.types._, func._

[32mimport [39m[36m$file.$     
[39m
[32mimport [39m[36mspark._
[39m
[32mimport [39m[36mcommon._
[39m
[32mimport [39m[36morg.apache.spark.sql.functions._
[39m
[32mimport [39m[36morg.apache.spark.sql.types.{IntegerType, StringType, StructType}
[39m
[32mimport [39m[36morg.apache.spark.sql.types._, func._[39m

In [None]:
import $ivy.`org.plotly-scala::plotly-almond:0.8.1`

import plotly._
import plotly.element._
import plotly.layout._
import plotly.Almond._

## RDD para la consulta de infecciones diarias por país

In [75]:
// Load the raw CSV file as an RDD of text lines (one String per line of the file).
val infectionData = spark.sparkContext.textFile("../datasets/data.csv")

[36minfectionData[39m: [32mRDD[39m[[32mString[39m] = ../datasets/data.csv MapPartitionsRDD[165] at textFile at cmd74.sc:1

Creo una función para trabajar con un RDD de infecciones

In [74]:
/** Parses raw comma-separated lines into [[Infection]] records.
  *
  * Column positions used: 1 = day, 2 = month, 3 = year, 4 = cases, 5 = deaths,
  * 6 = country, 10 = continent.
  *
  * Lines that cannot be parsed (for example a header row, a non-numeric field or
  * a line with too few columns) are skipped instead of failing the whole Spark job.
  *
  * @param lines raw text lines of the CSV file
  * @return one Infection per successfully parsed line
  */
def infections(lines : RDD[String]) : RDD[Infection] = {
    import scala.util.Try

    lines.flatMap { line =>
      // A negative limit keeps trailing empty fields, so an empty last column
      // does not shorten the array and shift/lose positions.
      val fields = line.split(",", -1)
      // Try turns NumberFormatException / ArrayIndexOutOfBoundsException into None,
      // which flatMap then drops.
      Try(
        Infection(
          day = fields(1).toInt,
          month = fields(2).toInt,
          year = fields(3).toInt,
          nCases = fields(4).toInt,
          nDeaths = fields(5).toInt,
          country = fields(6),
          continent = fields(10)
        )
      ).toOption
    }
}

defined [32mfunction[39m [36minfections[39m

Calculo la media de infecciones diarias por país trabajando con pair RDD

In [None]:
  /** Computes the average number of daily cases per country.
    *
    * The average is an integer division (truncated), matching the declared
    * `RDD[(String, Int)]` result, and the output is sorted by that average in
    * ascending order.
    *
    * Uses `aggregateByKey` with a running (sum, count) pair so that values are
    * combined on each partition before the shuffle, instead of materialising
    * every value of a country in memory as `groupByKey` does. The sum is kept
    * as a Long so that it cannot overflow Int.
    *
    * @param infections parsed infection records
    * @return (country, average daily cases) pairs sorted by average
    */
  def infectionGrowthAverage(infections : RDD[Infection]) : RDD[(String, Int)]= {

    // Per country: (total cases, number of records).
    val totalsByCountry : RDD[(String, (Long, Long))] =
      infections
        .map(inf => (inf.country, inf.nCases))
        .aggregateByKey((0L, 0L))(
          (acc, cases) => (acc._1 + cases, acc._2 + 1L),
          (left, right) => (left._1 + right._1, left._2 + right._2)
        )

    // Every key present has at least one record, so count is never zero here.
    totalsByCountry
      .mapValues { case (total, count) => (total / count).toInt }
      .sortBy(_._2)
  }

Muestro el resultado y el tiempo de ejecución

In [None]:
// Parse the raw text lines into Infection records.
val infectionRDD = infections(infectionData)
// Derive the (country, average daily cases) pairs from the parsed records.
val infectionAvgRDD = infectionGrowthAverage(infectionRDD)

Usando la API de spark

In [None]:
// Collect the per-country averages to the driver while timing the call with spark.time.
// NOTE(review): SparkSession.time prints the elapsed time and returns the block's result,
// so this value presumably holds the collected rows rather than a duration — confirm.
val timeRDD = spark.time(infectionAvgRDD.collect())

o bien el framework del CERN (sparkMeasure), que nos da más información

In [None]:
// Run the same RDD collect under sparkMeasure's StageMetrics.runAndMeasure
// (presumably reports stage-level metrics for the measured block — see sparkMeasure docs).
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(infectionAvgRDD.collect())

## Hago los mismos cálculos con un DataFrame

Convierto el RDD obtenido previamente en un DataFrame para inferir la clase infección

In [None]:
// Build a DataFrame from the RDD of Infection records.
val infectionDF = spark.createDataFrame(infectionRDD)

Utilizo los métodos de la API DF que incluye uno optimizado para calcular la media.

Ejecuto y compruebo el tiempo de ejecución.

In [None]:
// Average of nCases per country, ordered from the highest average to the lowest.
val infAvgOrDf =
    infectionDF
        .groupBy("country")
        .avg("nCases")
        .orderBy(col("avg(nCases)").desc)

In [None]:
// Collect the DataFrame result to the driver while timing the call with spark.time.
// NOTE(review): SparkSession.time prints the elapsed time and returns the block's result,
// so this value presumably holds the collected rows rather than a duration — confirm.
val timeDF = spark.time(infAvgOrDf.collect)

In [None]:
// Run the DataFrame collect under sparkMeasure's StageMetrics.runAndMeasure.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(infAvgOrDf.collect)

#### Otra opción es crear el DataFrame directamente importando los datos, pero deja de ser un DF de infecciones

In [None]:
// Read the CSV straight into a DataFrame: first row is the header, fields are
// comma-separated, and column types are inferred from the data.
val dfCovid = spark.read
    .options(Map(
        "header" -> "true",
        "charset" -> "UTF8",
        "delimiter" -> ",",
        "inferSchema" -> "true"
    ))
    .csv("../datasets/covidworldwide.csv")

In [None]:
// Inspect the schema of the loaded DataFrame.
dfCovid.schema
// Show the plan Spark will use to produce dfCovid.
dfCovid.explain

In [None]:
// Mean of the "cases" column per country/territory, ordered by that mean.
val dfCovidWithSchema =
    dfCovid.toDF
        .groupBy(col("countriesAndTerritories"))
        .agg(avg(col("cases")))
        .orderBy(col("avg(cases)"))

In [None]:
// Run the collect of the directly-loaded DataFrame under sparkMeasure's StageMetrics.runAndMeasure.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(dfCovidWithSchema.collect)

#### O puedo definir el esquema manualmente para crear el DataFrame

In [None]:
// Explicit schema for the CSV: eleven nullable columns, in file order.
val schema = StructType(Seq(
    StructField("dateRep", StringType, nullable = true),
    StructField("day", IntegerType, nullable = true),
    StructField("month", IntegerType, nullable = true),
    StructField("year", IntegerType, nullable = true),
    StructField("cases", IntegerType, nullable = true),
    StructField("deaths", IntegerType, nullable = true),
    StructField("countriesAndTerritories", StringType, nullable = true),
    StructField("geoId", StringType, nullable = true),
    StructField("countryterritoryCode", StringType, nullable = true),
    StructField("popData2018", IntegerType, nullable = true),
    StructField("continentExp", StringType, nullable = true)
))

In [None]:
// Load the CSV with the manually defined schema; the first row is treated as a header.
val df =
    spark.read
        .schema(schema)
        .option("header","true")
        .csv("../datasets/data.csv")

In [None]:
// Print the schema of the DataFrame loaded with the manual schema.
df.printSchema

# Y con un DataSet

In [None]:
// Read the CSV (first row as header, comma-delimited, no schema inference requested)
// and view it as a Dataset of 12-element String tuples.
// NOTE(review): the manual schema defined in this notebook for ../datasets/data.csv has
// 11 columns; confirm that covidworldwide.csv really has 12, otherwise this tuple
// encoder will not match the file.
val infectionDS = spark.read
.option("header", "true")
.option("charset", "UTF8")
.option("delimiter",",")
.csv("../datasets/covidworldwide.csv")
.as[(String,String,String,String,String,String,String,String,String,String,String,String)]

In [None]:
// Average of "cases" per country/territory, ordered by that average,
// exposed as a typed Dataset of (country, average) pairs.
val avgDS =
    infectionDS
        .groupBy(col("countriesAndTerritories"))
        .agg(avg(col("cases")))
        .orderBy(col("avg(cases)"))
        .as[(String,Double)]

In [None]:
// Run the tuple-typed Dataset collect under sparkMeasure's StageMetrics.runAndMeasure.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(avgDS.collect)

### Trabajamos con Dataset[Infection]

In [None]:
// Build a typed Dataset from the RDD of Infection records.
val infectionDataset = spark.createDataset(infectionRDD)

In [None]:
// Average of nCases per country over the Infection Dataset, ordered by that average
// (no explicit direction given to orderBy), typed as (country, average) pairs.
// NOTE(review): the .as[Double] on the aggregate column looks redundant given the
// final .as[(String,Double)] — confirm before removing.
val avgInfectionDS = infectionDataset
    .groupBy($"country")
    .agg(avg($"nCases").as[Double])
    .orderBy("avg(nCases)")
    .as[(String,Double)]

In [None]:
// Run the Infection-Dataset collect under sparkMeasure's StageMetrics.runAndMeasure.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(avgInfectionDS.collect)

# Visualización de datos

In [None]:
// Bring the DataFrame result to the driver and split it into two parallel lists:
// country names (x axis) and their average daily cases (y axis).
val (x, y) =
    infAvgOrDf.collect
        .toList
        .map { row => row(0).toString -> row(1).toString.toDouble }
        .unzip
// Draw one bar per country.
Bar(x, y).plot()

# Visualización de eficiencia

In [None]:
// Measure each of the three implementations with runWithOutput and split the
// (label, measurement) pairs into labels (x axis) and measurements (y axis).
// NOTE(review): runWithOutput is defined outside this section — presumably it
// returns the elapsed time of the block; confirm.
val (x, y) = Seq(
    ("RDD", runWithOutput(infectionAvgRDD.collect)),
    ("DataSet", runWithOutput(avgDS.collect)),
    ("DataFrame", runWithOutput(infAvgOrDf.collect))
).unzip

// One bar per implementation.
Bar(x, y).plot()