In [None]:
import $file.common
import spark._
import common._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import spark.implicits._
import spark.sqlContext.implicits._
import org.apache.spark.sql._
import org.apache.spark.sql.{functions => func, _}
import org.apache.spark.sql.types._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark._
import org.apache.spark.sql.types._, func._
import org.apache.spark.sql.functions.{col, to_date}
import plotly._
import plotly.element._
import plotly.layout._
import plotly.Almond._
import org.apache.spark.sql.types.DateType

In [None]:
// COVID dataset loaded from CSV: first row is the header, column types are inferred.
val dfCovid = {
  val csvOptions = Map(
    "header" -> "true",
    "charset" -> "UTF8",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  )
  spark.read.options(csvOptions).csv("../datasets/covidworldwide.csv")
}

In [None]:
// Population dataset loaded from CSV. Two unused columns are dropped and the
// remaining headers are renamed to short identifiers without spaces or symbols.
val dfPopulation = {
  val csvOptions = Map(
    "header" -> "true",
    "charset" -> "UTF8",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  )
  val trimmed = spark.read
    .options(csvOptions)
    .csv("../datasets/population_by_country_2020.csv")
    .drop("Yearly Change", "Net Change")

  // (original header, new name) pairs, applied in this order.
  val renames = Seq(
    "Density (P/Km²)" -> "Density",
    "Land Area (Km²)" -> "Area",
    "Migrants (net)" -> "Migrants",
    "Fert. Rate" -> "Fertility",
    "Med. Age" -> "Med_age",
    "Urban Pop %" -> "urban_population",
    "World Share" -> "World_share",
    "Country (or dependency)" -> "Country",
    "Population (2020)" -> "Population"
  )
  renames.foldLeft(trimmed) { case (df, (oldName, newName)) =>
    df.withColumnRenamed(oldName, newName)
  }
}

In [None]:
// One-off export, kept commented out: writes dfCovid as Parquet partitioned by
// the countriesAndTerritories column.
// NOTE(review): this writes to "data_files/...", while the notebook later reads
// from "../parquet_files/covid_countries.parquet" - confirm which path is current.
/*dfCovid.write
    .partitionBy("countriesAndTerritories")
    .parquet("data_files/covid_countries.parquet")
*/

In [None]:
// One-off export, kept commented out: writes dfPopulation as Parquet partitioned
// by the country column.
// NOTE(review): this writes to "data_files/...", while the notebook later reads
// from "../parquet_files/covid_population.parquet" - confirm which path is current.
/*dfPopulation.write
    .partitionBy("country")
    .parquet("data_files/covid_population.parquet")
*/

# Consulta utilizando los datos en .parquet

Para realizar estas consultas vamos a utilizar un formato de datos columnar, Parquet. En las líneas de código comentadas anteriores se puede observar cómo hemos particionado los datos por países, de modo que, al realizar las consultas, Parquet puede leer solo las particiones y las columnas que le interesan.

In [None]:
val parqDF = spark.read.parquet("../parquet_files/covid_countries.parquet")

In [None]:
val parqPopDF = spark.read.parquet("../parquet_files/covid_population.parquet")

## media de casos diarios en España

In [None]:
/** Mean of `cases` over the rows for Spain, read from the Parquet-backed DataFrame.
  *
  * Declared as a `def`, so the query is rebuilt every time it is referenced.
  */
def parqMeanDF = {
  val spainOnly = parqDF.toDF.where("countriesAndTerritories == 'Spain'")
  val averaged = spainOnly.agg(mean("cases"))
  // Sort on the aggregate column, as in the original query.
  averaged.orderBy("avg(cases)")
}

In [None]:
/** Mean of `cases` over the rows for Spain, read from the CSV-backed DataFrame.
  *
  * Declared as a `def`, so the query is rebuilt every time it is referenced.
  */
def csvMeanDF = {
  val spainOnly = dfCovid.toDF.where("countriesAndTerritories == 'Spain'")
  val averaged = spainOnly.agg(mean("cases"))
  // Sort on the aggregate column, as in the original query.
  averaged.orderBy("avg(cases)")
}

## casos por km2 en España

In [None]:
/** Average of daily cases divided by land area for Spain, from the CSV-backed DataFrames.
  *
  * Joins the COVID rows with the population rows on the country name, computes
  * cases / Area per row and averages that ratio per country.
  */
def csvCasesKM2 = {
  val sameCountry = $"country" === $"countriesAndTerritories"
  val perRow = dfCovid
    .join(dfPopulation, sameCountry)
    .where("countriesAndTerritories == 'Spain'")
    .select(
      $"country",
      $"dateRep".as("date"),
      $"cases",
      $"Area",
      ($"cases" / $"Area").as("infection Per Km\u00b2")
    )
  perRow
    .groupBy("country")
    .avg("infection Per Km\u00b2")
    .orderBy(desc("avg(infection Per Km²)"))
}

In [None]:
/** Average of daily cases divided by land area for Spain, from the Parquet-backed DataFrames.
  *
  * Joins the COVID rows with the population rows on the country name, computes
  * cases / Area per row and averages that ratio per country.
  */
def parquetCasesKM2 = {
  val sameCountry = $"country" === $"countriesAndTerritories"
  val perRow = parqDF
    .join(parqPopDF, sameCountry)
    .where("countriesAndTerritories == 'Spain'")
    .select(
      $"country",
      $"dateRep".as("date"),
      $"cases",
      $"Area",
      ($"cases" / $"Area").as("infection Per Km\u00b2")
    )
  perRow
    .groupBy("country")
    .avg("infection Per Km\u00b2")
    .orderBy(desc("avg(infection Per Km²)"))
}

## casos por población en Chile

In [None]:
/** Average of daily cases divided by population for Chile, from the CSV-backed DataFrames.
  *
  * Joins the COVID rows with the population rows on the country name, computes
  * cases / Population per row and averages that ratio per country.
  *
  * Reads from `dfCovid` / `dfPopulation` (the CSV sources), like every other
  * `csv*` query in this notebook, so the "csv" benchmark label measures CSV.
  */
def csvCasesPopulation =
dfCovid.join(dfPopulation, $"country" === $"countriesAndTerritories")
        .where("countriesAndTerritories == 'Chile'")
        .select($"country",
                $"dateRep" as "date",
                $"cases",
                $"Population",
                $"cases" / $"Population" as "infection Per Population")
        .groupBy("country")
        .avg("infection Per Population")
        .orderBy(desc("avg(infection Per Population)"))

In [None]:
/** Average of daily cases divided by population for Chile, from the Parquet-backed DataFrames.
  *
  * Joins the COVID rows with the population rows on the country name, computes
  * cases / Population per row and averages that ratio per country.
  *
  * Reads from `parqDF` / `parqPopDF` (the Parquet sources), like every other
  * `parquet*` query in this notebook, so the "parquet" benchmark label measures Parquet.
  */
def parquetCasesPopulation =
parqDF.join(parqPopDF, $"country" === $"countriesAndTerritories")
        .where("countriesAndTerritories == 'Chile'")
        .select($"country",
                $"dateRep" as "date",
                $"cases",
                $"Population",
                $"cases" / $"Population" as "infection Per Population")
        .groupBy("country")
        .avg("infection Per Population")
        .orderBy(desc("avg(infection Per Population)"))

## Porcentaje diario de infecciones

In [None]:
/** Per-row cases / Population for every country, from the CSV-backed DataFrames,
  * sorted ascending by `dateRep`.
  *
  * NOTE(review): if `dateRep` is read as a string, this ordering is lexicographic
  * rather than chronological - confirm the inferred column type.
  */
def csvDailyCasesRate = {
  val sameCountry = $"country" === $"countriesAndTerritories"
  val joined = dfCovid.join(dfPopulation, sameCountry)
  val withRate = joined.select(
    $"country",
    $"dateRep",
    $"day",
    $"month",
    $"cases",
    $"Population",
    ($"cases" / $"Population").as("infection Per Population")
  )
  withRate.orderBy($"dateRep".asc)
}

In [None]:
/** Per-row cases / Population for every country, from the Parquet-backed DataFrames,
  * sorted ascending by `dateRep`.
  *
  * NOTE(review): if `dateRep` is read as a string, this ordering is lexicographic
  * rather than chronological - confirm the column type.
  */
def parquetDailyCasesRate = {
  val sameCountry = $"country" === $"countriesAndTerritories"
  val joined = parqDF.join(parqPopDF, sameCountry)
  val withRate = joined.select(
    $"country",
    $"dateRep",
    $"day",
    $"month",
    $"cases",
    $"Population",
    ($"cases" / $"Population").as("infection Per Population")
  )
  withRate.orderBy($"dateRep".asc)
}

In [None]:
// Runs each query once through runWithOutput (from the imported `common` script;
// presumably it returns a timing for the action - confirm there) and pairs the
// result with a label. The queries run in the order listed. x holds the labels,
// y the measured values.
val (x, y) = {
  val labelledRuns = Seq(
    ("parquet Mean cases Spain", runWithOutput(parqMeanDF.collect)),
    ("csv Mean cases Spain", runWithOutput(csvMeanDF.collect)),
    ("parquet Cases KM2 Spain", runWithOutput(parquetCasesKM2.collect)),
    ("csv Cases KM2 Spain", runWithOutput(csvCasesKM2.collect)),
    ("parquet Cases Population Chile", runWithOutput(parquetCasesPopulation.collect)),
    ("csv Cases Population Chile", runWithOutput(csvCasesPopulation.collect)),
    ("parquet Daily Cases Rate", runWithOutput(parquetDailyCasesRate.collect)),
    ("csv Daily Cases Rate", runWithOutput(csvDailyCasesRate.collect))
  )
  labelledRuns.unzip
}

// One bar per labelled run.
Bar(x, y).plot()

## Medición detallada usando el framework del CERN

In [None]:
// sparkMeasure StageMetrics around collecting the Parquet mean-cases query.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(parqMeanDF.collect())

In [None]:
// sparkMeasure StageMetrics around collecting the CSV mean-cases query.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(csvMeanDF.collect())

In [None]:
// sparkMeasure StageMetrics around collecting the Parquet cases-per-area query.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(parquetCasesKM2.collect())

In [None]:
// sparkMeasure StageMetrics around collecting the CSV cases-per-area query.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(csvCasesKM2.collect())

In [None]:
// sparkMeasure StageMetrics around collecting the parquetCasesPopulation query.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(parquetCasesPopulation.collect())

In [None]:
// sparkMeasure StageMetrics around collecting the csvCasesPopulation query.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(csvCasesPopulation.collect())

In [None]:
// sparkMeasure StageMetrics around collecting the CSV daily-cases-rate query.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(csvDailyCasesRate.collect())

In [None]:
// sparkMeasure StageMetrics around collecting the Parquet daily-cases-rate query.
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(parquetDailyCasesRate.collect())