In [11]:
import $file.common
import spark._
import common._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import spark.implicits._
import spark.sqlContext.implicits._
import org.apache.spark.sql._
import org.apache.spark.sql.{functions => func, _}
import org.apache.spark.sql.types._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark._
import org.apache.spark.sql.types._, func._
import org.apache.spark.sql.functions.{col, to_date}
import plotly._
import plotly.element._
import plotly.layout._
import plotly.Almond._
import org.apache.spark.sql.types.DateType

[32mimport [39m[36m$file.$     
[39m
[32mimport [39m[36mspark._
[39m
[32mimport [39m[36mcommon._
[39m
[32mimport [39m[36morg.apache.spark.sql.functions._
[39m
[32mimport [39m[36morg.apache.spark.sql.types.{IntegerType, StringType, StructType}
[39m
[32mimport [39m[36mspark.implicits._
[39m
[32mimport [39m[36mspark.sqlContext.implicits._
[39m
[32mimport [39m[36morg.apache.spark.sql._
[39m
[32mimport [39m[36morg.apache.spark.sql.{functions => func, _}
[39m
[32mimport [39m[36morg.apache.spark.sql.types._
[39m
[32mimport [39m[36morg.apache.spark.rdd.RDD
[39m
[32mimport [39m[36morg.apache.spark.sql.SparkSession
[39m
[32mimport [39m[36morg.apache.spark.sql.types.{IntegerType, StringType, StructType}
[39m
[32mimport [39m[36morg.apache.spark.{SparkConf, SparkContext}
[39m
[32mimport [39m[36morg.apache.spark._
[39m
[32mimport [39m[36morg.apache.spark.sql.types._, func._
[39m
[32mimport [39m[36morg.apache.spark.sql.functions.{col,

In [12]:
def dfCovid = spark.read
.option("header", "true")
.option("charset", "UTF8")
.option("delimiter",",")
.option("inferSchema", "true")
.csv("../datasets/covidworldwide.csv")

defined [32mfunction[39m [36mdfCovid[39m

In [13]:
def dfPopulation = spark.read
.option("header", "true")
.option("charset", "UTF8")
.option("delimiter",",")
.option("inferSchema", "true")
.csv("../datasets/population_by_country_2020.csv")
.drop("Yearly Change","Net Change")
.withColumnRenamed("Density (P/Km²)","Density")
.withColumnRenamed("Land Area (Km²)","Area")
.withColumnRenamed("Migrants (net)", "Migrants")
.withColumnRenamed("Fert. Rate", "Fertility")
.withColumnRenamed("Med. Age","Med_age")
.withColumnRenamed("Urban Pop %","urban_population")
.withColumnRenamed("World Share","World_share")
.withColumnRenamed("Country (or dependency)","Country")
.withColumnRenamed("Population (2020)","Population")

defined [32mfunction[39m [36mdfPopulation[39m

# Querys utilizando los datos en .parquet

Para realizar estas consultas vamos a utilizar un tipo de datos columnar como es parquet. En las lineas de código comentadas previas, podemos observar como hemos particionado los datos por paises, de modo que al realizar consultar parquet puede acceder solo a las columnas que le interesa.

### Creacion de datos en .parquet

Para crear los datos: solo en la primera ejecución

In [13]:
/*dfCovid.write
    .partitionBy("countriesAndTerritories")
    .parquet("data_files/covid_countries.parquet")
*/

In [13]:
/*dfPopulation.write
    .partitionBy("country")
    .parquet("data_files/covid_population.parquet")
*/

Para construir los DataFrames

In [14]:
val parqDF = spark.read.parquet("../parquet_files/covid_countries.parquet")

[36mparqDF[39m: [32mDataFrame[39m = [dateRep: string, day: int ... 10 more fields]

In [15]:
val parqPopDF = spark.read.parquet("../parquet_files/covid_population.parquet")

[36mparqPopDF[39m: [32mDataFrame[39m = [Population: int, Density: int ... 7 more fields]

Observamos como los dataframes obtenidos comparten el mismo esquema, sin embargo el dataframe de parquet tiene la partición visible en el esquema

In [16]:
def schemaDF = dfCovid.schema

defined [32mfunction[39m [36mschemaDF[39m

In [17]:
def schemaPARQUET = parqDF.schema

defined [32mfunction[39m [36mschemaPARQUET[39m

## 1. Query 1: media de casos diarios en España

In [18]:
def parqMeanDF = parqDF.toDF
    .where("countriesAndTerritories == 'Spain'")
    .agg(mean("cases"))
    .orderBy("avg(cases)")

defined [32mfunction[39m [36mparqMeanDF[39m

In [19]:
def csvMeanDF = dfCovid.toDF
    .where("countriesAndTerritories == 'Spain'")
    .agg(mean("cases"))
    .orderBy("avg(cases)")

defined [32mfunction[39m [36mcsvMeanDF[39m

## 2. Query 2: casos por km2 en España

In [20]:
def csvCasesKM2 =
dfCovid.join(dfPopulation, $"country" === $"countriesAndTerritories")
        .where("countriesAndTerritories == 'Spain'")
        .select($"country",
                $"dateRep" as "date",
                $"cases",
                $"Area",
                $"cases" / $"Area" as "infection Per Km\u00b2")
        .groupBy("country")
        .avg("infection Per Km\u00b2")
        .orderBy(desc("avg(infection Per Km²)"))

defined [32mfunction[39m [36mcsvCasesKM2[39m

In [21]:
def parquetCasesKM2 =
parqDF.join(parqPopDF, $"country" === $"countriesAndTerritories")
        .where("countriesAndTerritories == 'Spain'")
        .select($"country",
                $"dateRep" as "date",
                $"cases",
                $"Area",
                $"cases" / $"Area" as "infection Per Km\u00b2")
        .groupBy("country")
        .avg("infection Per Km\u00b2")
        .orderBy(desc("avg(infection Per Km²)"))

defined [32mfunction[39m [36mparquetCasesKM2[39m

## 3. Query 3: casos por densidad de población en Chile

In [22]:
def csvCasesPopulation =
parqDF.join(parqPopDF, $"country" === $"countriesAndTerritories")
        .where("countriesAndTerritories == 'Chile'")
        .select($"country",
                $"dateRep" as "date",
                $"cases",
                $"Population",
                $"cases" / $"Population" as "infection Per Population")
        .groupBy("country")
        .avg("infection Per Population")
        .orderBy(desc("avg(infection Per Population)"))

defined [32mfunction[39m [36mcsvCasesPopulation[39m

In [23]:
def parquetCasesPopulation =
dfCovid.join(dfPopulation, $"country" === $"countriesAndTerritories")
        .where("countriesAndTerritories == 'Chile'")
        .select($"country",
                $"dateRep" as "date",
                $"cases",
                $"Population",
                $"cases" / $"Population" as "infection Per Population")
        .groupBy("country")
        .avg("infection Per Population")
        .orderBy(desc("avg(infection Per Population)"))

defined [32mfunction[39m [36mparquetCasesPopulation[39m

## 4. Query 4: Porcentaje diario de infecciones

In [24]:
def csvDailyCasesRate =
dfCovid.join(dfPopulation, $"country" === $"countriesAndTerritories")
                .select($"country",
                $"dateRep",
                $"day",
                $"month",
                $"cases",
                $"Population",
                $"cases" / $"Population" as "infection Per Population")
        .orderBy($"dateRep".asc)

defined [32mfunction[39m [36mcsvDailyCasesRate[39m

In [25]:
def parquetDailyCasesRate =
parqDF.join(parqPopDF, $"country" === $"countriesAndTerritories")
                .select($"country",
                $"dateRep",
                $"day",
                $"month",
                $"cases",
                $"Population",
                $"cases" / $"Population" as "infection Per Population")
        .orderBy($"dateRep".asc)

defined [32mfunction[39m [36mparquetDailyCasesRate[39m

## 5. Visualización de rendimiento

In [27]:
val (x, y) = Seq(
    "parquet Mean cases Spain" -> runWithOutput(parqMeanDF.collect),
    "csv Mean cases Spain" -> runWithOutput(csvMeanDF.collect),       
    "parquet Cases KM2 Spain" -> runWithOutput(parquetCasesKM2.collect),
    "csv Cases KM2 Spain" -> runWithOutput(csvCasesKM2.collect),
    "parquet Cases Population Chile" -> runWithOutput(parquetCasesPopulation.collect),
    "csv Cases Population Chile" -> runWithOutput(csvCasesPopulation.collect),
    "parquet Daily Cases Rate" -> runWithOutput(parquetDailyCasesRate.collect),
    "csv Daily Cases Rate" -> runWithOutput(csvDailyCasesRate.collect),
).unzip

Bar(x, y).plot()

Took 151


Took 855


Took 1536


Took 2791


Took 2498


Took 1855


Took 4294


Took 2217


[36mx[39m: [32mSeq[39m[[32mString[39m] = [33mList[39m(
  [32m"parquet Mean cases Spain"[39m,
  [32m"csv Mean cases Spain"[39m,
  [32m"parquet Cases KM2 Spain"[39m,
  [32m"csv Cases KM2 Spain"[39m,
  [32m"parquet Cases Population Chile"[39m,
  [32m"csv Cases Population Chile"[39m,
  [32m"parquet Daily Cases Rate"[39m,
  [32m"csv Daily Cases Rate"[39m
)
[36my[39m: [32mSeq[39m[[32mInt[39m] = [33mList[39m([32m151[39m, [32m855[39m, [32m1536[39m, [32m2791[39m, [32m2498[39m, [32m1855[39m, [32m4294[39m, [32m2217[39m)
[36mres26_1[39m: [32mString[39m = [32m"plot-84d99ac8-5b3d-43a3-91fd-f004ad342c9d"[39m

## 6. Comparativas de rendimiento usando el framework del CERN

In [8]:
parqMeanDF.collect()

[36mres7[39m: [32mArray[39m[[32mRow[39m] = [33mArray[39m([4958.667621776504])

In [9]:
csvMeanDF.collect()

[36mres8[39m: [32mArray[39m[[32mRow[39m] = [33mArray[39m([4958.667621776504])

In [None]:
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(
    parqMeanDF
    .collect()
)

In [None]:
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(
    csvMeanDF
    .collect()
)

In [None]:
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(
    parquetCasesKM2
    .collect()
)

In [None]:
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(
    csvCasesKM2
    .collect()
)

In [None]:
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(
    parquetCasesPopulation
    .collect()
)

In [None]:
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(
    csvCasesPopulation
    .collect()
)

In [None]:
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(
    csvDailyCasesRate
    .collect()
)

In [None]:
ch.cern.sparkmeasure.StageMetrics(spark).runAndMeasure(
    parquetDailyCasesRate
    .collect()
)