# High-Order Functions
in DataFrames and SPARK SQL

Normalmente necesitan el uso de funciones como:

- `get_json_object()`
- `from_json()`
- `to_json()`
- `explode()`
- `selectExpr()`


### Explode and Collect

In [6]:
spark

res5: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@f7b6ea9


In [7]:
spark.range(1,9).createOrReplaceTempView("prueba")

In [8]:
spark.sql("select id from prueba").show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
+---+



In [9]:
spark.sql(""" select id,  as values
              from prueba """).show()

org.apache.spark.sql.AnalysisException:  cannot resolve 'as' given input columns: [prueba.id]; line 1 pos 13;

In [12]:
import spark.implicits._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
val pruebaDF = Seq(
    Row("James", "Java"), Row("James", "C#"),Row("James", "Python"),
    Row("Michael", "Java"),Row("Michael", "PHP"),Row("Michael", "PHP"),
    Row("Robert", "Java"),Row("Robert", "Java"),Row("Robert", "Java"),
    Row("Washington", null)
  )
val arrayStructSchema = new StructType().add("name", StringType)
    .add("booksInterested", StringType)

val df = spark.createDataFrame(
    spark.sparkContext.parallelize(arrayStructData),arrayStructSchema)
  df.printSchema()
  df.show(false)

<console>: 16: error: ')' expected but '.' found.

In [30]:
val df2 = df.groupBy("name").agg(collect_list("booksInterested").as("booksInterested"))
df2.printSchema()
df2.show(false)

root
 |-- name: string (nullable = true)
 |-- booksInterested: array (nullable = false)
 |    |-- element: string (containsNull = false)

+----------+------------------+
|name      |booksInterested   |
+----------+------------------+
|James     |[Java, C#, Python]|
|Michael   |[Java, PHP, PHP]  |
|Robert    |[Java, Java, Java]|
|Washington|[]                |
+----------+------------------+



df2: org.apache.spark.sql.DataFrame = [name: string, booksInterested: array<string>]


In [32]:
val df3 = df.groupBy("name").agg(collect_set("booksInterested").as("booksInterested"))
df3.printSchema()
df3.show(false)

root
 |-- name: string (nullable = true)
 |-- booksInterested: array (nullable = false)
 |    |-- element: string (containsNull = false)

+----------+------------------+
|name      |booksInterested   |
+----------+------------------+
|James     |[Java, C#, Python]|
|Michael   |[PHP, Java]       |
|Robert    |[Java]            |
|Washington|[]                |
+----------+------------------+



df3: org.apache.spark.sql.DataFrame = [name: string, booksInterested: array<string>]


In [38]:
df3.explode("booksInterested").show()

<console>: 42: error: type mismatch;

In [11]:
    import spark.implicits._

    val arrayData = Seq(
      Row("James",List("Java","Scala"),Map("hair"->"black","eye"->"brown")),
    Row("Michael",List("Spark","Java",null),Map("hair"->"brown","eye"->null)),
    Row("Robert",List("CSharp",""),Map("hair"->"red","eye"->"")),
    Row("Washington",null,null),
    Row("Jefferson",List(),Map())
    )

    val arraySchema = new StructType()
      .add("name",StringType)
      .add("knownLanguages", ArrayType(StringType))
      .add("properties", MapType(StringType,StringType))

    val df = spark.createDataFrame(spark.sparkContext.parallelize(arrayData),arraySchema)
    df.printSchema()
    df.show(false)

<console>: 26: error: not found: value Row

In [40]:
df.select($"name",explode($"properties"))
      .show(false)

+-------+----+-----+
|name   |key |value|
+-------+----+-----+
|James  |hair|black|
|James  |eye |brown|
|Michael|hair|brown|
|Michael|eye |null |
|Robert |hair|red  |
|Robert |eye |     |
+-------+----+-----+



In [43]:
df.select($"name",explode($"knownLanguages"))
      .show(false)

+-------+------+
|name   |col   |
+-------+------+
|James  |Java  |
|James  |Scala |
|Michael|Spark |
|Michael|Java  |
|Michael|null  |
|Robert |CSharp|
|Robert |      |
+-------+------+



### Añadir el siguiente

In [44]:
spark.range(1,8).createOrReplaceTempView("prueba2")

In [46]:
spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |   prueba|       true|
|         |  prueba2|       true|
+---------+---------+-----------+



In [47]:
val mas1 = (s: Int) => {s+1}

mas1: Int => Int = $Lambda$4525/0x00000008415c4040@78b59db3


In [49]:
spark.udf.register("mas1",mas1)

res40: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$4525/0x00000008415c4040@78b59db3,IntegerType,List(Some(class[value[0]: int])),Some(class[value[0]: int]),Some(mas1),false,true)


In [50]:
spark.sql("""select id, mas1(id) from prueba2""").show()

+---+--------+
| id|mas1(id)|
+---+--------+
|  1|       2|
|  2|       3|
|  3|       4|
|  4|       5|
|  5|       6|
|  6|       7|
|  7|       8|
+---+--------+



## Built-in functions for complex data Types

### Array types

In [52]:
spark.sql("select array_distinct( array(1,2,3,null,3) )").show()

+---------------------------------------+
|array_distinct(array(1, 2, 3, NULL, 3))|
+---------------------------------------+
|                        [1, 2, 3, null]|
+---------------------------------------+



In [54]:
spark.sql("SELECT array_intersect( array(1, 2, 3), array(1,3,5)  )" ).show()

+-----------------------------------------------+
|array_intersect(array(1, 2, 3), array(1, 3, 5))|
+-----------------------------------------------+
|                                         [1, 3]|
+-----------------------------------------------+



In [62]:
spark.sql(" SELECT array_union(  array(1,2,2,3), array(1, 3, 5) )").show()

+----------------------------------------------+
|array_union(array(1, 2, 2, 3), array(1, 3, 5))|
+----------------------------------------------+
|                                  [1, 2, 3, 5]|
+----------------------------------------------+



In [61]:
spark.sql(" SELECT array_except( array(1,2,2,3), array(1,4) )").show()

+--------------------------------------------+
|array_except(array(1, 2, 2, 3), array(1, 4))|
+--------------------------------------------+
|                                      [2, 3]|
+--------------------------------------------+



In [66]:
spark.sql(" SELECT array_join( array('hello','world'), ',')").show()

+----------------------------------+
|array_join(array(hello, world), ,)|
+----------------------------------+
|                       hello,world|
+----------------------------------+



In [70]:
spark.sql("SELECT array_max( array(1, 20, null, 3))").show()

+--------------------------------+
|array_max(array(1, 20, NULL, 3))|
+--------------------------------+
|                              20|
+--------------------------------+



In [71]:
spark.sql("SELECT array_min( array(1, 20, null, 3))").show()

+--------------------------------+
|array_min(array(1, 20, NULL, 3))|
+--------------------------------+
|                               1|
+--------------------------------+



In [79]:
spark.sql("SELECT array_position(array(3,2,1,2,3,1,2,31,4,1), 31)").show()

+--------------------------------------------------------+
|array_position(array(3, 2, 1, 2, 3, 1, 2, 31, 4, 1), 31)|
+--------------------------------------------------------+
|                                                       8|
+--------------------------------------------------------+



In [80]:
spark.sql("SELECT array_remove(array(1,2, 3, null, 3), 3)").show()

+----------------------------------------+
|array_remove(array(1, 2, 3, NULL, 3), 3)|
+----------------------------------------+
|                            [1, 2, null]|
+----------------------------------------+



In [82]:
spark.sql("SELECT arrays_overlap( array(1,2,3), array(3, 4, 5) )").show()
spark.sql("SELECT arrays_overlap( array(1,2,3,3), array( 4, 5) )").show()

+----------------------------------------------+
|arrays_overlap(array(1, 2, 3), array(3, 4, 5))|
+----------------------------------------------+
|                                          true|
+----------------------------------------------+

+----------------------------------------------+
|arrays_overlap(array(1, 2, 3, 3), array(4, 5))|
+----------------------------------------------+
|                                         false|
+----------------------------------------------+



In [85]:
spark.sql("SELECT array_sort(array('b','d',2,3, null, 'c', 'a'))").show(false)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|array_sort(array(b, d, 2, 3, NULL, c, a), lambdafunction((IF(((namedlambdavariable() IS NULL) AND (namedlambdavariable() IS NULL)), 0, (IF((namedlambdavariable() IS NULL), 1, (IF((namedlambdavariable() IS NULL), -1, (IF((namedlambdavariable() < namedlambdavariable()), -1, (IF((namedlambdavariable() > namedlambdavariable()), 1, 0)))))))))), namedlambdavariable(), namedlambdavariable()))|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [86]:
spark.sql("SELECT concat(array(1, 2, 3), array(4, 5), array(6))").show()

+---------------------------------------------+
|concat(array(1, 2, 3), array(4, 5), array(6))|
+---------------------------------------------+
|                           [1, 2, 3, 4, 5, 6]|
+---------------------------------------------+



In [91]:
spark.sql("SELECT flatten(  array( array(1,2,3), array(3, 4)))").show()

+-------------------------------------------+
|flatten(array(array(1, 2, 3), array(3, 4)))|
+-------------------------------------------+
|                            [1, 2, 3, 3, 4]|
+-------------------------------------------+



In [92]:
spark.sql("SELECT array_repeat('123', 3)").show()

+--------------------+
|array_repeat(123, 3)|
+--------------------+
|     [123, 123, 123]|
+--------------------+



In [93]:
spark.sql("SELECT reverse(array(2, 1, 4, 3))").show()

+--------------------------+
|reverse(array(2, 1, 4, 3))|
+--------------------------+
|              [3, 4, 1, 2]|
+--------------------------+



In [98]:
spark.sql("""SELECT sequence(1, 5)""").show()
spark.sql("""SELECT sequence(5, 1)""").show()
spark.sql("""SELECT sequence(to_date('2018-01-01'), to_date('2018-03-01'), interval 1 month)""").show(false)

+---------------+
| sequence(1, 5)|
+---------------+
|[1, 2, 3, 4, 5]|
+---------------+

+---------------+
| sequence(5, 1)|
+---------------+
|[5, 4, 3, 2, 1]|
+---------------+

+----------------------------------------------------------------------+
|sequence(to_date(2018-01-01), to_date(2018-03-01), INTERVAL '1' MONTH)|
+----------------------------------------------------------------------+
|[2018-01-01, 2018-02-01, 2018-03-01]                                  |
+----------------------------------------------------------------------+



In [103]:
spark.sql("SELECT shuffle(array(1, 20, null, 3))").show()

+------------------------------+
|shuffle(array(1, 20, NULL, 3))|
+------------------------------+
|              [1, null, 3, 20]|
+------------------------------+



In [106]:
spark.sql("SELECT slice(array(1, 2, 3, 4), -2, 2)").show()
spark.sql("SELECT slice(array(1, 2, 3, 4,5,6,7,8,9), -2, 2)").show()
spark.sql("SELECT slice(array(1, 2, 3, 4,5,6,7,8,9), 4, 3)").show()

+-------------------------------+
|slice(array(1, 2, 3, 4), -2, 2)|
+-------------------------------+
|                         [3, 4]|
+-------------------------------+

+----------------------------------------------+
|slice(array(1, 2, 3, 4, 5, 6, 7, 8, 9), -2, 2)|
+----------------------------------------------+
|                                        [8, 9]|
+----------------------------------------------+

+---------------------------------------------+
|slice(array(1, 2, 3, 4, 5, 6, 7, 8, 9), 4, 3)|
+---------------------------------------------+
|                                    [4, 5, 6]|
+---------------------------------------------+



In [109]:
spark.sql("""SELECT arrays_zip(array(1, 2),array(2, 3), array(3, 4))""").show(false)

+-------------------------------------------------+
|arrays_zip(array(1, 2), array(2, 3), array(3, 4))|
+-------------------------------------------------+
|[{1, 2, 3}, {2, 3, 4}]                           |
+-------------------------------------------------+



In [111]:
spark.sql("SELECT element_at(array(1, 2, 3,3,12,2,23,5,56,6,34,43,123,4), 7)").show()

+---------------------------------------------------------------------+
|element_at(array(1, 2, 3, 3, 12, 2, 23, 5, 56, 6, 34, 43, 123, 4), 7)|
+---------------------------------------------------------------------+
|                                                                   23|
+---------------------------------------------------------------------+



In [112]:
spark.sql("SELECT cardinality(array('b','d', 'c', 'a'))").show()

+------------------------------+
|cardinality(array(b, d, c, a))|
+------------------------------+
|                             4|
+------------------------------+



### Map functions

In [113]:
spark.sql("SELECT map_from_arrays(array(1.0,3.0), array('2', '4'))").show()

+---------------------------------------------+
|map_from_arrays(array(1.0, 3.0), array(2, 4))|
+---------------------------------------------+
|                         {1.0 -> 2, 3.0 -> 4}|
+---------------------------------------------+



In [114]:
spark.sql("SELECT map_from_entries(array(struct(1,'a'), struct(2, 'b')))").show()

+---------------------------------------------------+
|map_from_entries(array(struct(1, a), struct(2, b)))|
+---------------------------------------------------+
|                                   {1 -> a, 2 -> b}|
+---------------------------------------------------+



In [121]:
spark.sql("SELECT map_concat( map(1, 'a', 2, 'b'), map(4, 'c', 3, 'd') )").show(false)

+--------------------------------------------+
|map_concat(map(1, a, 2, b), map(4, c, 3, d))|
+--------------------------------------------+
|{1 -> a, 2 -> b, 4 -> c, 3 -> d}            |
+--------------------------------------------+



In [122]:
spark.sql("SELECT element_at(map(1, 'a', 2, 'b'), 2)").show()

+------------------------------+
|element_at(map(1, a, 2, b), 2)|
+------------------------------+
|                             b|
+------------------------------+



In [123]:
spark.sql("SELECT cardinality(map_concat(map(1, 'a', 2, 'b'),map(3,'c',4,'c')))").show()

+---------------------------------------------------------+
|cardinality(map_concat(map(1, a, 2, b), map(3, c, 4, c)))|
+---------------------------------------------------------+
|                                                        4|
+---------------------------------------------------------+



## High Order Functions HOF

`in SQL`

`transform(values, value->lambda expression)`

Creamos un DataFrame para poner en practica la función anterior

In [14]:
// Create DataFrame with two rows of two arrays (tempc1, tempc2)
val t1 = Array(35, 36, 32, 30, 40, 42, 38)
val t2 = Array(31, 32, 34, 55, 56)
val tC = Seq(t1, t2).toDF("celsius")
tC.createOrReplaceTempView("tC")

t1: Array[Int] = Array(35, 36, 32, 30, 40, 42, 38)
t2: Array[Int] = Array(31, 32, 34, 55, 56)
tC: org.apache.spark.sql.DataFrame = [celsius: array<int>]


In [15]:
tC.show(false)

+----------------------------+
|celsius                     |
+----------------------------+
|[35, 36, 32, 30, 40, 42, 38]|
|[31, 32, 34, 55, 56]        |
+----------------------------+



### `transform`

In [16]:
spark.sql(""" select celsius,
                     transform(celsius, t -> (t*9) div 5 + 32) as fahrenheit,
                     transform(celsius, t -> t+273)            as kelvin
              from tC
          """).show(false)

+----------------------------+-------------------------------+-----------------------------------+
|celsius                     |fahrenheit                     |kelvin                             |
+----------------------------+-------------------------------+-----------------------------------+
|[35, 36, 32, 30, 40, 42, 38]|[95, 96, 89, 86, 104, 107, 100]|[308, 309, 305, 303, 313, 315, 311]|
|[31, 32, 34, 55, 56]        |[87, 89, 93, 131, 132]         |[304, 305, 307, 328, 329]          |
+----------------------------+-------------------------------+-----------------------------------+



### `filter`

In [17]:
spark.sql(""" select celsius,
                     filter(celsius, t -> t>38) as altas
              from tC
          """).show(false)

+----------------------------+--------+
|celsius                     |altas   |
+----------------------------+--------+
|[35, 36, 32, 30, 40, 42, 38]|[40, 42]|
|[31, 32, 34, 55, 56]        |[55, 56]|
+----------------------------+--------+



### `exists`

In [20]:
spark.sql("""select celsius,
                   exists(celsius, t -> t=38) as prueba38
             from tC""").show(false)

+----------------------------+--------+
|celsius                     |prueba38|
+----------------------------+--------+
|[35, 36, 32, 30, 40, 42, 38]|true    |
|[31, 32, 34, 55, 56]        |false   |
+----------------------------+--------+



### `reduce`

In [19]:
spark.sql(""" select celsius,
                     reduce(celsius,
                            0,
                            (t,acc) -> t + acc ,
                            acc     -> (acc div size(celsius) * 9 div 5) + 32  
                            ) as avgFah
              from tC
          """).show(false)

org.apache.spark.sql.AnalysisException:  Undefined function: 'reduce'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 2 pos 21