# Columns & Expressions

In [1]:
// Evaluate the kernel-provided SparkSession handle to confirm it is available.
spark

Intitializing Scala interpreter ...

Spark Web UI available at http://5fcc63a3e7cf:4041
SparkContext available as 'sc' (version = 3.2.1, master = local[*], app id = local-1654181211879)
SparkSession available as 'spark'


res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@1d71bf29


In [2]:
// Evaluate the kernel-provided SparkContext handle to confirm it is available.
sc

res1: org.apache.spark.SparkContext = org.apache.spark.SparkContext@27429817


In [4]:
import org.apache.spark.sql.functions._

import org.apache.spark.sql.functions._


## Cosas del Notebook anterior

In [6]:
import org.apache.spark.sql.types._

import org.apache.spark.sql.types._


In [31]:
// Explicit schema for blogs.json.
// The third StructField argument is `nullable`; it is declared false for every
// column, although the printSchema output later in this notebook still reports
// nullable = true for the DataFrame read from JSON.
val schema = StructType(
  Seq[(String, DataType)](
    "Id"        -> IntegerType,
    "First"     -> StringType,
    "Last"      -> StringType,
    "Url"       -> StringType,
    "Published" -> StringType,
    "Hits"      -> IntegerType,
    "Campaigns" -> ArrayType(StringType)
  ).map { case (name, dataType) => StructField(name, dataType, nullable = false) }
)

schema: org.apache.spark.sql.types.StructType = StructType(StructField(Id,IntegerType,false), StructField(First,StringType,false), StructField(Last,StringType,false), StructField(Url,StringType,false), StructField(Published,StringType,false), StructField(Hits,IntegerType,false), StructField(Campaigns,ArrayType(StringType,true),false))


In [32]:
// Load blogs.json as a DataFrame, applying the user-defined schema.
val dfBlog = spark.read.schema(schema).format("json").load("blogs.json")

dfBlog: org.apache.spark.sql.DataFrame = [Id: int, First: string ... 5 more fields]


In [33]:
// Print the DataFrame without truncating long cell values.
dfBlog.show(truncate = false)

+---+---------+-------+-----------------+---------+-----+----------------------------+
|Id |First    |Last   |Url              |Published|Hits |Campaigns                   |
+---+---------+-------+-----------------+---------+-----+----------------------------+
|1  |Jules    |Damji  |https://tinyurl.1|1/4/2016 |4535 |[twitter, LinkedIn]         |
|2  |Brooke   |Wenig  |https://tinyurl.2|5/5/2018 |8908 |[twitter, LinkedIn]         |
|3  |Denny    |Lee    |https://tinyurl.3|6/7/2019 |7659 |[web, twitter, FB, LinkedIn]|
|4  |Tathagata|Das    |https://tinyurl.4|5/12/2018|10568|[twitter, FB]               |
|5  |Matei    |Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB, LinkedIn]|
|6  |Reynold  |Xin    |https://tinyurl.6|3/2/2015 |25568|[twitter, LinkedIn]         |
+---+---------+-------+-----------------+---------+-----+----------------------------+



# ¿Qué podemos hacer con las columnas de un DataFrame en spark?

In [11]:
// List the DataFrame's column names.
dfBlog.columns

res4: Array[String] = Array(Id, First, Last, Url, Published, Hits, Campaigns)


In [12]:
// Print the DataFrame's schema as a tree.
// NOTE(review): the captured output lists Published as `date`, while the schema
// cell declares it as StringType; the output likely comes from a run made before
// the schema cell was last edited (execution counter 12 vs 31) — re-run to refresh.
dfBlog.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: date (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [16]:
// Obtain a Column reference for "Id" from the DataFrame.
dfBlog.col("Id")

res9: org.apache.spark.sql.Column = Id


In [22]:
// Use expr to multiply the Hits column by 2.
dfBlog.select(expr("Hits*2")).show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



In [23]:
// Equivalent form: multiply the Hits column by 2 using the $"..." column syntax.
dfBlog.select($"Hits"*2).show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



In [27]:
// Using $"..." is the same as using col("...").
dfBlog.select(col("Hits")*2).show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



### Usar `*2` con un string


In [29]:
// Multiply the string column Last by 2: no error is raised in this run, but every row comes back null.
dfBlog.select(col("Last")*2).show()

+----------+
|(Last * 2)|
+----------+
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
+----------+



In [30]:
// The string multiplication in the previous cell returns null.

### Usar expresiones para condiciones

In [41]:
// Derive a new DataFrame in which the Last column is renamed to Apellido.
val df1 = dfBlog.withColumnRenamed(existingName = "Last", newName = "Apellido")

df1: org.apache.spark.sql.DataFrame = [Id: int, First: string ... 5 more fields]


In [42]:
// Print df1's schema to confirm that Last now appears as Apellido.
// printSchema is side-effecting (it prints), so it is called with explicit
// parentheses, matching the earlier dfBlog.printSchema() cell.
df1.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Apellido: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [45]:
// Add a boolean column NombreCol holding the result of the condition Hits > 10000.
dfBlog.withColumn("NombreCol", expr("Hits > 10000")).show()

+---+---------+-------+-----------------+---------+-----+--------------------+---------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|NombreCol|
+---+---------+-------+-----------------+---------+-----+--------------------+---------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|    false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|    false|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|    false|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|     true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|     true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|     true|
+---+---------+-------+-----------------+---------+-----+--------------------+---------+



### Concatenar cols

In [50]:
// Add a column Concatenacion that joins First and Last (no separator), referencing the columns with $"...".
dfBlog.withColumn("Concatenacion",concat($"First",$"Last")).show()

+---+---------+-------+-----------------+---------+-----+--------------------+-------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|Concatenacion|
+---+---------+-------+-----------------+---------+-----+--------------------+-------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|   JulesDamji|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|  BrookeWenig|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|     DennyLee|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]| TathagataDas|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...| MateiZaharia|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|   ReynoldXin|
+---+---------+-------+-----------------+---------+-----+--------------------+-------------+



In [56]:
// Concatenate First and Last into Concatenacion, building the column references with expr(...).
dfBlog.withColumn("Concatenacion",concat(expr("First"),expr("Last"))).show()

+---+---------+-------+-----------------+---------+-----+--------------------+-------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|Concatenacion|
+---+---------+-------+-----------------+---------+-----+--------------------+-------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|   JulesDamji|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|  BrookeWenig|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|     DennyLee|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]| TathagataDas|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...| MateiZaharia|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|   ReynoldXin|
+---+---------+-------+-----------------+---------+-----+--------------------+-------------+



### Similitud entre métodos

In [62]:
// Four ways of selecting the Hits column; each call prints the same first two rows.
// 1) by column name as a plain string
dfBlog.select("Hits").show(2)
// 2) through the col(...) function
dfBlog.select(col("Hits")).show(2)
// 3) through expr(...)
dfBlog.select(expr("Hits")).show(2)
// 4) through the $"..." column syntax
dfBlog.select($"Hits").show(2)

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows



### Ordenar DF

In [68]:
// Show the rows ordered by Id, highest first.
dfBlog.orderBy(desc("Id")).show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



Mirar todo mejor en el archivo Chapter3.ipynb