In [1]:
import findspark
findspark.init("/opt/spark")

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Build (or reuse) a SparkSession that runs locally on 4 threads
# under the application name 'mi_app'.
spark = SparkSession.builder.master('local[4]').appName('mi_app').getOrCreate()

24/09/05 18:15:25 WARN Utils: Your hostname, zenlo resolves to a loopback address: 127.0.1.1, but we couldn't find any external IP address!
24/09/05 18:15:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/05 18:15:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/05 18:15:26 WARN MacAddressUtil: Failed to find a usable hardware address from the network interfaces; using random bytes: 28:87:80:ea:ef:88:74:0b


In [3]:
# Load the 2015 flight summary (JSON) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path - consider a configurable data directory.
df = spark.read.json('/home/gonzalo/Documents/notes/notes-spark/data/flight-data/json/2015-summary.json')

                                                                                

In [4]:
# Display the first two rows of the loaded DataFrame.
df.show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [17]:
# Build a struct ("complex") column from the two country columns, then read
# individual fields back out of that struct in two different ways.
# `F` (pyspark.sql.functions) is imported in the notebook's first cell.
# NOTE: the variable name `complex` shadows Python's builtin of the same name;
# it is kept because the schema cell that follows refers to it.
complex = (
    df
    .withColumn('complex', F.struct('DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME'))
    # Field access via Column.getField(...)
    .withColumn('get_field', F.col('complex').getField('ORIGIN_COUNTRY_NAME'))
    # Field access via a dotted column path
    .withColumn('get_field_2', F.col('complex.DEST_COUNTRY_NAME'))
    # NOTE(review): 'complex.*' presumably cannot be used with withColumn, since
    # it expands to several columns - use .select('complex.*') for that instead.
)

complex.show(2)

+-----------------+-------------------+-----+--------------------+---------+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|             complex|get_field|  get_field_2|
+-----------------+-------------------+-----+--------------------+---------+-------------+
|    United States|            Romania|   15|{United States, R...|  Romania|United States|
|    United States|            Croatia|    1|{United States, C...|  Croatia|United States|
+-----------------+-------------------+-----+--------------------+---------+-------------+
only showing top 2 rows



In [18]:
# Print the schema; in the recorded output `complex` is a struct with both
# country fields nested inside it.
complex.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)
 |-- complex: struct (nullable = false)
 |    |-- DEST_COUNTRY_NAME: string (nullable = true)
 |    |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- get_field: string (nullable = true)
 |-- get_field_2: string (nullable = true)



In [19]:
# Selecting 'complex.*' expands the struct, promoting each of its fields
# to a top-level column of the resulting DataFrame.
# NOTE(review): this rebinds `complex`, replacing the DataFrame assigned to
# that name earlier in the notebook.
complex = df.withColumn(
    'complex',
    F.struct('DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME'),
).select('complex.*')

complex.show(2)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+
only showing top 2 rows



In [27]:
# Splitting a string column yields an array column. The cell then shows
# three array operations: indexing an element, taking the size, and
# testing membership.
# NOTE: F.array_size is available from Spark 3.5 onward.
my_array = (
    df
    .withColumn('splitted', F.split('DEST_COUNTRY_NAME', ' '))
    # Index 1 is the second token of the split string.
    .withColumn('get_items', F.col('splitted')[1])
    .withColumn('get_size', F.array_size('splitted'))
    .withColumn('array_contains', F.array_contains('splitted', 'United'))
)

my_array.show(2)

+-----------------+-------------------+-----+----------------+---------+--------+--------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|        splitted|get_items|get_size|array_contains|
+-----------------+-------------------+-----+----------------+---------+--------+--------------+
|    United States|            Romania|   15|[United, States]|   States|       2|          true|
|    United States|            Croatia|    1|[United, States]|   States|       2|          true|
+-----------------+-------------------+-----+----------------+---------+--------+--------------+
only showing top 2 rows



In [28]:
# Print the schema; the recorded output shows `splitted` as an array of strings,
# `get_size` as integer and `array_contains` as boolean.
my_array.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)
 |-- splitted: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- get_items: string (nullable = true)
 |-- get_size: integer (nullable = true)
 |-- array_contains: boolean (nullable = true)



In [31]:
# Build a single-entry map column per row: destination country -> count.
my_map = df.withColumn(
    'create_map',
    F.create_map('DEST_COUNTRY_NAME', 'count'),
)

my_map.show(2)

+-----------------+-------------------+-----+--------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|          create_map|
+-----------------+-------------------+-----+--------------------+
|    United States|            Romania|   15|{United States ->...|
|    United States|            Croatia|    1|{United States -> 1}|
+-----------------+-------------------+-----+--------------------+
only showing top 2 rows



In [32]:
# Print the schema; the recorded output shows `create_map` as a map with
# string keys and long values.
my_map.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)
 |-- create_map: map (nullable = false)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)



In [43]:
# Map values can be looked up by key, much like a Python dict.
# Each row's map is destination -> origin; in the recorded output, rows whose
# map has no 'United States' key show NULL.
my_map_2 = (
    df
    .select(F.create_map('DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME').alias('my_map'))
    .select(F.expr("my_map['United States']"))
)

my_map_2.show(10, truncate=False)

+---------------------+
|my_map[United States]|
+---------------------+
|Romania              |
|Croatia              |
|Ireland              |
|NULL                 |
|India                |
|Singapore            |
|Grenada              |
|NULL                 |
|NULL                 |
|NULL                 |
+---------------------+
only showing top 10 rows

