In [1]:
try:
    !pip install pyspark=="2.4.5"  --quiet
except:
 print("Running throw py file.")

In [10]:
from pyspark import SparkContext as sc
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import SparkFiles
from pyspark.sql.types import StringType, BooleanType, IntegerType, FloatType, ArrayType
import pyspark

In [3]:
spark = SparkSession\
        .builder\
        .appName("Estudo Spark - UDF Functions - Fabio Kfouri")\
        .getOrCreate()

## Leitura de dados usando uma fonte pública

Fonte de Dados:

https://data.humdata.org/dataset/faostat-prices-for-brazil

In [4]:
spark.sparkContext.addFile('https://data.humdata.org/dataset/bdf7bcca-28ae-47c5-8993-c87a7c5c04c0/resource/c16c15f5-efaf-4950-b848-94935987d312/download/producer-prices_bra.csv')

In [5]:
df = spark.read.csv(SparkFiles.get("producer-prices_bra.csv"), header = True, sep = ",")

In [6]:
df.show(5, False)

+-------------+-----------+----------+---------+-------------+---------------+---------------+------------+--------------------------+---------+----------+-----------+------------+---------------+--------------------+----+
|Iso3         |StartDate  |EndDate   |Area Code|Area         |Item Code      |Item           |Element Code|Element                   |Year Code|Year      |Months Code|Months      |Unit           |Value               |Flag|
+-------------+-----------+----------+---------+-------------+---------------+---------------+------------+--------------------------+---------+----------+-----------+------------+---------------+--------------------+----+
|#country+code|#date+start|#date+end |null     |#country+name|#indicator+code|#indicator+name|null        |null                      |null     |#date+year|null       |null        |#indicator+type|#indicator+value+num|null|
|BRA          |1991-01-01 |1991-12-31|21       |Brazil       |515            |Apples         |5530        |P

In [7]:
df.printSchema()

root
 |-- Iso3: string (nullable = true)
 |-- StartDate: string (nullable = true)
 |-- EndDate: string (nullable = true)
 |-- Area Code: string (nullable = true)
 |-- Area: string (nullable = true)
 |-- Item Code: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Element Code: string (nullable = true)
 |-- Element: string (nullable = true)
 |-- Year Code: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Months Code: string (nullable = true)
 |-- Months: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- Value: string (nullable = true)
 |-- Flag: string (nullable = true)



In [8]:
df = df.filter("Iso3 <> '#country+code'")\
    .withColumn('Value',df["Value"].cast('float'))\
    .withColumn('StartDate', F.to_date('StartDate', 'yyyy-MM-dd'))\
    .withColumn('EndDate', F.to_date('EndDate', 'yyyy-MM-dd'))

In [9]:
df.printSchema()

root
 |-- Iso3: string (nullable = true)
 |-- StartDate: date (nullable = true)
 |-- EndDate: date (nullable = true)
 |-- Area Code: string (nullable = true)
 |-- Area: string (nullable = true)
 |-- Item Code: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Element Code: string (nullable = true)
 |-- Element: string (nullable = true)
 |-- Year Code: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Months Code: string (nullable = true)
 |-- Months: string (nullable = true)
 |-- Unit: string (nullable = true)
 |-- Value: float (nullable = true)
 |-- Flag: string (nullable = true)



## Criando um boolean UDF

In [11]:
short_udf = F.udf(lambda x:\
                 True if not x or len(x) < 6 else False, BooleanType())

In [14]:
df.select(short_udf('item')).alias("is short").distinct().show()

+--------------+
|<lambda>(item)|
+--------------+
|          true|
|         false|
+--------------+



In [21]:
df.select("item").distinct().show(10,False)

+------------------------+
|item                    |
+------------------------+
|Meat live weight, pig   |
|Fruit excl Melons, Total|
|Fibre Crops Primary     |
|Milk, whole fresh cow   |
|Pineapples              |
|Meat, turkey            |
|Potatoes                |
|Roots and Tubers, Total |
|Coffee, green           |
|Cotton lint             |
+------------------------+
only showing top 10 rows



## Trabalhando com array

In [27]:
df1 = df.withColumn('words', F.split('item', '[ ]'))

In [29]:
df1.show(4)

+----+----------+----------+---------+------+---------+------+------------+--------------------+---------+----+-----------+------------+----+---------+----+--------+
|Iso3| StartDate|   EndDate|Area Code|  Area|Item Code|  Item|Element Code|             Element|Year Code|Year|Months Code|      Months|Unit|    Value|Flag|   words|
+----+----------+----------+---------+------+---------+------+------------+--------------------+---------+----+-----------+------------+----+---------+----+--------+
| BRA|1991-01-01|1991-12-31|       21|Brazil|      515|Apples|        5530|Producer Price (L...|     1991|1991|       7021|Annual value| LCU| 105691.0|   *|[Apples]|
| BRA|1992-01-01|1992-12-31|       21|Brazil|      515|Apples|        5530|Producer Price (L...|     1992|1992|       7021|Annual value| LCU|1969620.0|   *|[Apples]|
| BRA|1993-01-01|1993-12-31|       21|Brazil|      515|Apples|        5530|Producer Price (L...|     1993|1993|       7021|Annual value| LCU|  26770.0|   *|[Apples]|
| BR