# Chapter 6 - Working with different types of data

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("chapter6").getOrCreate()

In [2]:
df = spark.read.format("csv")\
            .option("header", "true")\
            .option("inferSchema", "true")\
            .load("../data/retail-data/by-day/2010-12-01.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [3]:
df.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [4]:
from pyspark.sql.functions import lit

df.select(lit("five"), lit(1), lit(1.0))

DataFrame[five: string, 1: int, 1.0: double]

## Working with booleans

In [5]:
from pyspark.sql.functions import col
df.where(col("InvoiceNo") != 536365).select("InvoiceNo", "Description").show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



In [6]:
# unit price > 600 or description contains POSTAGE
# DOT is in stockcode

from pyspark.sql.functions import instr

price_filter = col("UnitPrice") > 600
desc_filter = instr(col("Description"), "POSTAGE") >= 1 
df.where(df.StockCode.isin("DOT")).where(price_filter | desc_filter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [7]:
# can also use col() function since it returns a column as well
price_filter = df.UnitPrice > 600  # alternative without using col() function
desc_filter = instr(df.Description, "POSTAGE") >= 1 # alternative without using col() function
df.where(df.StockCode.isin("DOT")).where(price_filter | desc_filter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [8]:
# create a column with a boolean value
dot_filter = col("StockCode") == "DOT"
df.withColumn("IsExpensive", dot_filter & (price_filter | desc_filter)).where("IsExpensive").show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|IsExpensive|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|       true|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|       true|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+-----------+



In [9]:
df.where(col("Description").eqNullSafe("DOTCOM POSTAGE")).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



## Working with numbers

In [10]:
from pyspark.sql.functions import pow, expr

fabricated_quality = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select("CustomerID", fabricated_quality.alias("realQuantity")).show(5)

+----------+------------------+
|CustomerID|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
|   17850.0|             489.0|
|   17850.0|          418.7156|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 5 rows



In [11]:
df.stat.corr("Quantity", "UnitPrice")

-0.04112314436835551

In [12]:
from pyspark.sql.functions import corr

df.select(corr(col("Quantity"), col("UnitPrice"))).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



In [13]:
df.select(col("Quantity"), col("UnitPrice")).describe().show()

+-------+------------------+------------------+
|summary|          Quantity|         UnitPrice|
+-------+------------------+------------------+
|  count|              3108|              3108|
|   mean| 8.627413127413128| 4.151946589446603|
| stddev|26.371821677029203|15.638659854603892|
|    min|               -24|               0.0|
|    max|               600|            607.49|
+-------+------------------+------------------+



In [14]:
# derive the total purchase price and derive quantiles
df.withColumn("Purchase", col("Quantity") * col("UnitPrice")).approxQuantile("Purchase", [0.25, 0.5, 0.75, 0.99], 0.05)

[3.36, 8.4, 15.36, 1627.2]

In [15]:
from pyspark.sql.functions import monotonically_increasing_id

df.withColumn("unique_id", monotonically_increasing_id()).show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+---------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|unique_id|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+---------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|        0|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|        1|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|        2|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|        3|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|        4|
+---------+---------+--------------------+--------+-----

## Working with strings

In [16]:
# first letter of all words become a capital

from pyspark.sql.functions import initcap

df.select(initcap(col("Description"))).show(5)

+--------------------+
|initcap(Description)|
+--------------------+
|White Hanging Hea...|
| White Metal Lantern|
|Cream Cupid Heart...|
|Knitted Union Fla...|
|Red Woolly Hottie...|
+--------------------+
only showing top 5 rows



In [17]:
from pyspark.sql.functions import upper, lower

df.select(upper(col("Description")), lower(col("Description")), upper(lower(col("Description")))).show(5)

+--------------------+--------------------+-------------------------+
|  upper(Description)|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
|CREAM CUPID HEART...|cream cupid heart...|     CREAM CUPID HEART...|
|KNITTED UNION FLA...|knitted union fla...|     KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|red woolly hottie...|     RED WOOLLY HOTTIE...|
+--------------------+--------------------+-------------------------+
only showing top 5 rows



In [18]:
from pyspark.sql.functions import ltrim, rtrim, lpad, rpad, trim

df.select(ltrim(lit("  HELLO   ")).alias("ltrim"), 
          rtrim(lit("   HELLO  ")).alias("rtrim"), 
          trim(lit("   HELLO  ")).alias("trim"), 
          lpad(lit("HELLO"), 2, ' ').alias("lp"), 
          rpad(lit("HELLO"), 20, ' ').alias("rp")).show(2)

+--------+--------+-----+---+--------------------+
|   ltrim|   rtrim| trim| lp|                  rp|
+--------+--------+-----+---+--------------------+
|HELLO   |   HELLO|HELLO| HE|HELLO               |
|HELLO   |   HELLO|HELLO| HE|HELLO               |
+--------+--------+-----+---+--------------------+
only showing top 2 rows



** On Regular Expressions **: you have a problem and you want to solve it with regular expressions, now you have two problems. I will pass. Please forgive.

## Working with dates and timestamps

In [19]:
from pyspark.sql.functions import current_date, current_timestamp

df = spark.range(10)
dateDF = df.withColumn("today", current_date()).withColumn("now", current_timestamp())
dateDF.printSchema()
dateDF.show(1)

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)

+---+----------+--------------------+
| id|     today|                 now|
+---+----------+--------------------+
|  0|2018-10-14|2018-10-14 18:32:...|
+---+----------+--------------------+
only showing top 1 row



In [20]:
from pyspark.sql.functions import date_add, date_sub

dateDF.select(date_add(dateDF.today, 5), date_sub(dateDF.today, 5)).show(1)

+------------------+------------------+
|date_add(today, 5)|date_sub(today, 5)|
+------------------+------------------+
|        2018-10-19|        2018-10-09|
+------------------+------------------+
only showing top 1 row



In [21]:
from pyspark.sql.functions import datediff

dateDF.withColumn("week_ago", date_sub(col("today"), 7)).select(datediff("today", "week_ago").alias("diff")).show(1)

+----+
|diff|
+----+
|   7|
+----+
only showing top 1 row



In [22]:
from pyspark.sql.functions import months_between, to_date

# to_date does not throw an error, it returns null if cannot parse
date_format = "yyyy-MM-dd"
clean_date_df = dateDF.select(to_date(lit("2016-01-01"), date_format).alias("start"), to_date(lit("2016-05-01"), date_format).alias("end"))
clean_date_df.select(months_between(col("start"), col("end")).alias("months_diff")).show(1)

+-----------+
|months_diff|
+-----------+
|       -4.0|
+-----------+
only showing top 1 row



In [23]:
from pyspark.sql.functions import to_timestamp

clean_date_df.select(to_timestamp(col("start").alias("start_ts"))).show(1)

+-----------------------------------+
|to_timestamp(`start` AS `start_ts`)|
+-----------------------------------+
|                2016-01-01 00:00:00|
+-----------------------------------+
only showing top 1 row



### Working with missing data

In [24]:
from pyspark.sql.functions import coalesce

df = spark.read.format("csv")\
            .option("header", "true")\
            .option("inferSchema", "true")\
            .load("../data/retail-data/by-day/2010-12-01.csv")
            
# returns the first row without nulls on the data
# in this case, none are them are null, so returns the first column
df.select(coalesce(col("Description"), col("CustomerID"))).show(5)

+---------------------------------+
|coalesce(Description, CustomerID)|
+---------------------------------+
|             WHITE HANGING HEA...|
|              WHITE METAL LANTERN|
|             CREAM CUPID HEART...|
|             KNITTED UNION FLA...|
|             RED WOOLLY HOTTIE...|
+---------------------------------+
only showing top 5 rows



In [25]:
# drops row with na on it
print("Before dropping nan, there are {} records".format(df.count()))
print("After dropping nan, there are {} records".format(df.na.drop().count()))

Before dropping nan, there are 3108 records
After dropping nan, there are 1968 records


In [26]:
# drops row with na on it
print("Before dropping nan, there are {} records".format(df.count()))
print("After dropping nan, there are {} records".format(df.na.drop(subset=["StockCode", "InvoiceNo"]).count()))

Before dropping nan, there are 3108 records
After dropping nan, there are 3108 records


In [27]:
# fill all nans with the same value
df.na.fill("This string")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [28]:
fill_map = {"StockCode":5, "Description":"No description available"}
df.na.fill(fill_map).where(col("Description") == fill_map["Description"]).count()

10

In [29]:
df.na.replace([""], ["NOT AVAILABLE"], "Description")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

## Working with complex types
### Structs

In [30]:
from pyspark.sql.functions import struct

complex_df = df.select (struct("InvoiceNo", "Description").alias("complex"))
complex_df.show(1)

+--------------------+
|             complex|
+--------------------+
|[536365, WHITE HA...|
+--------------------+
only showing top 1 row



In [31]:
complex_df.select("complex.Description").show(2)

+--------------------+
|         Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
+--------------------+
only showing top 2 rows



In [32]:
complex_df.select(col("complex").getField("Description")).show(2)

+--------------------+
| complex.Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
+--------------------+
only showing top 2 rows



In [33]:
complex_df.select(col("complex.*")).show(2)

+---------+--------------------+
|InvoiceNo|         Description|
+---------+--------------------+
|   536365|WHITE HANGING HEA...|
|   536365| WHITE METAL LANTERN|
+---------+--------------------+
only showing top 2 rows



### Array

In [34]:
from pyspark.sql.functions import split

df.select(split(col("Description"), " ")).show(2)

+---------------------+
|split(Description,  )|
+---------------------+
| [WHITE, HANGING, ...|
| [WHITE, METAL, LA...|
+---------------------+
only showing top 2 rows



In [35]:
df.select(split(col("Description"), " ").alias("array_field")).selectExpr("array_field[0]").show(2)

+--------------+
|array_field[0]|
+--------------+
|         WHITE|
|         WHITE|
+--------------+
only showing top 2 rows



In [36]:
from pyspark.sql.functions import size

df.select(size(split(col("Description"), " ").alias("array_field")).alias("size_of_array")).show(2)

+-------------+
|size_of_array|
+-------------+
|            5|
|            3|
+-------------+
only showing top 2 rows



In [37]:
from pyspark.sql.functions import array_contains

df.select(array_contains(split(col("Description"), " ").alias("array_field"), "WHITE").alias("contains_white")).show(2)

+--------------+
|contains_white|
+--------------+
|          true|
|          true|
+--------------+
only showing top 2 rows



In [38]:
from pyspark.sql.functions import explode

df.withColumn("splitted", split(col("Description"), " "))\
  .withColumn("exploded", explode(col("splitted")))\
  .select("Description", "splitted", "exploded").show(2)

+--------------------+--------------------+--------+
|         Description|            splitted|exploded|
+--------------------+--------------------+--------+
|WHITE HANGING HEA...|[WHITE, HANGING, ...|   WHITE|
|WHITE HANGING HEA...|[WHITE, HANGING, ...| HANGING|
+--------------------+--------------------+--------+
only showing top 2 rows



### Maps

In [39]:
from pyspark.sql.functions import create_map

df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map")).show(2)

+--------------------+
|         complex_map|
+--------------------+
|[WHITE HANGING HE...|
|[WHITE METAL LANT...|
+--------------------+
only showing top 2 rows



In [40]:
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("complex_map['WHITE METAL LANTERN']").show(2)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
+--------------------------------+
only showing top 2 rows



In [41]:
# exploding map types will convert them into columns
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("explode(complex_map)").show(2)

+--------------------+------+
|                 key| value|
+--------------------+------+
|WHITE HANGING HEA...|536365|
| WHITE METAL LANTERN|536365|
+--------------------+------+
only showing top 2 rows



## User defined functions

Note that you can register a function in Scala (in a separate project), create a jar file and register it in python, in order to gain performance (see https://medium.com/wbaa/using-scala-udfs-in-pyspark-b70033dd69b9)

In [42]:
udf_df = spark.range(5).toDF("num")

def power3 (x):
    return x**3

power3(2)

8

In [43]:
from pyspark.sql.functions import udf

power3udf = udf(power3)

udf_df.select(power3udf("num")).show()

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
|          8|
|         27|
|         64|
+-----------+



In [44]:
from pyspark.sql.types import IntegerType, DoubleType

spark.udf.register("power3py", power3, DoubleType())

# does not work as range creates integers and power3 expects doubletype
udf_df.selectExpr("power3py(num)").show()

+-------------+
|power3py(num)|
+-------------+
|         null|
|         null|
|         null|
|         null|
|         null|
+-------------+



In [45]:
spark.udf.register("power3py", power3, IntegerType())
udf_df.selectExpr("power3py(num)").show()

+-------------+
|power3py(num)|
+-------------+
|            0|
|            1|
|            8|
|           27|
|           64|
+-------------+

