In [0]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [0]:
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [0]:
spark = (
    SparkSession
    .builder
    .config("spark.driver.memory", "10g")
    .config("spark.sql.files.maxPartitionBytes", "268435456")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [0]:
customers_file = "dbfs:/FileStore/customer"
df_customers = spark.read.parquet(customers_file)

In [0]:
df_customers.show(5, False)

+----------+-------------+---+------+----------+-----+-----------+
|cust_id   |name         |age|gender|birthday  |zip  |city       |
+----------+-------------+---+------+----------+-----+-----------+
|C007YEYTX9|Aaron Abbott |34 |Female|7/13/1991 |97823|boston     |
|C00B971T1J|Aaron Austin |37 |Female|12/16/2004|30332|chicago    |
|C00WRSJF1Q|Aaron Barnes |29 |Female|3/11/1977 |23451|denver     |
|C01AZWQMF3|Aaron Barrett|31 |Male  |7/9/1998  |46613|los_angeles|
|C01BKUFRHA|Aaron Becker |54 |Male  |11/24/1979|40284|san_diego  |
+----------+-------------+---+------+----------+-----+-----------+
only showing top 5 rows



In [0]:
df_base = (
    df_customers
    .filter(F.col("city") == "boston")
    .withColumn(
        "customer_group", 
        F.when(
            F.col("age").between(20, 30), 
            F.lit("young") 
        )
        .when(
            F.col("age").between(31, 50), 
            F.lit("mid") 
        )
        .when(
            F.col("age") > 51, 
            F.lit("old") 
        )
        .otherwise(F.lit("kid"))
     )
    .select("cust_id", "name", "age", "gender", "birthday", "zip", "city", "customer_group")
)

df_base.cache() 
df_base.show(5, False)

Out[16]: DataFrame[cust_id: string, name: string, age: string, gender: string, birthday: string, zip: string, city: string, customer_group: string]+----------+--------------+---+------+---------+-----+------+--------------+
|cust_id   |name          |age|gender|birthday |zip  |city  |customer_group|
+----------+--------------+---+------+---------+-----+------+--------------+
|C007YEYTX9|Aaron Abbott  |34 |Female|7/13/1991|97823|boston|mid           |
|C08XAQUY73|Aaron Lambert |54 |Female|11/5/1966|75218|boston|old           |
|C094P1VXF9|Aaron Lindsey |24 |Male  |9/21/1990|29399|boston|young         |
|C097SHE1EF|Aaron Lopez   |22 |Female|4/18/2001|82129|boston|young         |
|C0DTC6436T|Aaron Schwartz|52 |Female|7/9/1962 |57192|boston|old           |
+----------+--------------+---+------+---------+-----+------+--------------+
only showing top 5 rows



In [0]:
#df_base.unpersist()

Out[13]: DataFrame[cust_id: string, name: string, age: string, gender: string, birthday: string, zip: string, city: string, customer_group: string]

In [0]:
df1 = (
    df_base
    .withColumn("test_column_1", F.lit("test_column_1"))
    .withColumn("birth_year", F.split("birthday", "/").getItem(2))
)

df1.explain(True)
df1.show(5, False)

== Parsed Logical Plan ==
'Project [cust_id#2, name#3, age#4, gender#5, birthday#6, zip#7, city#8, customer_group#697, test_column_1#1303, split('birthday, /, -1)[2] AS birth_year#1313]
+- Project [cust_id#2, name#3, age#4, gender#5, birthday#6, zip#7, city#8, customer_group#697, test_column_1 AS test_column_1#1303]
   +- Project [cust_id#2, name#3, age#4, gender#5, birthday#6, zip#7, city#8, customer_group#697]
      +- Project [cust_id#2, name#3, age#4, gender#5, birthday#6, zip#7, city#8, CASE WHEN ((cast(age#4 as int) >= 20) AND (cast(age#4 as int) <= 30)) THEN young WHEN ((cast(age#4 as int) >= 31) AND (cast(age#4 as int) <= 50)) THEN mid WHEN (cast(age#4 as int) > 51) THEN old ELSE kid END AS customer_group#697]
         +- Filter (city#8 = boston)
            +- Relation [cust_id#2,name#3,age#4,gender#5,birthday#6,zip#7,city#8] parquet

== Analyzed Logical Plan ==
cust_id: string, name: string, age: string, gender: string, birthday: string, zip: string, city: string, customer_gr

In [0]:
df_base.unpersist()

Out[26]: DataFrame[cust_id: string, name: string, age: string, gender: string, birthday: string, zip: string, city: string, customer_group: string]

In [0]:
df_base.persist(StorageLevel.DISK_ONLY)

Out[27]: DataFrame[cust_id: string, name: string, age: string, gender: string, birthday: string, zip: string, city: string, customer_group: string]

In [0]:
df2 = (
    df_base
    .withColumn("test_column_1", F.lit("test_column_1"))
    .withColumn("birth_year", F.split("birthday", "/").getItem(2))
)

df1.show(5, False)

+----------+--------------+---+------+---------+-----+------+--------------+-------------+----------+
|cust_id   |name          |age|gender|birthday |zip  |city  |customer_group|test_column_1|birth_year|
+----------+--------------+---+------+---------+-----+------+--------------+-------------+----------+
|C007YEYTX9|Aaron Abbott  |34 |Female|7/13/1991|97823|boston|mid           |test_column_1|1991      |
|C08XAQUY73|Aaron Lambert |54 |Female|11/5/1966|75218|boston|old           |test_column_1|1966      |
|C094P1VXF9|Aaron Lindsey |24 |Male  |9/21/1990|29399|boston|young         |test_column_1|1990      |
|C097SHE1EF|Aaron Lopez   |22 |Female|4/18/2001|82129|boston|young         |test_column_1|2001      |
|C0DTC6436T|Aaron Schwartz|52 |Female|7/9/1962 |57192|boston|old           |test_column_1|1962      |
+----------+--------------+---+------+---------+-----+------+--------------+-------------+----------+
only showing top 5 rows

