In [0]:
import warnings
warnings.filterwarnings("ignore")

In [0]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [0]:
dbutils.library.restartPython()

In [0]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [0]:
transactions_file = "dbfs:/FileStore/transaction"
df_transactions = spark.read.parquet(transactions_file)
df_transactions.show(5)

+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
|   cust_id|start_date|  end_date|         txn_id|      date|year|month|day| expense_type|   amt|       city|
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
|C0YDPQWPBJ|2010-07-01|2018-12-01|TZ5SMKZY9S03OQJ|2018-10-07|2018|   10|  7|Entertainment| 10.42|     boston|
|C0YDPQWPBJ|2010-07-01|2018-12-01|TYIAPPNU066CJ5R|2016-03-27|2016|    3| 27| Motor/Travel| 44.34|   portland|
|C0YDPQWPBJ|2010-07-01|2018-12-01|TETSXIK4BLXHJ6W|2011-04-11|2011|    4| 11|Entertainment|  3.18|    chicago|
|C0YDPQWPBJ|2010-07-01|2018-12-01|TQKL1QFJY3EM8LO|2018-02-22|2018|    2| 22|    Groceries|268.97|los_angeles|
|C0YDPQWPBJ|2010-07-01|2018-12-01|TYL6DFP09PPXMVB|2010-10-16|2010|   10| 16|Entertainment|  2.66|    chicago|
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
only showi

In [0]:
customers_file = "dbfs:/FileStore/customer"
df_customers = spark.read.parquet(customers_file)
df_customers.show(5)

+----------+-------------+---+------+----------+-----+-----------+
|   cust_id|         name|age|gender|  birthday|  zip|       city|
+----------+-------------+---+------+----------+-----+-----------+
|C007YEYTX9| Aaron Abbott| 34|Female| 7/13/1991|97823|     boston|
|C00B971T1J| Aaron Austin| 37|Female|12/16/2004|30332|    chicago|
|C00WRSJF1Q| Aaron Barnes| 29|Female| 3/11/1977|23451|     denver|
|C01AZWQMF3|Aaron Barrett| 31|  Male|  7/9/1998|46613|los_angeles|
|C01BKUFRHA| Aaron Becker| 54|  Male|11/24/1979|40284|  san_diego|
+----------+-------------+---+------+----------+-----+-----------+
only showing top 5 rows



## **Narrow-Transformation**


In [0]:
#Narrow transformation

df_narrow_transform = (
    df_customers
    .filter(F.col("city") == "boston")
    .withColumn("first_name", F.split("name", " ").getItem(0))
    .withColumn("last_name", F.split("name", " ").getItem(1))
    .withColumn("age", F.col("age") + F.lit(5))
    .select("cust_id", "first_name", "last_name", "age", "gender", "birthday")
)

df_narrow_transform.show(5, False)
df_narrow_transform.explain(True)

+----------+----------+---------+----+------+---------+
|cust_id   |first_name|last_name|age |gender|birthday |
+----------+----------+---------+----+------+---------+
|C007YEYTX9|Aaron     |Abbott   |39.0|Female|7/13/1991|
|C08XAQUY73|Aaron     |Lambert  |59.0|Female|11/5/1966|
|C094P1VXF9|Aaron     |Lindsey  |29.0|Male  |9/21/1990|
|C097SHE1EF|Aaron     |Lopez    |27.0|Female|4/18/2001|
|C0DTC6436T|Aaron     |Schwartz |57.0|Female|7/9/1962 |
+----------+----------+---------+----+------+---------+
only showing top 5 rows

== Parsed Logical Plan ==
'Project ['cust_id, 'first_name, 'last_name, 'age, 'gender, 'birthday]
+- Project [cust_id#1037, name#1038, (cast(age#1039 as double) + cast(5 as double)) AS age#1114, gender#1040, birthday#1041, zip#1042, city#1043, first_name#1095, last_name#1104]
   +- Project [cust_id#1037, name#1038, age#1039, gender#1040, birthday#1041, zip#1042, city#1043, first_name#1095, split(name#1038,  , -1)[1] AS last_name#1104]
      +- Project [cust_id#1037, n

In [0]:
df_narrow_transform.write.format('noop').mode('overwrite').save('dbfs:/FileStore/dummy')

In [0]:
df_narrow_transform.show(5, False)

## **Wide-Transformation**

### Repartition

In [0]:
df_transactions.rdd.getNumPartitions()

Out[8]: 12

In [0]:
df_transactions.show(4)

In [0]:
df_customers.rdd.getNumPartitions()

Out[9]: 1

In [0]:
#display(df_transactions.withColumn('partitionId',F.spark_partition_id()).groupBy('partitionId').count())

In [0]:
df_transactions.write.format('noop').mode('overwrite').save('dbfs:/FileStore/dummy')

In [0]:
df_transactions.repartition(8).explain(True)

### Coalesce

In [0]:
df_transactions.coalesce(2).explain(True)

### Joins

In [0]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)

In [0]:
spark.conf.set("spark.sql.adaptive.enabled", "false")

In [0]:
df_joined = (
    df_transactions.join(
        df_customers,
        how="inner",
        on="cust_id"
    )
)

df_joined.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner,Buffer(cust_id))
:- Relation [cust_id#1467,start_date#1468,end_date#1469,txn_id#1470,date#1471,year#1472,month#1473,day#1474,expense_type#1475,amt#1476,city#1477] parquet
+- Relation [cust_id#1037,name#1038,age#1039,gender#1040,birthday#1041,zip#1042,city#1043] parquet

== Analyzed Logical Plan ==
cust_id: string, start_date: string, end_date: string, txn_id: string, date: string, year: string, month: string, day: string, expense_type: string, amt: string, city: string, name: string, age: string, gender: string, birthday: string, zip: string, city: string
Project [cust_id#1467, start_date#1468, end_date#1469, txn_id#1470, date#1471, year#1472, month#1473, day#1474, expense_type#1475, amt#1476, city#1477, name#1038, age#1039, gender#1040, birthday#1041, zip#1042, city#1043]
+- Join Inner, (cust_id#1467 = cust_id#1037)
   :- Relation [cust_id#1467,start_date#1468,end_date#1469,txn_id#1470,date#1471,year#1472,month#1473,day#1474,expense_type

In [0]:
df_joined.write.format('noop').mode('overwrite').save('dbfs:/FileStore')

### Group by

In [0]:

    df_city_counts = (
    df_transactions
    .groupBy("city")
    .count()
)


df_city_counts.explain(True)