In [2]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell relies on.
# NOTE(review): the app name mentions reading a CSV, but the cells visible here
# only build DataFrames from in-memory lists -- confirm the name is still intended.
spark = (
    SparkSession.builder
    .appName("Read CSV example")
    .getOrCreate()
)

In [3]:
# Sample order records, one tuple per order:
# (order_id, city, category, order_amount, status)
data = [
    ("O001", "Hyderabad", "Electronics", 1200, "Delivered"),
    ("O002", "Delhi", "Clothing", 800, "Delivered"),
    ("O003", "Mumbai", "Electronics", 1500, "Cancelled"),
    ("O004", "Bangalore", "Grocery", 400, "Delivered"),
    ("O005", "Hyderabad", "Grocery", 300, "Delivered"),
    ("O006", "Delhi", "Electronics", 2000, "Delivered"),
    ("O007", "Mumbai", "Clothing", 700, "Delivered"),
    ("O008", "Bangalore", "Electronics", 1800, "Delivered"),
    ("O009", "Delhi", "Grocery", 350, "Cancelled"),
    ("O010", "Hyderabad", "Clothing", 900, "Delivered"),
]
columns = ["order_id", "city", "category", "order_amount", "status"]

# Only column names are supplied, so the column types are inferred from the
# Python values in `data`.
df = spark.createDataFrame(data, schema=columns)

# Display the rows, then the inferred schema.
df.show()
df.printSchema()

+--------+---------+-----------+------------+---------+
|order_id|     city|   category|order_amount|   status|
+--------+---------+-----------+------------+---------+
|    O001|Hyderabad|Electronics|        1200|Delivered|
|    O002|    Delhi|   Clothing|         800|Delivered|
|    O003|   Mumbai|Electronics|        1500|Cancelled|
|    O004|Bangalore|    Grocery|         400|Delivered|
|    O005|Hyderabad|    Grocery|         300|Delivered|
|    O006|    Delhi|Electronics|        2000|Delivered|
|    O007|   Mumbai|   Clothing|         700|Delivered|
|    O008|Bangalore|Electronics|        1800|Delivered|
|    O009|    Delhi|    Grocery|         350|Cancelled|
|    O010|Hyderabad|   Clothing|         900|Delivered|
+--------+---------+-----------+------------+---------+

root
 |-- order_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- order_amount: long (nullable = true)
 |-- status: string (nullable = true)



In [4]:
# Number of partitions backing `df` before any explicit repartitioning.
df.rdd.getNumPartitions()

2

In [5]:
# Redistribute the rows of `df` across 4 partitions, then confirm the new count.
df_repart = df.repartition(numPartitions=4)
df_repart.rdd.getNumPartitions()

4

In [6]:
# Collapse the 4-partition DataFrame down to a single partition, then confirm.
df_coalesce = df_repart.coalesce(numPartitions=1)
df_coalesce.rdd.getNumPartitions()

1

In [7]:
# Delivered orders with an amount above 500, reduced to city and amount.
# Only transformations are chained here; no action is called in this cell.
df_lineage = (
    df.where(df["status"] == "Delivered")
    .where(df["order_amount"] > 500)
    .select("city", "order_amount")
)


In [8]:
# count() is an action: it runs the filter/select chain defined for df_lineage
# and returns the number of matching rows.
df_lineage.count()

6

In [9]:
# Keep only the Delhi orders, then project the order id and amount columns.
filtered_df = df.where(df["city"] == "Delhi")
selected_df = filtered_df.select("order_id", "order_amount")


In [10]:
# Display the Delhi orders (order_id and order_amount only).
selected_df.show()

+--------+------------+
|order_id|order_amount|
+--------+------------+
|    O002|         800|
|    O006|        2000|
|    O009|         350|
+--------+------------+



In [11]:
# Extended explain: prints the parsed, analyzed and optimized logical plans
# followed by the physical plan for `df`.
df.explain(True)

== Parsed Logical Plan ==
LogicalRDD [order_id#0, city#1, category#2, order_amount#3L, status#4], false

== Analyzed Logical Plan ==
order_id: string, city: string, category: string, order_amount: bigint, status: string
LogicalRDD [order_id#0, city#1, category#2, order_amount#3L, status#4], false

== Optimized Logical Plan ==
LogicalRDD [order_id#0, city#1, category#2, order_amount#3L, status#4], false

== Physical Plan ==
*(1) Scan ExistingRDD[order_id#0,city#1,category#2,order_amount#3L,status#4]



In [12]:
# Orders fact data, one tuple per order: (order_id, city, order_amount).
orders_data = [
    ("O001", "Hyderabad", 1200),
    ("O002", "Delhi", 800),
    ("O003", "Mumbai", 1500),
    ("O004", "Bangalore", 400),
    ("O005", "Hyderabad", 300),
    ("O006", "Delhi", 2000),
    ("O007", "Mumbai", 700),
    ("O008", "Bangalore", 1800),
    ("O009", "Delhi", 350),
    ("O010", "Hyderabad", 900),
]
orders_cols = ["order_id", "city", "order_amount"]
orders_df = spark.createDataFrame(orders_data, schema=orders_cols)
orders_df.show()

# Small city lookup table: (city, city_category).
city_data = [
    ("Hyderabad", "Tier-1"),
    ("Delhi", "Tier-1"),
    ("Mumbai", "Tier-1"),
    ("Bangalore", "Tier-1"),
]
city_cols = ["city", "city_category"]
city_df = spark.createDataFrame(city_data, schema=city_cols)
city_df.show()








+--------+---------+------------+
|order_id|     city|order_amount|
+--------+---------+------------+
|    O001|Hyderabad|        1200|
|    O002|    Delhi|         800|
|    O003|   Mumbai|        1500|
|    O004|Bangalore|         400|
|    O005|Hyderabad|         300|
|    O006|    Delhi|        2000|
|    O007|   Mumbai|         700|
|    O008|Bangalore|        1800|
|    O009|    Delhi|         350|
|    O010|Hyderabad|         900|
+--------+---------+------------+

+---------+-------------+
|     city|city_category|
+---------+-------------+
|Hyderabad|       Tier-1|
|    Delhi|       Tier-1|
|   Mumbai|       Tier-1|
|Bangalore|       Tier-1|
+---------+-------------+



In [14]:


from pyspark.sql.functions import col
# Drop orders of 500 or less before joining.
filtered_orders = orders_df.where(col("order_amount") > 500)

# Inner join with the city lookup on the shared "city" column; no join hint is
# given here.
joined_df = filtered_orders.join(city_df, on="city", how="inner")

# Fix the output column order.
final_df = joined_df.select("order_id", "city", "city_category", "order_amount")





In [15]:
# Extended explain: prints the logical plans and the physical plan for final_df.
final_df.explain(True)

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'city_category, 'order_amount]
+- Project [city#35, order_id#34, order_amount#36L, city_category#48]
   +- Join Inner, (city#35 = city#47)
      :- Filter (order_amount#36L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#34, city#35, order_amount#36L], false
      +- LogicalRDD [city#47, city_category#48], false

== Analyzed Logical Plan ==
order_id: string, city: string, city_category: string, order_amount: bigint
Project [order_id#34, city#35, city_category#48, order_amount#36L]
+- Project [city#35, order_id#34, order_amount#36L, city_category#48]
   +- Join Inner, (city#35 = city#47)
      :- Filter (order_amount#36L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#34, city#35, order_amount#36L], false
      +- LogicalRDD [city#47, city_category#48], false

== Optimized Logical Plan ==
Project [order_id#34, city#35, city_category#48, order_amount#36L]
+- Join Inner, (city#35 = city#47)
   :- Filter ((isnotnull(orde

In [16]:
# The stray `from typing_extensions import final` was removed: `final` is not
# used by this cell, and the import added an unneeded third-party dependency.
from pyspark.sql.functions import broadcast

# Same inner join on "city" as the un-hinted version, but city_df is wrapped in
# broadcast() so the planner receives a broadcast hint for the lookup side.
broadcast_join_df = filtered_orders.join(broadcast(city_df), on="city", how="inner")

# NOTE: this rebinds `final_df`, replacing the result of the un-hinted join.
# Re-run the earlier join cell to get that version back.
final_df = broadcast_join_df.select(
    "order_id",
    "city",
    "city_category",
    "order_amount",
)

In [18]:
# Extended explain for final_df; the plans printed below include the
# ResolvedHint (strategy=broadcast) node on the city lookup side.
final_df.explain(True)

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'city_category, 'order_amount]
+- Project [city#35, order_id#34, order_amount#36L, city_category#48]
   +- Join Inner, (city#35 = city#47)
      :- Filter (order_amount#36L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#34, city#35, order_amount#36L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#47, city_category#48], false

== Analyzed Logical Plan ==
order_id: string, city: string, city_category: string, order_amount: bigint
Project [order_id#34, city#35, city_category#48, order_amount#36L]
+- Project [city#35, order_id#34, order_amount#36L, city_category#48]
   +- Join Inner, (city#35 = city#47)
      :- Filter (order_amount#36L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#34, city#35, order_amount#36L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#47, city_category#48], false

== Optimized Logical Plan ==
Project [order_id#34, city#35, city_cat