#### How to use unionByName to join dataframes by column names?

#### unionByName()

- **union()** requires both DataFrames to have the **same schema** in the **same order**.
- **unionByName()** allows unioning by **matching column names** instead of relying on **order**.
- The **unionByName()** function in PySpark is used to **combine two or more DataFrames** based on their **column names**, rather than their **positional order**.
- This is a key **distinction** from the standard **union() or unionAll()** methods, which require the DataFrames to have **identical schemas** in terms of **both column names and order**.

In [0]:
from pyspark.sql.functions import lit, col, sum

In [0]:
bronze_tbl_01 = spark.read.csv("/Volumes/@azureadb/pyspark/unionby/company_level.csv", header=True, inferSchema=True)
bronze_tbl = bronze_tbl_01.withColumn("session_id", col("session_id").cast("string"))

print("Column Names of Bronze Table 1: \n", bronze_tbl.columns)
print("\nNo of Columns in Bronze Table 1: ", len(bronze_tbl.columns))
print("\nTotal Rows in Bronze Table 01: ", bronze_tbl.count())

display(bronze_tbl)
# display(bronze_tbl.limit(10))

Column Names of Bronze Table 1: 
 ['start_date', 'product_url', 'category', 'default_group', 'source_target', 'cloud_flatform', 'session_id', 'session_name', 'status_name', 'status_type', 'sessions', 'product_id', 'load datetime']

No of Columns in Bronze Table 1:  13

Total Rows in Bronze Table 01:  100


start_date,product_url,category,default_group,source_target,cloud_flatform,session_id,session_name,status_name,status_type,sessions,product_id,load datetime
2025-08-25,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876543,first_visit,first_visit,Not Available,5,409516064,2025-09-02T19:10:35
2025-08-26,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876544,purchase,organic,Not Available,12,409516064,2025-09-02T19:10:36
2025-08-27,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876545,search,network,Not Available,16,409516064,2025-09-02T19:10:37
2025-08-28,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876546,search,scroll,Not Available,22,409516064,2025-09-02T19:10:38
2025-08-29,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876547,search,organic,Not Available,25,409516064,2025-09-02T19:10:39
2025-08-30,shop.sony.bpl,mobile,wifi-network,(not set),azure / aws / gcc,9876548,add_to_cart,organic,Not Available,4,409516064,2025-09-02T19:10:40
2025-08-31,shop.sony.bpl,mobile,wifi-network,(not set),azure / aws / gcc,9876549,add_to_cart,organic,Not Available,9,409516064,2025-09-02T19:10:41
2025-09-01,shop.sony.bpl,mobile,wifi-network,(none) / (direct),azure / aws / gcc,9876550,add_to_cart,first_visit,Not Available,8,409516064,2025-09-02T19:10:42
2025-09-02,shop.sony.bpl,mobile,wifi-network,flipkart / referral,azure / aws / gcc,9876551,add_to_cart,first_visit,Not Available,7,409516064,2025-09-02T19:10:43
2025-09-03,shop.sony.bpl,mobile,wifi-network,(data not available),azure / aws / gcc,9876552,add_to_cart,first_visit,Not Available,6,409516064,2025-09-02T19:10:44


In [0]:
silver_tbl = spark.read.csv("/Volumes/@azureadb/pyspark/unionby/device_level.csv", header=True, inferSchema=True)

print("Column Names of Silver Table 2: \n", silver_tbl.columns)
print("\nNo of Columns in Silver Table 2: ", len(silver_tbl.columns))
print("\nTotal Rows in Silver Table 02: ", silver_tbl.count())

display(silver_tbl)
# display(silver_tbl.limit(10))

Column Names of Silver Table 2: 
 ['start_date', 'product_url', 'category', 'default_group', 'source_target', 'cloud_flatform', 'status_name', 'status_type', 'sessions', 'product_id', 'load datetime']

No of Columns in Silver Table 2:  11

Total Rows in Silver Table 02:  100


start_date,product_url,category,default_group,source_target,cloud_flatform,status_name,status_type,sessions,product_id,load datetime
2025-08-25,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,first_visit,Not Available,55,409516064,2025-09-02T19:10:35
2025-08-26,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,organic,Not Available,12,409516064,2025-09-02T19:10:36
2025-08-27,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,network,Not Available,16,409516064,2025-09-02T19:10:37
2025-08-28,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,scroll,Not Available,22,409516064,2025-09-02T19:10:38
2025-08-29,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,organic,Not Available,25,409516064,2025-09-02T19:10:39
2025-08-30,shop.sony.bpl,mobile,wifi-network,(not set),azure / aws / gcc,organic,Not Available,41,409516064,2025-09-02T19:10:40
2025-08-31,shop.sony.bpl,mobile,wifi-network,(not set),azure / aws / gcc,organic,Not Available,91,409516064,2025-09-02T19:10:41
2025-09-01,shop.sony.bpl,mobile,wifi-network,(none) / (direct),azure / aws / gcc,first_visit,Not Available,28,409516064,2025-09-02T19:10:42
2025-09-02,shop.sony.bpl,mobile,wifi-network,flipkart / referral,azure / aws / gcc,first_visit,Not Available,17,409516064,2025-09-02T19:10:43
2025-09-03,shop.sony.bpl,mobile,wifi-network,(data not available),azure / aws / gcc,first_visit,Not Available,16,409516064,2025-09-02T19:10:44


In [0]:
diff_cols_bronze_silver = set(bronze_tbl.columns) - set(silver_tbl.columns)
print("Columns in Silver Table 1 but not in Silver Table 2: ", diff_cols_bronze_silver)

Columns in Silver Table 1 but not in Silver Table 2:  {'session_name', 'session_id'}


In [0]:
# step:1 adding GRANULARITY column to identify the level of dataframe
df_bronze_tbl = bronze_tbl.withColumn("GRANULARITY", lit("campaign"))
display(df_bronze_tbl)

start_date,product_url,category,default_group,source_target,cloud_flatform,session_id,session_name,status_name,status_type,sessions,product_id,load datetime,GRANULARITY
2025-08-25,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876543,first_visit,first_visit,Not Available,5,409516064,2025-09-02T19:10:35,campaign
2025-08-26,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876544,purchase,organic,Not Available,12,409516064,2025-09-02T19:10:36,campaign
2025-08-27,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876545,search,network,Not Available,16,409516064,2025-09-02T19:10:37,campaign
2025-08-28,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876546,search,scroll,Not Available,22,409516064,2025-09-02T19:10:38,campaign
2025-08-29,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876547,search,organic,Not Available,25,409516064,2025-09-02T19:10:39,campaign
2025-08-30,shop.sony.bpl,mobile,wifi-network,(not set),azure / aws / gcc,9876548,add_to_cart,organic,Not Available,4,409516064,2025-09-02T19:10:40,campaign
2025-08-31,shop.sony.bpl,mobile,wifi-network,(not set),azure / aws / gcc,9876549,add_to_cart,organic,Not Available,9,409516064,2025-09-02T19:10:41,campaign
2025-09-01,shop.sony.bpl,mobile,wifi-network,(none) / (direct),azure / aws / gcc,9876550,add_to_cart,first_visit,Not Available,8,409516064,2025-09-02T19:10:42,campaign
2025-09-02,shop.sony.bpl,mobile,wifi-network,flipkart / referral,azure / aws / gcc,9876551,add_to_cart,first_visit,Not Available,7,409516064,2025-09-02T19:10:43,campaign
2025-09-03,shop.sony.bpl,mobile,wifi-network,(data not available),azure / aws / gcc,9876552,add_to_cart,first_visit,Not Available,6,409516064,2025-09-02T19:10:44,campaign


In [0]:
# step:2 adding GRANULARITY column to identify the level of dataframe
df_silver_tbl = silver_tbl \
    .withColumn("session_id", lit("NULL")) \
    .withColumn("session_name", lit("NULL")) \
    .withColumn("GRANULARITY", lit("device_category"))

display(df_silver_tbl)

start_date,product_url,category,default_group,source_target,cloud_flatform,status_name,status_type,sessions,product_id,load datetime,session_id,session_name,GRANULARITY
2025-08-25,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,first_visit,Not Available,55,409516064,2025-09-02T19:10:35,,,device_category
2025-08-26,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,organic,Not Available,12,409516064,2025-09-02T19:10:36,,,device_category
2025-08-27,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,network,Not Available,16,409516064,2025-09-02T19:10:37,,,device_category
2025-08-28,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,scroll,Not Available,22,409516064,2025-09-02T19:10:38,,,device_category
2025-08-29,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,organic,Not Available,25,409516064,2025-09-02T19:10:39,,,device_category
2025-08-30,shop.sony.bpl,mobile,wifi-network,(not set),azure / aws / gcc,organic,Not Available,41,409516064,2025-09-02T19:10:40,,,device_category
2025-08-31,shop.sony.bpl,mobile,wifi-network,(not set),azure / aws / gcc,organic,Not Available,91,409516064,2025-09-02T19:10:41,,,device_category
2025-09-01,shop.sony.bpl,mobile,wifi-network,(none) / (direct),azure / aws / gcc,first_visit,Not Available,28,409516064,2025-09-02T19:10:42,,,device_category
2025-09-02,shop.sony.bpl,mobile,wifi-network,flipkart / referral,azure / aws / gcc,first_visit,Not Available,17,409516064,2025-09-02T19:10:43,,,device_category
2025-09-03,shop.sony.bpl,mobile,wifi-network,(data not available),azure / aws / gcc,first_visit,Not Available,16,409516064,2025-09-02T19:10:44,,,device_category


In [0]:
# step:3 union of bronze & silver dataframes
df_gold_tbl = df_bronze_tbl.unionByName(df_silver_tbl)
display(df_gold_tbl)

start_date,product_url,category,default_group,source_target,cloud_flatform,session_id,session_name,status_name,status_type,sessions,product_id,load datetime,GRANULARITY
2025-08-25,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876543.0,first_visit,first_visit,Not Available,5,409516064,2025-09-02T19:10:35,campaign
2025-08-26,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876544.0,purchase,organic,Not Available,12,409516064,2025-09-02T19:10:36,campaign
2025-08-27,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876545.0,search,network,Not Available,16,409516064,2025-09-02T19:10:37,campaign
2025-08-28,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876546.0,search,scroll,Not Available,22,409516064,2025-09-02T19:10:38,campaign
2025-08-29,shop.sony.bpl,mobile,wifi-network,google,azure / aws / gcc,9876547.0,search,organic,Not Available,25,409516064,2025-09-02T19:10:39,campaign
2025-08-30,shop.sony.bpl,mobile,wifi-network,(not set),azure / aws / gcc,9876548.0,add_to_cart,organic,Not Available,4,409516064,2025-09-02T19:10:40,campaign
2025-08-31,shop.sony.bpl,mobile,wifi-network,(not set),azure / aws / gcc,9876549.0,add_to_cart,organic,Not Available,9,409516064,2025-09-02T19:10:41,campaign
2025-09-01,shop.sony.bpl,mobile,wifi-network,(none) / (direct),azure / aws / gcc,9876550.0,add_to_cart,first_visit,Not Available,8,409516064,2025-09-02T19:10:42,campaign
2025-09-02,shop.sony.bpl,mobile,wifi-network,flipkart / referral,azure / aws / gcc,9876551.0,add_to_cart,first_visit,Not Available,7,409516064,2025-09-02T19:10:43,campaign
2025-09-03,shop.sony.bpl,mobile,wifi-network,(data not available),azure / aws / gcc,9876552.0,add_to_cart,first_visit,Not Available,6,409516064,2025-09-02T19:10:44,campaign


In [0]:
data = [
    ("Bronze Table", [row["GRANULARITY"] for row in df_bronze_tbl.select("GRANULARITY").distinct().collect()]),
    ("Silver Table", [row["GRANULARITY"] for row in df_silver_tbl.select("GRANULARITY").distinct().collect()]),
    ("Gold Table", [row["GRANULARITY"] for row in df_gold_tbl.select("GRANULARITY").distinct().collect()])
]

df_gran = spark.createDataFrame(data, ["Table", "Distinct GRANULARITY"])
display(df_gran)

Table,Distinct GRANULARITY
Bronze Table,List(campaign)
Silver Table,List(device_category)
Gold Table,"List(campaign, device_category)"


**.collect()**
- This collects **all the rows** from the DataFrame into a **Python list**.

In [0]:
data = [
    ("Bronze Table", [row["session_id"] for row in df_bronze_tbl.select("session_id").distinct().collect()]),
    ("Silver Table", [row["session_id"] for row in df_silver_tbl.select("session_id").distinct().collect()]),
    ("Gold Table", [row["session_id"] for row in df_gold_tbl.select("session_id").distinct().collect()])
]

df_sess_id = spark.createDataFrame(data, ["Table", "Distinct Sessions"])
display(df_sess_id)

Table,Distinct Sessions
Bronze Table,"List(9876568, 9876557, 9876592, 9876582, 9876567, 9876616, 9876569, 9876549, 9876597, 9876556, 9876619, 9876608, 9876632, 9876623, 9876609, 9876580, 9876560, 9876543, 9876585, 9876570, 9876550, 9876588, 9876573, 9876589, 9876575, 9876612, 9876545, 9876591, 9876598, 9876633, 9876565, 9876641, 9876578, 9876607, 9876602, 9876581, 9876642, 9876554, 9876551, 9876548, 9876600, 9876624, 9876590, 9876572, 9876627, 9876628, 9876563, 9876576, 9876615, 9876595, 9876629, 9876625, 9876613, 9876620, 9876561, 9876562, 9876626, 9876558, 9876618, 9876583, 9876546, 9876611, 9876559, 9876553, 9876634, 9876555, 9876599, 9876630, 9876605, 9876635, 9876594, 9876571, 9876621, 9876631, 9876639, 9876614, 9876544, 9876603, 9876601, 9876593, 9876584, 9876547, 9876617, 9876577, 9876622, 9876637, 9876586, 9876596, 9876587, 9876636, 9876606, 9876564, 9876638, 9876640, 9876552, 9876604, 9876579, 9876610, 9876574, 9876566)"
Silver Table,List(NULL)
Gold Table,"List(9876568, 9876557, 9876592, 9876582, 9876567, 9876616, 9876569, 9876549, 9876597, 9876556, 9876619, 9876608, 9876632, 9876623, 9876609, 9876580, 9876560, 9876543, 9876585, 9876570, 9876550, 9876588, 9876573, 9876589, 9876575, 9876612, 9876545, 9876591, 9876598, 9876633, 9876565, 9876641, 9876578, 9876607, 9876602, 9876581, 9876642, 9876554, 9876551, 9876548, 9876600, 9876624, 9876590, 9876572, 9876627, 9876628, 9876563, 9876576, 9876615, 9876595, 9876629, 9876625, 9876613, 9876620, 9876561, 9876562, 9876626, 9876558, 9876618, 9876583, 9876546, 9876611, 9876559, 9876553, 9876634, 9876555, 9876599, 9876630, 9876605, 9876635, 9876594, 9876571, 9876621, 9876631, 9876639, 9876614, 9876544, 9876603, 9876601, 9876593, 9876584, 9876547, 9876617, 9876577, 9876622, 9876637, 9876586, 9876596, 9876587, 9876636, 9876606, 9876564, 9876638, 9876640, 9876552, 9876604, 9876579, 9876610, 9876574, 9876566, NULL)"


In [0]:
data = [
    ("Bronze Table", [row["session_name"] for row in df_bronze_tbl.select("session_name").distinct().collect()]),
    ("Silver Table", [row["session_name"] for row in df_silver_tbl.select("session_name").distinct().collect()]),
    ("Gold Table", [row["session_name"] for row in df_gold_tbl.select("session_name").distinct().collect()])
]

df_sess_name = spark.createDataFrame(data, ["Table", "Distinct Session Name"])
display(df_sess_name)

Table,Distinct Session Name
Bronze Table,"List(add_to_cart, first_visit, purchase, click, search, session_start)"
Silver Table,List(NULL)
Gold Table,"List(add_to_cart, first_visit, purchase, click, search, session_start, NULL)"
