## withColumn Vs withColumns - Not much of performance difference

### Multiple withColumn

In [0]:
from pyspark.sql.functions import *

df_1 = spark.sql("""select * from samples.tpch.customer""")

# Creating a list of list of columns that need to be instantiated as None
dummy_col_list = ['foo1', 'foo2', 'foo3', 'foo4', 'foo5']

# Using a for-loop to add these columns in dataframe
for col_name in dummy_col_list:
  df_1 = df_1.withColumn(col_name, lit(None).cast('string'))

In [0]:
df_1.explain("extended")

== Parsed Logical Plan ==
Project [c_custkey#101661L, c_name#101662, c_address#101663, c_nationkey#101664L, c_phone#101665, c_acctbal#101666, c_mktsegment#101667, c_comment#101668, foo1#101678, foo2#101688, foo3#101699, foo4#101711, cast(null as string) AS foo5#101724]
+- Project [c_custkey#101661L, c_name#101662, c_address#101663, c_nationkey#101664L, c_phone#101665, c_acctbal#101666, c_mktsegment#101667, c_comment#101668, foo1#101678, foo2#101688, foo3#101699, cast(null as string) AS foo4#101711]
   +- Project [c_custkey#101661L, c_name#101662, c_address#101663, c_nationkey#101664L, c_phone#101665, c_acctbal#101666, c_mktsegment#101667, c_comment#101668, foo1#101678, foo2#101688, cast(null as string) AS foo3#101699]
      +- Project [c_custkey#101661L, c_name#101662, c_address#101663, c_nationkey#101664L, c_phone#101665, c_acctbal#101666, c_mktsegment#101667, c_comment#101668, foo1#101678, cast(null as string) AS foo2#101688]
         +- Project [c_custkey#101661L, c_name#101662, c_a

In [0]:
df_1.printSchema()

root
 |-- c_custkey: long (nullable = true)
 |-- c_name: string (nullable = true)
 |-- c_address: string (nullable = true)
 |-- c_nationkey: long (nullable = true)
 |-- c_phone: string (nullable = true)
 |-- c_acctbal: decimal(18,2) (nullable = true)
 |-- c_mktsegment: string (nullable = true)
 |-- c_comment: string (nullable = true)
 |-- foo1: string (nullable = true)
 |-- foo2: string (nullable = true)
 |-- foo3: string (nullable = true)
 |-- foo4: string (nullable = true)
 |-- foo5: string (nullable = true)



In [0]:
from pyspark.sql.functions import *

df_2 = spark.sql("""select * from samples.tpch.customer""")

# Creating a list of list of columns that need to be instantiated as None
dummy_col_list = ['foo1', 'foo2', 'foo3', 'foo4', 'foo5']

# Using a for-loop to add these columns in dataframe
for col_name in dummy_col_list:
  df_3 = df_2.withColumn(col_name, lit(None).cast('string'))

In [0]:
df_3.explain("extended")

== Parsed Logical Plan ==
Project [c_custkey#101747L, c_name#101748, c_address#101749, c_nationkey#101750L, c_phone#101751, c_acctbal#101752, c_mktsegment#101753, c_comment#101754, cast(null as string) AS foo5#101804]
+- Project [c_custkey#101747L, c_name#101748, c_address#101749, c_nationkey#101750L, c_phone#101751, c_acctbal#101752, c_mktsegment#101753, c_comment#101754]
   +- SubqueryAlias samples.tpch.customer
      +- Relation samples.tpch.customer[c_custkey#101747L,c_name#101748,c_address#101749,c_nationkey#101750L,c_phone#101751,c_acctbal#101752,c_mktsegment#101753,c_comment#101754] parquet

== Analyzed Logical Plan ==
c_custkey: bigint, c_name: string, c_address: string, c_nationkey: bigint, c_phone: string, c_acctbal: decimal(18,2), c_mktsegment: string, c_comment: string, foo5: string
Project [c_custkey#101747L, c_name#101748, c_address#101749, c_nationkey#101750L, c_phone#101751, c_acctbal#101752, c_mktsegment#101753, c_comment#101754, cast(null as string) AS foo5#101804]
+-

In [0]:
df_4 = spark.sql("""select * from samples.tpch.customer""")

df_4 = df_4.withColumn('foo1', lit(None).cast('string'))\
       .withColumn('foo2', lit(None).cast('string'))\
       .withColumn('foo3', lit(None).cast('string'))\
       .withColumn('foo4', lit(None).cast('string'))\
       .withColumn('foo5', lit(None).cast('string'))

df_4.explain("extended")

== Parsed Logical Plan ==
Project [c_custkey#101995L, c_name#101996, c_address#101997, c_nationkey#101998L, c_phone#101999, c_acctbal#102000, c_mktsegment#102001, c_comment#102002, foo1#102012, foo2#102022, foo3#102033, foo4#102045, cast(null as string) AS foo5#102058]
+- Project [c_custkey#101995L, c_name#101996, c_address#101997, c_nationkey#101998L, c_phone#101999, c_acctbal#102000, c_mktsegment#102001, c_comment#102002, foo1#102012, foo2#102022, foo3#102033, cast(null as string) AS foo4#102045]
   +- Project [c_custkey#101995L, c_name#101996, c_address#101997, c_nationkey#101998L, c_phone#101999, c_acctbal#102000, c_mktsegment#102001, c_comment#102002, foo1#102012, foo2#102022, cast(null as string) AS foo3#102033]
      +- Project [c_custkey#101995L, c_name#101996, c_address#101997, c_nationkey#101998L, c_phone#101999, c_acctbal#102000, c_mktsegment#102001, c_comment#102002, foo1#102012, cast(null as string) AS foo2#102022]
         +- Project [c_custkey#101995L, c_name#101996, c_a

We can see multiple projects for every column added in analyzed logical plan

### Using withColumns

In [0]:
df_5 = spark.sql("""select * from samples.tpch.customer""")

# Creating a dictionary for the static columns
dummy_col_val_map = {
  'foo1': lit(None).cast('string'), 
  'foo2': lit(None).cast('string'), 
  'foo3': lit(None).cast('string'), 
  'foo4': lit(None).cast('string'), 
  'foo5': lit(None).cast('string')
}

# Adding columns using withColumns
df_5 = df_5.withColumns(dummy_col_val_map)

df_5.explain("extended")

== Parsed Logical Plan ==
Project [c_custkey#102169L, c_name#102170, c_address#102171, c_nationkey#102172L, c_phone#102173, c_acctbal#102174, c_mktsegment#102175, c_comment#102176, cast(null as string) AS foo1#102186, cast(null as string) AS foo2#102187, cast(null as string) AS foo3#102188, cast(null as string) AS foo4#102189, cast(null as string) AS foo5#102190]
+- Project [c_custkey#102169L, c_name#102170, c_address#102171, c_nationkey#102172L, c_phone#102173, c_acctbal#102174, c_mktsegment#102175, c_comment#102176]
   +- SubqueryAlias samples.tpch.customer
      +- Relation samples.tpch.customer[c_custkey#102169L,c_name#102170,c_address#102171,c_nationkey#102172L,c_phone#102173,c_acctbal#102174,c_mktsegment#102175,c_comment#102176] parquet

== Analyzed Logical Plan ==
c_custkey: bigint, c_name: string, c_address: string, c_nationkey: bigint, c_phone: string, c_acctbal: decimal(18,2), c_mktsegment: string, c_comment: string, foo1: string, foo2: string, foo3: string, foo4: string, foo

Single project node in analyzed logical plan when withColumns is used

### Using .select() with an alias

In [0]:
df_1 = spark.sql("""select * from samples.tpch.customer""")

# Using .select with alias
df_1 = df_1.select("*", *[cvalue.alias(cname) for cname, cvalue in dummy_col_val_map.items()])

df_1.explain("extended")

== Parsed Logical Plan ==
'Project [*, cast(null as string) AS foo1#1207, cast(null as string) AS foo2#1208, cast(null as string) AS foo3#1209, cast(null as string) AS foo4#1210, cast(null as string) AS foo5#1211]
+- Project [c_custkey#1190L, c_name#1191, c_address#1192, c_nationkey#1193L, c_phone#1194, c_acctbal#1195, c_mktsegment#1196, c_comment#1197]
   +- SubqueryAlias samples.tpch.customer
      +- Relation samples.tpch.customer[c_custkey#1190L,c_name#1191,c_address#1192,c_nationkey#1193L,c_phone#1194,c_acctbal#1195,c_mktsegment#1196,c_comment#1197] parquet

== Analyzed Logical Plan ==
c_custkey: bigint, c_name: string, c_address: string, c_nationkey: bigint, c_phone: string, c_acctbal: decimal(18,2), c_mktsegment: string, c_comment: string, foo1: string, foo2: string, foo3: string, foo4: string, foo5: string
Project [c_custkey#1190L, c_name#1191, c_address#1192, c_nationkey#1193L, c_phone#1194, c_acctbal#1195, c_mktsegment#1196, c_comment#1197, cast(null as string) AS foo1#1207, 

In [0]:
spark.conf.set("spark.sql.adaptive.enabled", "false")

In [0]:
# Checking time taken by rule executor for withColumn
from pyspark.sql.functions import *

df_1 = spark.sql("""select * from samples.tpch.customer""")

# lets create 300 dummy column list
dummy_col_list = [f"foo{i}" for i in range(1,1001)]

# Get JVM reference
jvm = spark.sparkContext._jvm

# Access Scala package/class
catalyst_rule_executor = jvm.org.apache.spark.sql.catalyst.rules.RuleExecutor

# Adding 300 columns
for col_name in dummy_col_list:
  df_2 = df_1.withColumn(col_name, lit(None).cast('string'))

print(catalyst_rule_executor.dumpTimeSpent())



=== Metrics of Analyzer/Optimizer Rules ===
Total number of runs: 366688
Total time: 146.009936957 seconds

Rule                                                                                               Effective Time / Total Time                     Effective Runs / Total Runs                    

org.apache.spark.sql.catalyst.analysis.Analyzer$AddMetadataColumns                                 0 / 63458891256                                 0 / 4025                                       
org.apache.spark.sql.catalyst.plans.logical.ConvertSecureViewUnaryNodeToLeafNode                   0 / 33635693047                                 0 / 2014                                       
org.apache.spark.sql.catalyst.analysis.DeduplicateRelations                                        0 / 30267194693                                 0 / 4025                                       
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations                                   10551620626

In [0]:
# Checking time taken by rule executor for withColumns
from pyspark.sql.functions import *
 
df_1 = spark.sql("""select * from samples.tpch.customer""")

# lets create 300 dummy column list
dummy_col_list = [f"foo{i}" for i in range(1,1001)]

dummy_col_val_map = {cname: lit(None).cast('string') for cname in dummy_col_list}

# Get JVM reference
jvm = spark.sparkContext._jvm

# Access Scala package/class
catalyst_rule_executor = jvm.org.apache.spark.sql.catalyst.rules.RuleExecutor

# Adding 300 columns
df_2 = df_1.withColumns(dummy_col_val_map)

print(catalyst_rule_executor.dumpTimeSpent())


=== Metrics of Analyzer/Optimizer Rules ===
Total number of runs: 184506
Total time: 145.094955828 seconds

Rule                                                                                               Effective Time / Total Time                     Effective Runs / Total Runs                    

org.apache.spark.sql.catalyst.analysis.Analyzer$AddMetadataColumns                                 0 / 63381162118                                 0 / 2023                                       
org.apache.spark.sql.catalyst.plans.logical.ConvertSecureViewUnaryNodeToLeafNode                   0 / 33623253777                                 0 / 1013                                       
org.apache.spark.sql.catalyst.analysis.DeduplicateRelations                                        0 / 30235319019                                 0 / 2023                                       
org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations                                   10349759220