##### How to add columns using dictionary?

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import lit

In [0]:
df_ts = spark.read.csv("/Volumes/@azureadb/pyspark/timestamp/timestamptodate.csv", header=True, inferSchema=True)
display(df_ts)

start_date,product_url,category,default_group,cloud_flatform,session_id,session_name,session_type,sessions,product_id
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876543,first_visit,Not Available,1,409516064
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876544,purchase,Not Available,1,409516064
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876545,search,Not Available,1,409516064
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876546,search,Not Available,1,409516064
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876547,search,Not Available,1,409516064
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876548,add_to_cart,Not Available,1,409516064
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876549,add_to_cart,Not Available,1,409516064
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876550,add_to_cart,Not Available,1,409516064
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876551,add_to_cart,Not Available,1,409516064
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876552,add_to_cart,Not Available,1,409516064


**1) Add columns from a dictionary of literal values**
- If your **dictionary** has **column names and constant values**.

     # Method 01
     for col, val in col_dict.items():
         df = df.withColumn(col, F.lit(val))

     # Method 02
     df_ts_adv = df_ts.withColumn("productivity", F.lit("advertisement")) \
                      .withColumn("Sales_ID", F.lit("NULL")) \
                      .withColumn("Sales_Name", F.lit("NULL")) \
                      .withColumn("Granularity", F.lit("product_category"))

In [0]:
def add_col(df, dic_name_value):
    for col_name, col_value in dic_name_value.items():
        df = df.withColumn(col_name, F.lit(col_value))
    return df

In [0]:
dic_add_cols = {"productivity": "advertisement", "Sales": "NULL", "Sales_Name": "NULL", "Granularity": "product_category", "country": "India", "status": "Active"}

df_ts_adv = add_col(df_ts, dic_add_cols)
display(df_ts_adv.limit(25))

start_date,product_url,category,default_group,cloud_flatform,session_id,session_name,session_type,sessions,product_id,productivity,Sales,Sales_Name,Granularity,country,status
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876543,first_visit,Not Available,1,409516064,advertisement,,,product_category,India,Active
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876544,purchase,Not Available,1,409516064,advertisement,,,product_category,India,Active
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876545,search,Not Available,1,409516064,advertisement,,,product_category,India,Active
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876546,search,Not Available,1,409516064,advertisement,,,product_category,India,Active
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876547,search,Not Available,1,409516064,advertisement,,,product_category,India,Active
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876548,add_to_cart,Not Available,1,409516064,advertisement,,,product_category,India,Active
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876549,add_to_cart,Not Available,1,409516064,advertisement,,,product_category,India,Active
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876550,add_to_cart,Not Available,1,409516064,advertisement,,,product_category,India,Active
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876551,add_to_cart,Not Available,1,409516064,advertisement,,,product_category,India,Active
2025-08-25T00:00:00.000Z,shop.sony.bpl,mobile,wifi-network,azure / aws / gcc,9876552,add_to_cart,Not Available,1,409516064,advertisement,,,product_category,India,Active


**2) Add columns from a dictionary of expressions**
- If your dictionary maps new column names to transformations

In [0]:
# Example DataFrame
df_exp = spark.createDataFrame([(1, "Roja", 3),
                                (2, "Bibin", 4),
                                (3, "Rajesh", 5),
                                (4, "Priya", 6),
                                (5, "Mohan", 9),],
                               ["id", "name", "sales"])

# Dictionary: new column names -> constant values
col_dict = {"country": "India", "status": "Active"}

df_dict = df_exp
for col, val in col_dict.items():
    df_dict = df_dict.withColumn(col, F.lit(val))

display(df_dict)

id,name,sales,country,status
1,Roja,3,India,Active
2,Bibin,4,India,Active
3,Rajesh,5,India,Active
4,Priya,6,India,Active
5,Mohan,9,India,Active


In [0]:
expr_dict = {
    "name_upper": F.upper(F.col("name")),
    "sales_squared": (F.col("id") ** 2),
    "id_plus_10": F.col("id") + 10,
    "constant_val": F.lit(50),
    "name_length": F.length(F.col("name"))
}

df_dict_expr = df_exp

for col, expr in expr_dict.items():
    df_dict_expr = df_dict_expr.withColumn(col, expr)

display(df_dict_expr)

id,name,sales,name_upper,sales_squared,id_plus_10,constant_val,name_length
1,Roja,3,ROJA,1.0,11,50,4
2,Bibin,4,BIBIN,4.0,12,50,5
3,Rajesh,5,RAJESH,9.0,13,50,6
4,Priya,6,PRIYA,16.0,14,50,5
5,Mohan,9,MOHAN,25.0,15,50,5


**3) Add columns from a dictionary of different datatypes**

In [0]:
col_dict = {"is_valid": True, "score": 95.5}

df_dtypes = df_dict_expr
for col, val in col_dict.items():
    df_dtypes = df_dtypes.withColumn(col, F.lit(val))

display(df_dtypes)
df_dtypes.printSchema()

id,name,sales,name_upper,sales_squared,id_plus_10,is_valid,score
1,Roja,3,ROJA,1.0,11,True,95.5
2,Bibin,4,BIBIN,4.0,12,True,95.5
3,Rajesh,5,RAJESH,9.0,13,True,95.5
4,Priya,6,PRIYA,16.0,14,True,95.5
5,Mohan,9,MOHAN,25.0,15,True,95.5


root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- sales: long (nullable = true)
 |-- name_upper: string (nullable = true)
 |-- sales_squared: double (nullable = true)
 |-- id_plus_10: long (nullable = true)
 |-- is_valid: boolean (nullable = false)
 |-- score: double (nullable = false)



**4) Adding Multiple New Columns from a Dictionary**

In [0]:
# Dictionary containing new column names and their literal values
new_columns_dict = {"city": "New York", "country": "USA"}

# Add multiple columns using withColumns()
df_with_new_cols = df.withColumns(
    {col_name: lit(value) for col_name, value in new_columns_dict.items()}
)

display(df_with_new_cols)

id,name,sales,country,status,city
1,Roja,3,USA,Active,New York
2,Bibin,4,USA,Active,New York
3,Rajesh,5,USA,Active,New York
4,Priya,6,USA,Active,New York
5,Mohan,9,USA,Active,New York


**5) Using select + dictionary unpacking (one-shot)**

In [0]:
df_dict = df.select("*", *[F.lit(v).alias(k) for k, v in col_dict.items()])
display(df_dict)

id,name,sales,country,status,is_valid,score
1,Roja,3,India,Active,True,95.5
2,Bibin,4,India,Active,True,95.5
3,Rajesh,5,India,Active,True,95.5
4,Priya,6,India,Active,True,95.5
5,Mohan,9,India,Active,True,95.5


      [F.lit(v).alias(k) for k, v in col_dict.items()]

- Loops through each **key, value** pair in the dictionary **col_dict**.
- **F.lit(v)** → creates a literal column (constant value for every row).
- **.alias(k)** → names that new column with the dictionary key.

     col_dict = {"is_valid": True, "score": 95.5}
     [F.lit("True").alias("is_valid"), F.lit(95.5).alias("score")]
     
     col_dict = {"country": "India", "status": "Active"}
     [F.lit("India").alias("country"), F.lit("Active").alias("status")]

- The * before the **list**
  - **Unpacks the list** so each element is passed as a separate argument to select.
  - Equivalent to writing:

        df.select("*", F.lit("India").alias("country"), F.lit("Active").alias("status"))