#### **PySpark: lit()**

- The name **lit** stands for **literal**.
- **lit()** can be used with various **constant values, including strings, integers, floats, and booleans**.
- It enables you to **create a column** with a **constant value** that can be used for **various purposes**:
  - adding metadata
  - flagging specific rows
  - performing calculations based on a fixed value.

**Different usage of lit() function**

     1) Creating a column by constant value
     2) Concatenating Strings (two columns)
     3) Building complex-type columns (array, struct, map)
     4) Assigning default values

In [0]:
# Column names, in the same order as the values of each row tuple
columns = ["EmpId", "first_name", "last_name", "City", "Age"]

# Sample employee rows; the last two rows carry None for City
data = [
    (101, "Girish", "Kumar", "Bangalore", 26),
    (102, "Ramesh", "Rathod", "Chennai", 29),
    (103, "Somesh", "Sekar", "Hyderabad", 32),
    (104, "Pradeep", "Rao", "Pune", 25),
    (105, "Kamal", "Prathap", "Mumbai", 27),
    (106, "Kishore", "Kumar", "Nasik", 34),
    (107, "Hari", "Shetty", None, 35),
    (108, "Basha", "Azmath", None, 24),
]

# Build the DataFrame from the rows and render it
df_emp = spark.createDataFrame(data, schema=columns)
display(df_emp)

EmpId,first_name,last_name,City,Age
101,Girish,Kumar,Bangalore,26
102,Ramesh,Rathod,Chennai,29
103,Somesh,Sekar,Hyderabad,32
104,Pradeep,Rao,Pune,25
105,Kamal,Prathap,Mumbai,27
106,Kishore,Kumar,Nasik,34
107,Hari,Shetty,,35
108,Basha,Azmath,,24


In [0]:
from pyspark.sql.functions import lit, col, coalesce, concat

##### 1) Creating a column by constant value
- **lit()** can be used with various **constant values**, including:
  - strings
  - integers
  - floats
  - booleans

In [0]:
# Append constant and derived columns in one select(); each new column is
# built from lit() literals (the derived one also reads the existing Age).
df_with_constant = df_emp.select(
    "*",
    lit("USA").alias("Country"),                 # string literal
    lit(5).alias("DiscountRate"),                # integer literal
    lit(18.5).alias("TaxRate"),                  # floating-point literal
    lit(True).alias("InStock"),                  # boolean literal
    # Age * 1000, then scaled by (1 + 0.18); all constants supplied via lit()
    ((col("Age") * lit(1000)) * (lit(1) + lit(0.18))).alias("TotalPriceWithTax"),
    lit(5).cast("string").alias("DiscountStr"),  # integer literal cast to string
)

display(df_with_constant)

EmpId,first_name,last_name,City,Age,Country,DiscountRate,TaxRate,InStock,TotalPriceWithTax,DiscountStr
101,Girish,Kumar,Bangalore,26,USA,5,18.5,True,30680.0,5
102,Ramesh,Rathod,Chennai,29,USA,5,18.5,True,34220.0,5
103,Somesh,Sekar,Hyderabad,32,USA,5,18.5,True,37760.0,5
104,Pradeep,Rao,Pune,25,USA,5,18.5,True,29500.0,5
105,Kamal,Prathap,Mumbai,27,USA,5,18.5,True,31860.0,5
106,Kishore,Kumar,Nasik,34,USA,5,18.5,True,40120.0,5
107,Hari,Shetty,,35,USA,5,18.5,True,41300.0,5
108,Basha,Azmath,,24,USA,5,18.5,True,28320.0,5


##### 2) Concatenating Strings (two columns)

In [0]:
# Build FullName as "<first_name> <last_name>"; lit(" ") supplies the separator

display(
    df_emp.select(
        "*",
        concat(col("first_name"), lit(" "), col("last_name")).alias("FullName"),
    )
)

EmpId,first_name,last_name,City,Age,FullName
101,Girish,Kumar,Bangalore,26,Girish Kumar
102,Ramesh,Rathod,Chennai,29,Ramesh Rathod
103,Somesh,Sekar,Hyderabad,32,Somesh Sekar
104,Pradeep,Rao,Pune,25,Pradeep Rao
105,Kamal,Prathap,Mumbai,27,Kamal Prathap
106,Kishore,Kumar,Nasik,34,Kishore Kumar
107,Hari,Shetty,,35,Hari Shetty
108,Basha,Azmath,,24,Basha Azmath


In [0]:
# Column names, in the same order as the values of each row tuple
columns = [
    "sales_event_guid",
    "make_id",
    "sub_version_id",
    "item_subversion_id",
    "sales_priority_id",
    "vehicle_engine_profile_id",
    "period_start_timestamp",
]

# Sample sales-event rows. period_start_timestamp is supplied as plain text.
# NOTE(review): the last two rows use hour values 25 and 29, which are outside
# 00-23 -- confirm whether that is intentional before parsing these as timestamps.
data = [
    ("guid-001", 101, 1, 0, 10, 1001, "2025-01-01 10:00:00"),
    ("guid-002", 102, 2, 1, 11, 1002, "2025-02-15 15:30:00"),
    ("guid-003", 103, 1, 2, 12, 1003, "2025-03-20 09:45:00"),
    ("guid-004", 104, 3, 0, 13, 1004, "2025-04-11 10:00:00"),
    ("guid-005", 105, 4, 1, 14, 1005, "2021-09-25 18:39:00"),
    ("guid-006", 106, 6, 2, 15, 1006, "2024-06-29 02:45:00"),
    ("guid-007", 107, 9, 0, 16, 1007, "2022-08-30 19:45:00"),
    ("guid-008", 108, 2, 1, 17, 1008, "2023-11-17 25:30:00"),
    ("guid-009", 109, 7, 2, 18, 1009, "2024-12-19 29:45:00"),
]

# Build the DataFrame from the rows and render it
df_concat = spark.createDataFrame(data, schema=columns)
display(df_concat)

sales_event_guid,make_id,sub_version_id,item_subversion_id,sales_priority_id,vehicle_engine_profile_id,period_start_timestamp
guid-001,101,1,0,10,1001,2025-01-01 10:00:00
guid-002,102,2,1,11,1002,2025-02-15 15:30:00
guid-003,103,1,2,12,1003,2025-03-20 09:45:00
guid-004,104,3,0,13,1004,2025-04-11 10:00:00
guid-005,105,4,1,14,1005,2021-09-25 18:39:00
guid-006,106,6,2,15,1006,2024-06-29 02:45:00
guid-007,107,9,0,16,1007,2022-08-30 19:45:00
guid-008,108,2,1,17,1008,2023-11-17 25:30:00
guid-009,109,7,2,18,1009,2024-12-19 29:45:00


In [0]:
# Append surrogate_key: all seven columns joined in order, with lit("-")
# providing the separator between consecutive values.
df_concat_lit = df_concat.withColumn(
    "surrogate_key",
    concat(
        col("sales_event_guid"),
        lit("-"), col("make_id"),
        lit("-"), col("sub_version_id"),
        lit("-"), col("item_subversion_id"),
        lit("-"), col("sales_priority_id"),
        lit("-"), col("vehicle_engine_profile_id"),
        lit("-"), col("period_start_timestamp"),
    ),
)

# Render the result
display(df_concat_lit)

sales_event_guid,make_id,sub_version_id,item_subversion_id,sales_priority_id,vehicle_engine_profile_id,period_start_timestamp,surrogate_key
guid-001,101,1,0,10,1001,2025-01-01 10:00:00,guid-001-101-1-0-10-1001-2025-01-01 10:00:00
guid-002,102,2,1,11,1002,2025-02-15 15:30:00,guid-002-102-2-1-11-1002-2025-02-15 15:30:00
guid-003,103,1,2,12,1003,2025-03-20 09:45:00,guid-003-103-1-2-12-1003-2025-03-20 09:45:00
guid-004,104,3,0,13,1004,2025-04-11 10:00:00,guid-004-104-3-0-13-1004-2025-04-11 10:00:00
guid-005,105,4,1,14,1005,2021-09-25 18:39:00,guid-005-105-4-1-14-1005-2021-09-25 18:39:00
guid-006,106,6,2,15,1006,2024-06-29 02:45:00,guid-006-106-6-2-15-1006-2024-06-29 02:45:00
guid-007,107,9,0,16,1007,2022-08-30 19:45:00,guid-007-107-9-0-16-1007-2022-08-30 19:45:00
guid-008,108,2,1,17,1008,2023-11-17 25:30:00,guid-008-108-2-1-17-1008-2023-11-17 25:30:00
guid-009,109,7,2,18,1009,2024-12-19 29:45:00,guid-009-109-7-2-18-1009-2024-12-19 29:45:00


##### 3) Limitations with Complex Types
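- `lit()` supplies the individual constant values; columns with complex types are built by wrapping those values in:
  - array
  - struct
  - create_map (produces a map column)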

In [0]:
from pyspark.sql.functions import col, lit, array, struct, create_map

# Column names, in the same order as the values of each row tuple
columns = ["EmpId", "Name", "Age"]

# Sample employee rows (this cell rebinds df_emp to a smaller, three-column frame)
data = [
    (101, "Girish", 26),
    (102, "Ramesh", 29),
    (103, "Somesh", 32),
    (104, "Sirisha", 39),
    (105, "Kishore", 23),
    (106, "Akash", 35),
]

df_emp = spark.createDataFrame(data, schema=columns)
display(df_emp)

EmpId,Name,Age
101,Girish,26
102,Ramesh,29
103,Somesh,32
104,Sirisha,39
105,Kishore,23
106,Akash,35


##### a) Creating an array column with lit()

In [0]:
# Attach the same three-element array of string literals to every row
df_with_array = df_emp.select(
    "*",
    array(lit("Python"), lit("SQL"), lit("Spark")).alias("Skills"),
)
display(df_with_array)

EmpId,Name,Age,Skills
101,Girish,26,"List(Python, SQL, Spark)"
102,Ramesh,29,"List(Python, SQL, Spark)"
103,Somesh,32,"List(Python, SQL, Spark)"
104,Sirisha,39,"List(Python, SQL, Spark)"
105,Kishore,23,"List(Python, SQL, Spark)"
106,Akash,35,"List(Python, SQL, Spark)"


##### b) Creating a struct column with lit()

In [0]:
# Attach the same struct to every row; alias() on each lit() names the struct field
df_with_struct = df_emp.select(
    "*",
    struct(
        lit("Bangalore").alias("City"),
        lit("Karnataka").alias("State"),
        lit(560001).alias("Pincode"),
    ).alias("Address"),
)
display(df_with_struct)

EmpId,Name,Age,Address
101,Girish,26,"List(Bangalore, Karnataka, 560001)"
102,Ramesh,29,"List(Bangalore, Karnataka, 560001)"
103,Somesh,32,"List(Bangalore, Karnataka, 560001)"
104,Sirisha,39,"List(Bangalore, Karnataka, 560001)"
105,Kishore,23,"List(Bangalore, Karnataka, 560001)"
106,Akash,35,"List(Bangalore, Karnataka, 560001)"


##### c) Creating a map column with lit()

In [0]:
from pyspark.sql.functions import create_map, lit

# Attach the same map to every row. create_map() takes alternating
# key, value arguments; every key and value here is a lit() constant.
df_with_map = df_emp.withColumn(
    "Properties",
    create_map(
        # String literals need no cast
        lit("Department"), lit("IT"),
        lit("Level"), lit("Senior"),
        # The integer literal is cast so that all map values are strings
        lit("Experience"), lit(5).cast("string"),
    )
)
display(df_with_map)

EmpId,Name,Age,Properties
101,Girish,26,"Map(Department -> IT, Level -> Senior, Experience -> 5)"
102,Ramesh,29,"Map(Department -> IT, Level -> Senior, Experience -> 5)"
103,Somesh,32,"Map(Department -> IT, Level -> Senior, Experience -> 5)"
104,Sirisha,39,"Map(Department -> IT, Level -> Senior, Experience -> 5)"
105,Kishore,23,"Map(Department -> IT, Level -> Senior, Experience -> 5)"
106,Akash,35,"Map(Department -> IT, Level -> Senior, Experience -> 5)"


##### d) All in one DataFrame

In [0]:
from pyspark.sql.functions import array, lit, struct, create_map

# Add the array, struct and map columns together, each built from lit() constants
df_all_complex = df_emp.select(
    "*",
    # Array of three string literals
    array(lit("Python"), lit("SQL"), lit("Spark")).alias("Skills"),
    # Struct whose field names come from alias() on each literal
    struct(
        lit("Bangalore").alias("City"),
        lit("Karnataka").alias("State"),
        lit(560001).alias("Pincode"),
    ).alias("Address"),
    # Map given as alternating key, value literals (all strings)
    create_map(
        lit("Department"), lit("IT"),
        lit("Level"), lit("Senior"),
        lit("Experience"), lit("5"),
    ).alias("Properties"),
)

display(df_all_complex)

EmpId,Name,Age,Skills,Address,Properties
101,Girish,26,"List(Python, SQL, Spark)","List(Bangalore, Karnataka, 560001)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
102,Ramesh,29,"List(Python, SQL, Spark)","List(Bangalore, Karnataka, 560001)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
103,Somesh,32,"List(Python, SQL, Spark)","List(Bangalore, Karnataka, 560001)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
104,Sirisha,39,"List(Python, SQL, Spark)","List(Bangalore, Karnataka, 560001)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
105,Kishore,23,"List(Python, SQL, Spark)","List(Bangalore, Karnataka, 560001)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
106,Akash,35,"List(Python, SQL, Spark)","List(Bangalore, Karnataka, 560001)","Map(Department -> IT, Level -> Senior, Experience -> 5)"


##### 4) Assigning default values

In [0]:
from pyspark.sql.functions import col, lit, coalesce, array, struct, create_map
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType

# Explicit schema, so that the "Struct" column really is a StructType and the
# "Array" / "map" columns get array and map types.
schema = (
    StructType()
    .add("EmpId", IntegerType())
    .add("Name", StringType())
    .add("City", StringType())
    .add("Age", IntegerType())
    .add("Array", ArrayType(StringType()))
    .add(
        "Struct",
        StructType()
        .add("City", StringType())
        .add("State", StringType())
        .add("Pincode", StringType()),
    )
    .add("map", MapType(StringType(), StringType()))
)

# Rows: Struct values are tuples (to match StructType) rather than dicts,
# map values are dicts, and None marks the cells that get defaults later.
data = [
    (101, "Girish", "Bangalore", 26,
     ["Python", "SQL", "Spark"],
     ("Bangalore", "Karnataka", "560001"),
     None),
    (102, "Ramesh", None, 29,
     None,
     ("Bangalore", "Karnataka", "560001"),
     None),
    (103, "Somesh", "Hyderabad", None,
     ["Python", "SQL", "PySpark"],
     None,
     {"Department": "IT", "Level": "Senior", "Experience": "5"}),
    (104, "Pradeep", None, None,
     ["Git", "SQL", "Spark"],
     None,
     {"Department": "IT", "Level": "Senior", "Experience": "5"}),
    (105, "Kamal", "Mumbai", 27,
     None,
     ("Bangalore", "Karnataka", "560001"),
     None),
    (106, "Umesh", "Bangalore", 28,
     ["Python", "Devops", "PySpark"],
     None,
     {"Department": "IT", "Level": "Senior", "Experience": "5"}),
    (107, "Bobby", None, 32,
     None,
     ("Bangalore", "Karnataka", "560001"),
     None),
    (108, "Sandhya", "Chennai", None,
     ["Python", "VS Code", "SparkSQL"],
     None,
     {"Department": "IT", "Level": "Senior", "Experience": "5"}),
    (109, "Piyush", None, None,
     None,
     ("Bangalore", "Karnataka", "560001"),
     None),
    (110, "Goel", "Mumbai", 31,
     ["AWS", "Azure", "databricks"],
     None,
     {"Department": "IT", "Level": "Senior", "Experience": "5"}),
]

df_emp = spark.createDataFrame(data, schema=schema)
display(df_emp)

EmpId,Name,City,Age,Array,Struct,map
101,Girish,Bangalore,26.0,"List(Python, SQL, Spark)","List(Bangalore, Karnataka, 560001)",
102,Ramesh,,29.0,,"List(Bangalore, Karnataka, 560001)",
103,Somesh,Hyderabad,,"List(Python, SQL, PySpark)",,"Map(Department -> IT, Level -> Senior, Experience -> 5)"
104,Pradeep,,,"List(Git, SQL, Spark)",,"Map(Department -> IT, Level -> Senior, Experience -> 5)"
105,Kamal,Mumbai,27.0,,"List(Bangalore, Karnataka, 560001)",
106,Umesh,Bangalore,28.0,"List(Python, Devops, PySpark)",,"Map(Department -> IT, Level -> Senior, Experience -> 5)"
107,Bobby,,32.0,,"List(Bangalore, Karnataka, 560001)",
108,Sandhya,Chennai,,"List(Python, VS Code, SparkSQL)",,"Map(Department -> IT, Level -> Senior, Experience -> 5)"
109,Piyush,,,,"List(Bangalore, Karnataka, 560001)",
110,Goel,Mumbai,31.0,"List(AWS, Azure, databricks)",,"Map(Department -> IT, Level -> Senior, Experience -> 5)"


In [0]:
# Fill nulls one column at a time. coalesce() keeps the existing value when it
# is non-null and otherwise falls back to the default built from lit().

# Scalar defaults
df_with_defaults = df_emp.withColumn("City", coalesce(col("City"), lit("Unknown")))
df_with_defaults = df_with_defaults.withColumn("Age", coalesce(col("Age"), lit(30)))

# Array default: three string literals
df_with_defaults = df_with_defaults.withColumn(
    "Array",
    coalesce(
        col("Array"),
        array(lit("databricks"), lit("scala"), lit("pycharm")),
    ),
)

# Struct default: field names are given by alias() on each literal
df_with_defaults = df_with_defaults.withColumn(
    "Struct",
    coalesce(
        col("Struct"),
        struct(
            lit("Indore").alias("City"),
            lit("Delhi").alias("State"),
            lit("569581").alias("Pincode"),
        ),
    ),
)

# Map default: alternating key, value string literals
df_with_defaults = df_with_defaults.withColumn(
    "map",
    coalesce(
        col("map"),
        create_map(
            lit("Department"), lit("IT"),
            lit("Level"), lit("Senior"),
            lit("Experience"), lit("5"),
        ),
    ),
)

display(df_with_defaults)

EmpId,Name,City,Age,Array,Struct,map
101,Girish,Bangalore,26,"List(Python, SQL, Spark)","List(Bangalore, Karnataka, 560001)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
102,Ramesh,Unknown,29,"List(databricks, scala, pycharm)","List(Bangalore, Karnataka, 560001)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
103,Somesh,Hyderabad,30,"List(Python, SQL, PySpark)","List(Indore, Delhi, 569581)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
104,Pradeep,Unknown,30,"List(Git, SQL, Spark)","List(Indore, Delhi, 569581)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
105,Kamal,Mumbai,27,"List(databricks, scala, pycharm)","List(Bangalore, Karnataka, 560001)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
106,Umesh,Bangalore,28,"List(Python, Devops, PySpark)","List(Indore, Delhi, 569581)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
107,Bobby,Unknown,32,"List(databricks, scala, pycharm)","List(Bangalore, Karnataka, 560001)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
108,Sandhya,Chennai,30,"List(Python, VS Code, SparkSQL)","List(Indore, Delhi, 569581)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
109,Piyush,Unknown,30,"List(databricks, scala, pycharm)","List(Bangalore, Karnataka, 560001)","Map(Department -> IT, Level -> Senior, Experience -> 5)"
110,Goel,Mumbai,31,"List(AWS, Azure, databricks)","List(Indore, Delhi, 569581)","Map(Department -> IT, Level -> Senior, Experience -> 5)"


- **coalesce(col("City"), lit("Unknown"))**
  - If City is **null**, replace it with **"Unknown"**.
  
- **coalesce(col("Age"), lit(30))**
  - If **Age** is **null**, replace it with **30**.