#### **lit()**

- The PySpark **lit()** function is used to add a **constant or literal value** as a **new column** to a DataFrame.

- It can also be used to derive a **new column** based on **conditions** (for example, combined with `when()` / `otherwise()`).

In [0]:
from pyspark.sql.functions import lit, col, when, current_timestamp, to_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType
from datetime import datetime

#### **EX 01**

In [0]:
# Sample student records. Each tuple is paired with the field names below to
# form one dict per student (same keys, same order, same values for every row).
data = [
    dict(zip(('rollno', 'name', 'age', 'height', 'weight', 'address'), record))
    for record in (
        ('01', 'sravan',       23, 5.79, 67, 'Guntur'),
        ('02', 'ojaswi',       26, 3.79, 34, 'Hyderabad'),
        ('03', 'gnanesh',      37, 2.79, 37, 'Chennai'),
        ('04', 'rohith',       29, 3.69, 28, 'Bangalore'),
        ('05', 'sridevi',      45, 5.59, 54, 'Hyderabad'),
        ('01', 'Amit Mishra',  26, 5.79, 67, 'Delhi'),
        ('02', 'Niraj Guptha', 56, 4.99, 34, 'Mumbai'),
        ('03', 'Sridharan',    57, 3.79, 47, 'Delhi'),
        ('04', 'Kiran',        49, 4.69, 38, 'Bangalore'),
        ('05', 'Dhiraj',       42, 6.00, 34, 'Nasik'),
        ('05', 'Dhiraj',       42, 6.00, 34, 'Kolkata'),
        ('05', 'Dhiraj',       42, 6.00, 34, 'Gurgaon'),
    )
]

# Build the DataFrame from the list of dicts; no schema is supplied, so Spark
# infers the column types from the Python values.
df = spark.createDataFrame(data)

# Show every row untruncated, then the inferred schema.
df.show(truncate=False)
df.printSchema()

+---------+---+------+------------+------+------+
|address  |age|height|name        |rollno|weight|
+---------+---+------+------------+------+------+
|Guntur   |23 |5.79  |sravan      |01    |67    |
|Hyderabad|26 |3.79  |ojaswi      |02    |34    |
|Chennai  |37 |2.79  |gnanesh     |03    |37    |
|Bangalore|29 |3.69  |rohith      |04    |28    |
|Hyderabad|45 |5.59  |sridevi     |05    |54    |
|Delhi    |26 |5.79  |Amit Mishra |01    |67    |
|Mumbai   |56 |4.99  |Niraj Guptha|02    |34    |
|Delhi    |57 |3.79  |Sridharan   |03    |47    |
|Bangalore|49 |4.69  |Kiran       |04    |38    |
|Nasik    |42 |6.0   |Dhiraj      |05    |34    |
|Kolkata  |42 |6.0   |Dhiraj      |05    |34    |
|Gurgaon  |42 |6.0   |Dhiraj      |05    |34    |
+---------+---+------+------------+------+------+

root
 |-- address: string (nullable = true)
 |-- age: long (nullable = true)
 |-- height: double (nullable = true)
 |-- name: string (nullable = true)
 |-- rollno: string (nullable = true)
 |-- weight: long (nullable = true)

##### a) Adding Constant Column
- `lit()` wraps a Python value so that it can be added as a **new column** holding the same **constant or literal value** in every row of the DataFrame.

In [0]:
# Append "source_id": a column holding the literal 2 on every row.
df = df.select(col("*"), lit(2).alias("source_id"))

# Print all rows without truncating long values.
df.show(truncate=False)

+---------+---+------+------------+------+------+---------+
|address  |age|height|name        |rollno|weight|source_id|
+---------+---+------+------------+------+------+---------+
|Guntur   |23 |5.79  |sravan      |01    |67    |2        |
|Hyderabad|26 |3.79  |ojaswi      |02    |34    |2        |
|Chennai  |37 |2.79  |gnanesh     |03    |37    |2        |
|Bangalore|29 |3.69  |rohith      |04    |28    |2        |
|Hyderabad|45 |5.59  |sridevi     |05    |54    |2        |
|Delhi    |26 |5.79  |Amit Mishra |01    |67    |2        |
|Mumbai   |56 |4.99  |Niraj Guptha|02    |34    |2        |
|Delhi    |57 |3.79  |Sridharan   |03    |47    |2        |
|Bangalore|49 |4.69  |Kiran       |04    |38    |2        |
|Nasik    |42 |6.0   |Dhiraj      |05    |34    |2        |
|Kolkata  |42 |6.0   |Dhiraj      |05    |34    |2        |
|Gurgaon  |42 |6.0   |Dhiraj      |05    |34    |2        |
+---------+---+------+------------+------+------+---------+



In [0]:
# Append four literal columns to the existing ones:
#   "Today's Date"        - datetime.now(), captured once in Python when this cell runs
#   vehicle_description   - empty-string placeholder
#   valid_from_datetime   - fixed timestamp parsed from the string "1900-01-01 00:00:00"
#   valid_to_datetime     - fixed timestamp parsed from the string "9999-12-31 23:59:59"
df = df.select(
    col("*"),
    lit(datetime.now()).alias("Today's Date"),
    lit('').alias('vehicle_description'),
    to_timestamp(lit("1900-01-01 00:00:00"), 'yyyy-MM-dd HH:mm:ss').alias('valid_from_datetime'),
    to_timestamp(lit("9999-12-31 23:59:59"), 'yyyy-MM-dd HH:mm:ss').alias('valid_to_datetime'),
)

# Render the resulting DataFrame.
display(df)

In [0]:
# Add "PinCode_City" from the existing "address" column.
# lit() is given a Column here rather than a Python value, so the new column
# mirrors "address" row by row (see the output below).
df = df.select(col("*"), lit(df["address"]).alias("PinCode_City"))

# Render the resulting DataFrame.
display(df)

address,age,height,name,rollno,weight,source_id,PinCode_City
Guntur,23,5.79,sravan,1,67,2,Guntur
Hyderabad,26,3.79,ojaswi,2,34,2,Hyderabad
Chennai,37,2.79,gnanesh,3,37,2,Chennai
Bangalore,29,3.69,rohith,4,28,2,Bangalore
Hyderabad,45,5.59,sridevi,5,54,2,Hyderabad
Delhi,26,5.79,Amit Mishra,1,67,2,Delhi
Mumbai,56,4.99,Niraj Guptha,2,34,2,Mumbai
Delhi,57,3.79,Sridharan,3,47,2,Delhi
Bangalore,49,4.69,Kiran,4,38,2,Bangalore
Nasik,42,6.0,Dhiraj,5,34,2,Nasik


##### b) withColumn

In [0]:
# Derive "PinCode" from "address": conditions are checked top to bottom, and
# any address not listed falls back to the default literal "402343".
df = df.withColumn(
    "PinCode",
    when(col("address") == "Guntur", lit("522002"))
    .when(col("address") == "Hyderabad", lit("500001"))
    .when(col("address") == "Chennai", lit("600011"))
    .when(col("address") == "Bangalore", lit("560001"))
    .when(col("address") == "Delhi", lit("110006"))
    .when(col("address") == "Mumbai", lit("400001"))
    .otherwise(lit("402343")),
)

# Render the resulting DataFrame.
display(df)

address,age,height,name,rollno,weight,source_id,PinCode_City,PinCode
Guntur,23,5.79,sravan,1,67,2,Guntur,522002
Hyderabad,26,3.79,ojaswi,2,34,2,Hyderabad,500001
Chennai,37,2.79,gnanesh,3,37,2,Chennai,600011
Bangalore,29,3.69,rohith,4,28,2,Bangalore,560001
Hyderabad,45,5.59,sridevi,5,54,2,Hyderabad,500001
Delhi,26,5.79,Amit Mishra,1,67,2,Delhi,110006
Mumbai,56,4.99,Niraj Guptha,2,34,2,Mumbai,400001
Delhi,57,3.79,Sridharan,3,47,2,Delhi,110006
Bangalore,49,4.69,Kiran,4,38,2,Bangalore,560001
Nasik,42,6.0,Dhiraj,5,34,2,Nasik,402343


#### **EX 02**

In [0]:
# Load the CSV: the first line supplies the column names and Spark infers
# each column's type from the data.
df1 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/FileStore/tables/StructType-5.csv")
)

# Show the rows and the inferred schema.
display(df1)
df1.printSchema()

# Summarise the dataset: row count, column names, and column count.
print("Total Number of Rows: ", df1.count())
print("List of Column Names: ", df1.columns)
print("No of Columns in dataset: ", len(df1.columns))

Id,Nick_Name,First_Name,Last_Name,Type,Age
1,admin,John,Victor,Grade1,30
2,everest,Paul,Irish,Grade2,35
3,moon,Erram,Rammohan,Enginner1,29
4,service,Stalin,Rajesh,Minister,40
5,Builder,Golla,Rajasekar,Builder,43
6,Drinker,Karjala,Hari,Army,33
7,Army,Koyi,Damodar,Bettalian,37
8,Marketing,Vemparla,Harish,Manager,55
9,Politician,Devineni,Umesh,Senior,58
10,Minister,Ponguru,Narayana,Education,56


root
 |-- Id: integer (nullable = true)
 |-- Nick_Name: string (nullable = true)
 |-- First_Name: string (nullable = true)
 |-- Last_Name: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Age: integer (nullable = true)

Total Number of Rows:  17
List of Column Names:  ['Id', 'Nick_Name', 'First_Name', 'Last_Name', 'Type', 'Age']
No of Columns in dataset:  6


In [0]:
# Extend df1 with literal columns, appended after the original ones in this order:
#   source_id                 - literal 2 cast to long
#   eight placeholder columns - each an empty string
#   valid_from/to_datetime    - fixed timestamps parsed from string literals
#   "Today's Date"            - datetime.now(), captured in Python when this cell runs
#   created/updated_datetime  - current_timestamp(), evaluated by Spark when the query runs
df1 = df1.select(
    col("*"),
    lit(2).cast(LongType()).alias('source_id'),
    *[
        lit('').alias(placeholder)
        for placeholder in (
            'vehicle_buy_or_sell',
            'vehicle_description',
            'delivery_status',
            'vehicle_classification_id',
            'vehicle_product_type',
            'pricing_model_id',
            'vehicle_agreement_id',
            'commercial_vehicle_venue_id',
        )
    ],
    to_timestamp(lit("1999-01-01 00:00:00"), 'yyyy-MM-dd HH:mm:ss').alias('valid_from_datetime'),
    to_timestamp(lit("2023-12-31 23:59:59"), 'yyyy-MM-dd HH:mm:ss').alias('valid_to_datetime'),
    lit(datetime.now()).alias("Today's Date"),
    current_timestamp().alias("created_datetime"),
    current_timestamp().alias("updated_datetime"),
)

# Render the resulting DataFrame.
display(df1)

Id,Nick_Name,First_Name,Last_Name,Type,Age,source_id,vehicle_buy_or_sell,vehicle_description,delivery_status,vehicle_classification_id,vehicle_product_type,pricing_model_id,vehicle_agreement_id,commercial_vehicle_venue_id,valid_from_datetime,valid_to_datetime,Today's Date,created_datetime,updated_datetime
1,admin,John,Victor,Grade1,30,2,,,,,,,,,1999-01-01T00:00:00.000+0000,2023-12-31T23:59:59.000+0000,2024-06-13T17:45:07.801+0000,2024-06-13T17:45:08.299+0000,2024-06-13T17:45:08.299+0000
2,everest,Paul,Irish,Grade2,35,2,,,,,,,,,1999-01-01T00:00:00.000+0000,2023-12-31T23:59:59.000+0000,2024-06-13T17:45:07.801+0000,2024-06-13T17:45:08.299+0000,2024-06-13T17:45:08.299+0000
3,moon,Erram,Rammohan,Enginner1,29,2,,,,,,,,,1999-01-01T00:00:00.000+0000,2023-12-31T23:59:59.000+0000,2024-06-13T17:45:07.801+0000,2024-06-13T17:45:08.299+0000,2024-06-13T17:45:08.299+0000
4,service,Stalin,Rajesh,Minister,40,2,,,,,,,,,1999-01-01T00:00:00.000+0000,2023-12-31T23:59:59.000+0000,2024-06-13T17:45:07.801+0000,2024-06-13T17:45:08.299+0000,2024-06-13T17:45:08.299+0000
5,Builder,Golla,Rajasekar,Builder,43,2,,,,,,,,,1999-01-01T00:00:00.000+0000,2023-12-31T23:59:59.000+0000,2024-06-13T17:45:07.801+0000,2024-06-13T17:45:08.299+0000,2024-06-13T17:45:08.299+0000
6,Drinker,Karjala,Hari,Army,33,2,,,,,,,,,1999-01-01T00:00:00.000+0000,2023-12-31T23:59:59.000+0000,2024-06-13T17:45:07.801+0000,2024-06-13T17:45:08.299+0000,2024-06-13T17:45:08.299+0000
7,Army,Koyi,Damodar,Bettalian,37,2,,,,,,,,,1999-01-01T00:00:00.000+0000,2023-12-31T23:59:59.000+0000,2024-06-13T17:45:07.801+0000,2024-06-13T17:45:08.299+0000,2024-06-13T17:45:08.299+0000
8,Marketing,Vemparla,Harish,Manager,55,2,,,,,,,,,1999-01-01T00:00:00.000+0000,2023-12-31T23:59:59.000+0000,2024-06-13T17:45:07.801+0000,2024-06-13T17:45:08.299+0000,2024-06-13T17:45:08.299+0000
9,Politician,Devineni,Umesh,Senior,58,2,,,,,,,,,1999-01-01T00:00:00.000+0000,2023-12-31T23:59:59.000+0000,2024-06-13T17:45:07.801+0000,2024-06-13T17:45:08.299+0000,2024-06-13T17:45:08.299+0000
10,Minister,Ponguru,Narayana,Education,56,2,,,,,,,,,1999-01-01T00:00:00.000+0000,2023-12-31T23:59:59.000+0000,2024-06-13T17:45:07.801+0000,2024-06-13T17:45:08.299+0000,2024-06-13T17:45:08.299+0000
