## Data Transformations

### Spark session initiation

In [1]:
import os

import findspark

# Locate the Spark installation. An exported, non-empty SPARK_HOME takes
# precedence so the notebook is not tied to one machine; the original local
# install path is kept as the fallback.
DEFAULT_SPARK_HOME = "/home/prabhakar/mybin/spark-3.0.2-bin-hadoop2.7-hive1.2"
findspark.init(spark_home=os.environ.get("SPARK_HOME") or DEFAULT_SPARK_HOME)

# Imported after findspark.init(), which puts pyspark on sys.path.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PySpark Data Transformations").getOrCreate()

# Print every effective Spark configuration entry as "key: value".
configs = spark.sparkContext.getConf().getAll()
for key, value in configs:
    print(f"{key}: {value}")

# Get the Spark UI URL
spark_ui_url = spark.sparkContext.uiWebUrl

print(f"Spark Job URL: {spark_ui_url}")

25/08/28 15:58:08 WARN Utils: Your hostname, DESKTOP-PFNTFJ1 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/08/28 15:58:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/08/28 15:58:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


spark.driver.port: 45409
spark.rdd.compress: True
spark.app.id: local-1756396693342
spark.serializer.objectStreamReset: 100
spark.master: local[*]
spark.submit.pyFiles: 
spark.app.startTime: 1756396691943
spark.executor.id: driver
spark.submit.deployMode: client
spark.app.name: PySpark Data Transformations
spark.driver.host: 10.255.255.254
spark.ui.showConsoleProgress: true
Spark Job URL: http://10.255.255.254:4040


## UDF - User Defined Functions

In [5]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Two-row sample DataFrame; both columns hold strings (id is "1"/"2", not a number).
data = [
    ("1", "john doe"),
    ("2", "jane smith"),
]
df = spark.createDataFrame(data, schema=["id", "name"])

# Define a UDF to capitalize names.
# The function body is plain Python code, not PySpark code.

# Drawbacks:
# The UDF will be called for each record in the DataFrame.
# A UDF is a black box for PySpark and may not be optimised well.
def capitalize_name(name):
    """Return ``name`` in title case, passing nulls through unchanged.

    A null column value reaches a Python UDF as ``None``; calling ``.title()``
    on it would raise ``AttributeError``, so ``None`` is returned as-is.

    Args:
        name: The string to capitalize, or ``None``.

    Returns:
        The title-cased string, or ``None`` when ``name`` is ``None``.
    """
    if name is None:
        return None
    return name.title()

# Wrap the plain Python function as a Spark UDF that returns strings.
# udf() is a higher-order function: it takes another function as its argument.
capitalize_udf = udf(capitalize_name, returnType=StringType())
# Inline alternative using a lambda:
# capitalize_udf = udf(lambda name: name.title(), StringType())

# Add the UDF result as a new column and display it.
name_column = df["name"]
df.withColumn("capitalized_name", capitalize_udf(name_column)).show()

+---+----------+----------------+
| id|      name|capitalized_name|
+---+----------+----------------+
|  1|  john doe|        John Doe|
|  2|jane smith|      Jane Smith|
+---+----------+----------------+



Why UDFs Are Inefficient

Lack of Optimization: PySpark UDFs are treated as black boxes by Spark's Catalyst optimizer. This prevents Spark from applying query optimizations like predicate pushdown or column pruning.

Serialization Overhead: When using Python UDFs, data must be serialized and transferred between the JVM (Spark engine) and Python runtime. This process involves significant overhead.

Row-by-Row Execution: UDFs process data row by row, which is slower compared to vectorized operations provided by Spark's built-in functions.

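Null Handling Issues: UDFs require explicit null handling. Failing to do so can lead to runtime errors. For example, calling a string method such as `.title()` on a null (`None`) value raises an `AttributeError` inside the UDF.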

### String Functions

In [10]:
from pyspark.sql.functions import lower, upper, col
# Two ways to reference the same "name" column:
# col("name") looks it up by name, df["name"] goes through the DataFrame.
upper_by_name = upper(col("name"))
upper_by_frame = upper(df["name"])

df.withColumn("upper_name", upper_by_name).show()
df.withColumn("upper_name", upper_by_frame).show()

# Chained withColumn calls: the second can refer to the column added by the first.
(
    df.withColumn("upper_name", upper_by_frame)
    .withColumn("lower_name", lower(col("upper_name")))
    .show()
)

+---+----------+----------+
| id|      name|upper_name|
+---+----------+----------+
|  1|  john doe|  JOHN DOE|
|  2|jane smith|JANE SMITH|
+---+----------+----------+

+---+----------+----------+
| id|      name|upper_name|
+---+----------+----------+
|  1|  john doe|  JOHN DOE|
|  2|jane smith|JANE SMITH|
+---+----------+----------+

+---+----------+----------+----------+
| id|      name|upper_name|lower_name|
+---+----------+----------+----------+
|  1|  john doe|  JOHN DOE|  john doe|
|  2|jane smith|JANE SMITH|jane smith|
+---+----------+----------+----------+



### Date Time Manipulations

In [32]:
from pyspark.sql.functions import current_date, unix_timestamp, to_date, from_unixtime
from pyspark.sql.types import IntegerType
# Today's date as a new column.
df.withColumn("curr_date", current_date()).show()

# unix_timestamp() with no arguments yields the current time in epoch seconds.
epoch_df = df.withColumn("epoch_time", unix_timestamp())
epoch_df.show()

# Round trip: epoch seconds -> timestamp string -> date.
(
    epoch_df
    .withColumn("curr_timestamp", from_unixtime(col("epoch_time")))
    .withColumn("curr_date", to_date(col("curr_timestamp")))
    .show()
)

+---+----------+----------+
| id|      name| curr_date|
+---+----------+----------+
|  1|  john doe|2025-08-27|
|  2|jane smith|2025-08-27|
+---+----------+----------+

+---+----------+----------+
| id|      name|epoch_time|
+---+----------+----------+
|  1|  john doe|1756314739|
|  2|jane smith|1756314739|
+---+----------+----------+

+---+----------+----------+-------------------+----------+
| id|      name|epoch_time|     curr_timestamp| curr_date|
+---+----------+----------+-------------------+----------+
|  1|  john doe|1756314739|2025-08-27 17:12:19|2025-08-27|
|  2|jane smith|1756314739|2025-08-27 17:12:19|2025-08-27|
+---+----------+----------+-------------------+----------+



### Numeric Functions

In [37]:
# NOTE: importing `abs` here shadows Python's builtin abs() for the rest of the notebook.
from pyspark.sql.functions import abs, sqrt

# "id" is a string column; sqrt() still yields numeric results (see the output below).
root_of_id = sqrt(col("id"))

# Unaliased expression: the result column is named SQRT(id).
df.select(root_of_id).show()

# Named through withColumn, then projected next to the original id.
df.withColumn("sqrt_value", root_of_id).select("id", "sqrt_value").show()

+------------------+
|          SQRT(id)|
+------------------+
|               1.0|
|1.4142135623730951|
+------------------+

+---+------------------+
| id|        sqrt_value|
+---+------------------+
|  1|               1.0|
|  2|1.4142135623730951|
+---+------------------+



### Conditional Expressions

In [41]:
from pyspark.sql.functions import when
# Upper-case the name only for rows with id > 1; every other row keeps its name as-is.
needs_upper = col("id") > 1
conditional_name = when(needs_upper, upper(col("name"))).otherwise(col("name"))
df.withColumn("capitalize_name", conditional_name).show()

+---+----------+---------------+
| id|      name|capitalize_name|
+---+----------+---------------+
|  1|  john doe|       john doe|
|  2|jane smith|     JANE SMITH|
+---+----------+---------------+



### Type casting

In [50]:
from pyspark.sql.types import StringType, IntegerType, LongType
# Starting point: both columns are strings.
df.printSchema()

# Cast expression that turns the string "id" column into an integer.
id_as_int = col("id").cast(IntegerType())

# Option 1: keep the original column and add the cast value alongside it.
df.withColumn("id_number_type", id_as_int).printSchema()

# Option 2: project the cast column in place of the original.
df2 = df.select(id_as_int, col("name"))

# df itself is unchanged; only df2 carries the integer id.
df.printSchema()
df2.printSchema()

# Option 3: add the cast column, drop the string one, then rename the new column
# back to "id". The column order changes: "id" now comes after "name".
(
    df.withColumn("id_number_type", id_as_int)
    .drop(col("id"))
    .withColumnRenamed("id_number_type", "id")
    .printSchema()
)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- id_number_type: integer (nullable = true)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)

root
 |-- name: string (nullable = true)
 |-- id: integer (nullable = true)



In [None]:

from pyspark.sql.functions import explode

# Get a Spark session; getOrCreate() hands back the existing session when one
# is already running instead of starting a second one.
spark = SparkSession.builder.appName("ExplodeExample").getOrCreate()

# Each row pairs a person with an array of programming languages.
data = [
    ("Alice", ["Python", "SQL"]),
    ("Bob", ["Java", "Scala"]),
    ("Charlie", ["C++", "Go", "Rust"]),
]
df = spark.createDataFrame(data, schema=["name", "languages"])

# explode() emits one output row per array element, repeating the name.
language_per_row = explode("languages").alias("language")
exploded_df = df.select("name", language_per_row)

exploded_df.show()


+-------+--------+
|   name|language|
+-------+--------+
|  Alice|  Python|
|  Alice|     SQL|
|    Bob|    Java|
|    Bob|   Scala|
|Charlie|     C++|
|Charlie|      Go|
|Charlie|    Rust|
+-------+--------+



In [53]:
# (group, value) pairs: group 1 has two values, group 2 has one.
data = [
    (1, 2),
    (1, 3),
    (2, 4),
]
df = spark.createDataFrame(data, schema=["group", "value"])
df.show()
from pyspark.sql.functions import udf, collect_list
from pyspark.sql.types import LongType

def sum_of_squares(values):
    """Return the sum of the squares of every element in ``values``.

    An empty iterable yields 0.
    """
    total = 0
    for value in values:
        total += value * value
    return total

# Register the Python function as a UDF whose result type is LongType.
sum_of_squares_udf = udf(sum_of_squares, returnType=LongType())

# Gather each group's values into a single array column...
values_per_group = collect_list("value").alias("values")
df_grouped = df.groupBy("group").agg(values_per_group)

# ...then apply the UDF to that array, one call per group row.
df_result = df_grouped.withColumn("sum_squares", sum_of_squares_udf("values"))
df_result.show()




+-----+-----+
|group|value|
+-----+-----+
|    1|    2|
|    1|    3|
|    2|    4|
+-----+-----+



                                                                                

+-----+------+-----------+
|group|values|sum_squares|
+-----+------+-----------+
|    1|[2, 3]|         13|
|    2|   [4]|         16|
+-----+------+-----------+



                                                                                

In [64]:
### Parsing a JSON string with from_json()

from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, StringType



# One row holding a JSON document as a plain string.
data = [("1", '{"name":"Alice","city":"Mumbai"}')]
df = spark.createDataFrame(data, schema=["id", "json_str"])

# Schema that from_json() applies to the string: two nullable string fields.
schema = StructType(
    [
        StructField("name", StringType(), True),
        StructField("city", StringType(), True),
    ]
)

# Parse the JSON string into a struct column named "parsed".
parsed_df = df.withColumn("parsed", from_json("json_str", schema))

# "parsed.*" expands the struct into top-level columns; "parsed" keeps it nested.
flattened = parsed_df.select("id", "parsed.*")
nested = parsed_df.select("id", "parsed")

flattened.show()
nested.show()
flattened.printSchema()
nested.printSchema()

# Full schema, including the original json_str column.
parsed_df.printSchema()



+---+-----+------+
| id| name|  city|
+---+-----+------+
|  1|Alice|Mumbai|
+---+-----+------+

+---+---------------+
| id|         parsed|
+---+---------------+
|  1|[Alice, Mumbai]|
+---+---------------+

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)

root
 |-- id: string (nullable = true)
 |-- parsed: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- city: string (nullable = true)

root
 |-- id: string (nullable = true)
 |-- json_str: string (nullable = true)
 |-- parsed: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- city: string (nullable = true)



In [65]:
# Extracting JSON fields with get_json_object()







from pyspark.sql.functions import get_json_object

# Pull individual fields straight out of the JSON string by path,
# without parsing the whole document against a schema.
name_field = get_json_object("json_str", "$.name").alias("name")
city_field = get_json_object("json_str", "$.city").alias("city")
df.select("id", name_field, city_field).show()


+---+-----+------+
| id| name|  city|
+---+-----+------+
|  1|Alice|Mumbai|
+---+-----+------+



In [68]:
from pyspark.sql.functions import json_tuple

# json_tuple() extracts several fields at once; its output columns get the
# generic names c0, c1 (see the first schema printed below).
tuple_df = df.select("id", json_tuple("json_str", "name", "city"))
tuple_df.printSchema()

# toDF() renames all columns positionally to give the fields real names.
tuple_df.toDF("id", "name", "city").printSchema()


root
 |-- id: string (nullable = true)
 |-- c0: string (nullable = true)
 |-- c1: string (nullable = true)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)



In [69]:
## Converting STRUCT to JSON with to_json()



from pyspark.sql.functions import to_json

# to_json() serialises the "parsed" struct column back into a JSON string.
# Relies on parsed_df built in the from_json() cell earlier in the notebook.
json_back = to_json("parsed")
jsonified_df = parsed_df.withColumn("json_back", json_back)
jsonified_df.select("id", "json_back").show()


+---+--------------------+
| id|           json_back|
+---+--------------------+
|  1|{"name":"Alice","...|
+---+--------------------+

