### Spark SQL

In [None]:
# Point findspark at the local Spark distribution. The call is kept ahead of
# the pyspark import below, matching findspark's usual usage pattern.
import findspark

findspark.init(
    spark_home="/home/prabhakar/mybin/spark-3.0.2-bin-hadoop2.7-hive1.2"
)

from pyspark.sql import SparkSession

# Build the session for this notebook (or pick up one that already exists).
spark = (
    SparkSession.builder
    .appName("PySpark Handling Nulls Practice")
    .getOrCreate()
)

# Print every configuration entry of the running SparkContext as "key: value".
configs = spark.sparkContext.getConf().getAll()
for key, value in configs:
    print(f"{key}: {value}")

# Address of the Spark web UI for this session.
spark_ui_url = spark.sparkContext.uiWebUrl
print(f"Spark Job URL: {spark_ui_url}")

25/08/30 13:20:10 WARN Utils: Your hostname, DESKTOP-PFNTFJ1 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/08/30 13:20:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/08/30 13:20:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


spark.rdd.compress: True
spark.app.name: PySpark Handling Nulls Practice
spark.app.id: local-1756560014825
spark.serializer.objectStreamReset: 100
spark.master: local[*]
spark.submit.pyFiles: 
spark.executor.id: driver
spark.submit.deployMode: client
spark.app.startTime: 1756560013393
spark.driver.host: 10.255.255.254
spark.ui.showConsoleProgress: true
spark.driver.port: 37661
Spark Job URL: http://10.255.255.254:4040


In [None]:

from pyspark.sql import functions as F

# Sample rows of (date, category, value). Three distinct dates are present,
# with category "A" appearing more often than the others.
data = [
    ("2025-08-01", "A", 10),
    ("2025-08-01", "A", 15),
    ("2025-08-01", "A", 20),
    ("2025-08-01", "B", 5),
    ("2025-08-01", "B", 30),
    ("2025-08-02", "C", 25),
    ("2025-08-02", "C", 40),
    ("2025-08-02", "D", 50),
    ("2025-08-02", "E", 60),
    ("2025-08-03", "A", 70),
    ("2025-08-03", "F", 80),
    ("2025-08-03", "G", 90),
    ("2025-08-03", "H", 100),
]

# Build the DataFrame; only column names are given, so the column types are
# inferred from the Python values.
df = spark.createDataFrame(data, ["date", "category", "value"])

print("Initial DataFrame:")
df.show()

# How many partitions the freshly created DataFrame has.
print("Number of partitions =", df.rdd.getNumPartitions())

# Repartition: a shuffle into 6 partitions keyed on "date", so rows that share
# a date end up in the same partition.
df_repart = df.repartition(6, "date")
print("Repartitioned DataFrame:")
print("Partitions =", df_repart.rdd.getNumPartitions())

# Coalesce: shrink to 2 partitions by merging existing ones (no full shuffle).
df_coalesce = df.coalesce(2)
print("Coalesced DataFrame:")
print("Partitions =", df_coalesce.rdd.getNumPartitions())

# Write the date-keyed DataFrame as Parquet, one directory per "date" value.
# Because all rows of a date share a partition, each date directory is
# expected to receive a single file. Any existing output is overwritten.
df_repart.write.mode("overwrite").partitionBy("date").parquet("output/repartitioned")


Initial DataFrame:


                                                                                

+----------+--------+-----+
|      date|category|value|
+----------+--------+-----+
|2025-08-01|       A|   10|
|2025-08-01|       A|   15|
|2025-08-01|       A|   20|
|2025-08-01|       B|    5|
|2025-08-01|       B|   30|
|2025-08-02|       C|   25|
|2025-08-02|       C|   40|
|2025-08-02|       D|   50|
|2025-08-02|       E|   60|
|2025-08-03|       A|   70|
|2025-08-03|       F|   80|
|2025-08-03|       G|   90|
|2025-08-03|       H|  100|
+----------+--------+-----+

Number of partitions = 8
Repartitioned DataFrame:
Partitions = 6
Coalesced DataFrame:
Partitions = 2


                                                                                

In [4]:
# Expose `df` to Spark SQL under the view name "modifiedtable".
df.createOrReplaceTempView("modifiedtable")

In [7]:
# spark.sql returns a DataFrame; since .show() is not called here, the cell
# displays only the DataFrame repr (column name and type), not any rows.
spark.sql("select date from modifiedtable")

DataFrame[date: string]

In [None]:
# Full contents and schema of the registered view.
modified_all_df = spark.sql("select * from modifiedtable")
modified_all_df.show()
modified_all_df.printSchema()

# The result of spark.sql is an ordinary DataFrame: it can be kept in a
# variable and inspected like any other.
prabharkar_sql = spark.sql("select date from modifiedtable")
prabharkar_sql.show()
prabharkar_sql.printSchema()

# prabharkar_sql.join(....)

# A SQL result can itself be registered as a view and queried again.
prabharkar_sql.createOrReplaceTempView("one_more_modified")
one_more_modified_df = spark.sql("select * from one_more_modified")
one_more_modified_df.show()


+----------+--------+-----+
|      date|category|value|
+----------+--------+-----+
|2025-08-01|       A|   10|
|2025-08-01|       A|   15|
|2025-08-01|       A|   20|
|2025-08-01|       B|    5|
|2025-08-01|       B|   30|
|2025-08-02|       C|   25|
|2025-08-02|       C|   40|
|2025-08-02|       D|   50|
|2025-08-02|       E|   60|
|2025-08-03|       A|   70|
|2025-08-03|       F|   80|
|2025-08-03|       G|   90|
|2025-08-03|       H|  100|
+----------+--------+-----+

root
 |-- date: string (nullable = true)
 |-- category: string (nullable = true)
 |-- value: long (nullable = true)

+----------+
|      date|
+----------+
|2025-08-01|
|2025-08-01|
|2025-08-01|
|2025-08-01|
|2025-08-01|
|2025-08-02|
|2025-08-02|
|2025-08-02|
|2025-08-02|
|2025-08-03|
|2025-08-03|
|2025-08-03|
|2025-08-03|
+----------+

root
 |-- date: string (nullable = true)

+----------+
|      date|
+----------+
|2025-08-01|
|2025-08-01|
|2025-08-01|
|2025-08-01|
|2025-08-01|
|2025-08-02|
|2025-08-02|
|2025-08-02|

### Spark SQL UDF

In [20]:
def square_of_value(value):
    """Return ``value`` multiplied by itself.

    Args:
        value: A number, or ``None`` (how a SQL NULL arrives in a Python UDF).

    Returns:
        ``value * value``, or ``None`` when ``value`` is ``None`` so that NULL
        inputs yield NULL instead of raising ``TypeError``.
    """
    if value is None:
        return None
    return value * value

# Register the Python function as a UDF callable from SQL.
# spark.udf.register is a higher-order function: it takes another function as
# an argument.
# The return type is declared explicitly: without it PySpark registers the UDF
# with a string return type, so the squares would come back as strings rather
# than numbers.
spark.udf.register("square_of_value_udf", square_of_value, "long")

spark.sql("select value, square_of_value_udf(value) from modifiedtable").show()

25/08/30 14:22:39 WARN SimpleFunctionRegistry: The function square_of_value_udf replaced a previously registered function.


+-----+--------------------------+
|value|square_of_value_udf(value)|
+-----+--------------------------+
|   10|                       100|
|   15|                       225|
|   20|                       400|
|    5|                        25|
|   30|                       900|
|   25|                       625|
|   40|                      1600|
|   50|                      2500|
|   60|                      3600|
|   70|                      4900|
|   80|                      6400|
|   90|                      8100|
|  100|                     10000|
+-----+--------------------------+



### Using SQL Functions in DataFrames

In [None]:
# Starting point: the original DataFrame.
df.show()

# Spark DataFrame DSL: add a lower-cased copy of the "category" column.
lowered_category_col = F.lower(df["category"])
df2 = df.withColumn("lowered_category", lowered_category_col)

# Experiment kept for reference: compare the type of a DataFrame with the
# type of what a chained .show() call gives back.
# df3 = df.withColumn("lowered_category", F.lower(df["category"])).show()
# print(type(df2))
# print(type(df3))

# A SQL function used from the DataFrame DSL through F.expr.
upper_category_col = F.expr("upper(lowered_category)")
df2.withColumn("upper_category", upper_category_col).show()


+----------+--------+-----+
|      date|category|value|
+----------+--------+-----+
|2025-08-01|       A|   10|
|2025-08-01|       A|   15|
|2025-08-01|       A|   20|
|2025-08-01|       B|    5|
|2025-08-01|       B|   30|
|2025-08-02|       C|   25|
|2025-08-02|       C|   40|
|2025-08-02|       D|   50|
|2025-08-02|       E|   60|
|2025-08-03|       A|   70|
|2025-08-03|       F|   80|
|2025-08-03|       G|   90|
|2025-08-03|       H|  100|
+----------+--------+-----+

+----------+--------+-----+----------------+
|      date|category|value|lowered_category|
+----------+--------+-----+----------------+
|2025-08-01|       A|   10|               a|
|2025-08-01|       A|   15|               a|
|2025-08-01|       A|   20|               a|
|2025-08-01|       B|    5|               b|
|2025-08-01|       B|   30|               b|
|2025-08-02|       C|   25|               c|
|2025-08-02|       C|   40|               c|
|2025-08-02|       D|   50|               d|
|2025-08-02|       E|   60| 