In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

if __name__ == "__main__":
   # Script entry point: load the bookings CSV, register it as a temporary
   # view, and print the ten routes with the most bookings among rows whose
   # age is between 18 and 30 (inclusive).

   # local[*] runs Spark in-process using all available cores.
   spark = (
      SparkSession.builder
      .appName("spark-sql")
      .master("local[*]")
      .getOrCreate()
   )

   # header=True takes column names from the first row; inferSchema=True lets
   # Spark derive column types from the data instead of reading everything
   # as strings.
   spark_df = spark.read.csv(
      path=r"C:\Users\shubh\OneDrive\Desktop\validating data.csv",
      header=True,
      inferSchema=True,
   )

   spark_df.show(5, truncate=False)

   # Spark SQL only runs against views or tables, so the DataFrame is
   # registered as a temporary view before it is queried.

   # Select the legacy datetime pattern parser for the to_date call below.
   spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
   # createOrReplaceTempView returns None, so its result is not bound to a
   # name; the view is referenced by the string "spark_view" inside the query.
   spark_df.createOrReplaceTempView("spark_view")

   # NOTE: the CTE adds a parsed date column (transformed_doj), but the outer
   # query does not read it yet; it only filters on age and groups by route.
   spark.sql(
      """
      WITH transformed_doj AS (
         SELECT
               *,
               to_date(doj, 'MM/dd/yyyy') AS transformed_doj
         FROM spark_view
      )
      SELECT
         booking_route,
         COUNT(*) AS booking_count
      FROM transformed_doj
      WHERE age BETWEEN 18 AND 30
      GROUP BY booking_route
      ORDER BY booking_count DESC
      LIMIT 10
      """
   ).show()

   # Spark SQL and the DataFrame API both run on top of the Spark SQL engine,
   # whose query optimizer is called Catalyst.
   # Catalyst:
   #    - analyses the query and checks it for errors
   #    - optimizes the query and generates the execution plan
   #    - the plan is then executed by the engine and the result is returned
   # Catalyst is the core of Spark SQL and is responsible for query
   # optimization and execution planning.



+----------+----------+-------+------------+------------------------------+----------------+----------+----+----------+----------+------+----+
|booking_id|doj       |day    |booking_date|booking_route                 |route_type      |seat_count|fare|ry_user_id|phone_no  |gender|age |
+----------+----------+-------+------------+------------------------------+----------------+----------+----+----------+----------+------+----+
|4231262   |12/12/2023|Tuesday|12/13/2023  |Vellore-Bangalore (Bengaluru) |sub/sector_route|1         |592 |73006237  |9514576721|M     |35  |
|4231263   |12/12/2023|Tuesday|12/13/2023  |Delhi-Lucknow                 |primary         |1         |499 |51671687  |7908882499|M     |NULL|
|4231264   |12/12/2023|Tuesday|12/13/2023  |Hyderabad-Guntur              |primary         |1         |522 |74522222  |9398969525|F     |33  |
|4231272   |12/12/2023|Tuesday|12/13/2023  |Tirupati-Bangalore (Bengaluru)|sub/sector_route|1         |499 |48241626  |9985998594|M     |34  |