# Spark Datatypes:
- IntegerType (Python int; `int` in SQL)
- LongType (Python int; `bigint` in SQL), example 1000000000
- FloatType (Python float, single precision)
- DoubleType (Python float, double precision), example 3.1356334643
- StringType (Python str)
- DateType (datetime.date)
- TimestampType (datetime.datetime)
- ArrayType (Python list, tuple, or array)
- MapType (Python dict)

We define a Spark DataFrame schema using Spark data types.

A schema in Spark can be defined in two ways:
- implicitly, by letting Spark infer it from the data (the `inferSchema` option)
- explicitly, by using StructType and StructField to define each column's type manually

In [18]:
from os import path
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType, TimestampType

if __name__ == "__main__":

    # Create (or reuse) a Spark session running locally on two threads.
    spark = (
        SparkSession.builder
        .master("local[2]")
        .appName("Spark schema infer")
        .getOrCreate()
    )

    # Switch the time parser to the legacy policy.
    # NOTE(review): presumably needed for parsing the date columns below;
    # confirm it is still required.
    spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

    # Explicit schema: one nullable StructField per column, in file order.
    schema = StructType([
        StructField(column_name, column_type(), True)
        for column_name, column_type in (
            ("booking_id", StringType),
            ("doj_raw", StringType),
            ("doj", DateType),
            ("day", StringType),
            ("booking_date_raw", StringType),
            ("booking_date", DateType),
            ("booking_route", StringType),
            ("route_type", StringType),
            ("seat_count", IntegerType),
            ("fare", IntegerType),
            ("ry_user_id", StringType),
            ("phone_no", StringType),
            ("gender", StringType),
            ("age", IntegerType),
        )
    ])

    # Read the CSV with a header row, applying the schema above instead of
    # letting Spark infer the column types.
    spark_df = (
        spark.read
        .format("csv")
        .schema(schema)
        .option("header", "true")
        .option("encoding", "utf-8")
        .load(r"C:\Users\shubh\OneDrive\Desktop\validating data.csv")
    )

    # Print the first five rows, then the resulting column types.
    spark_df.show(5)
    spark_df.printSchema()

+----------+----------+----------+-------+----------------+------------+--------------------+----------------+----------+----+----------+----------+------+----+
|booking_id|   doj_raw|       doj|    day|booking_date_raw|booking_date|       booking_route|      route_type|seat_count|fare|ry_user_id|  phone_no|gender| age|
+----------+----------+----------+-------+----------------+------------+--------------------+----------------+----------+----+----------+----------+------+----+
|   4231262|12/12/2023|2023-12-12|Tuesday|      12/13/2023|  2023-12-13|Vellore-Bangalore...|sub/sector_route|         1| 592|  73006237|9514576721|     M|  35|
|   4231263|12/12/2023|2023-12-12|Tuesday|      12/13/2023|  2023-12-13|       Delhi-Lucknow|         primary|         1| 499|  51671687|7908882499|     M|NULL|
|   4231264|12/12/2023|2023-12-12|Tuesday|      12/13/2023|  2023-12-13|    Hyderabad-Guntur|         primary|         1| 522|  74522222|9398969525|     F|  33|
|   4231272|12/12/2023|2023-12-12|