# Schemata anwenden

In [16]:
from pyspark.sql import SparkSession

spark = (SparkSession
         .builder
         .appName("schemata")
         .master("local[4]")
         .config("spark.dynamicAllocation.enabled", "false")
         .config("spark.sql.adaptive.enabled", "false")
         .getOrCreate()
        )
sc = spark.sparkContext
spark

23/08/28 17:22:45 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
from IPython.display import *
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [3]:
yellow_taxi_df = spark.read.option("header", True).csv("YellowTaxis_202210.csv.gz")
yellow_taxi_df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- airport_fee: string (nullable = true)



In Spark UI checken, dass ein Job erzeugt wurde. Aber eigentlich ist das Lesen doch eine Transformation?
Wieso dann ein Job?
Was zum SQL/DataFrame Tab in Spark UI sagen
darauf stoßen, dass die Datentypen nocht niht korrekt sind.


In [4]:
yellow_taxi_df = spark.read.option("header", True).option("inferSchema", True).csv("YellowTaxis_202210.csv.gz")
yellow_taxi_df.printSchema()

[Stage 2:>                                                          (0 + 1) / 1]

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



                                                                                

Pro/Cons automatisches erkennen des Schemas


Pro

gut während der Entwicklung um sich mit den Datenvertraut zu machen

Cons
Fehler in den Daten führen zu falschem Schema
dauert lange

Für produktive Spark Applikatione ist es deswegen empfohlen Schemata manuell anzugeben



In [5]:
from pyspark.sql.types import *

In [6]:
yellow_taxi_schema  =  (  StructType
                        ([ 
                            StructField("VendorId"               , IntegerType()   , True),
                            StructField("lpep_pickup_datetime"   , TimestampType() , True),
                            StructField("lpep_dropoff_datetime"  , TimestampType() , True),
                            StructField("passenger_count"        , DoubleType()    , True),
                            StructField("trip_distance"          , DoubleType()    , True),
                            StructField("RatecodeID"             , DoubleType()    , True),
                            StructField("store_and_fwd_flag"     , StringType()    , True),
                            StructField("PULocationID"           , IntegerType()   , True),
                            StructField("DOLocationID"           , IntegerType()   , True),
                            StructField("payment_type"           , IntegerType()   , True),
                            StructField("fare_amount"            , DoubleType()    , True),
                            StructField("extra"                  , DoubleType()    , True),
                            StructField("mta_tax"                , DoubleType()    , True),
                            StructField("tip_amount"             , DoubleType()    , True),
                            StructField("tolls_amount"           , DoubleType()    , True),
                            StructField("improvement_surcharge"  , DoubleType()    , True),
                            StructField("total_amount"           , DoubleType()    , True),
                            StructField("congestion_surcharge"   , DoubleType()    , True),
                            StructField("airport_fee"            , DoubleType()    , True)
                        ])
                       )

In [7]:
yellow_taxi_df = spark.read.option("header", True).schema(yellow_taxi_schema).csv("YellowTaxis_202210.csv.gz")

In [8]:
yellow_taxi_df.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



## Nun mit einem Json File

In [9]:
taxi_bases_df = spark.read.json("TaxiBases.json")

In [10]:
taxi_bases_df.show()

AnalysisException: Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the
referenced columns only include the internal corrupt record column
(named _corrupt_record by default). For example:
spark.read.schema(schema).csv(file).filter($"_corrupt_record".isNotNull).count()
and spark.read.schema(schema).csv(file).select("_corrupt_record").show().
Instead, you can cache or save the parsed results and then send the same query.
For example, val df = spark.read.schema(schema).csv(file).cache() and then
df.filter($"_corrupt_record".isNotNull).count().

In [11]:
taxi_bases_df = spark.read.option("multiline", True).json("TaxiBases.json")

In [12]:
taxi_bases_df.show(3, truncate=False)

+--------------------------------------------+----------+-------------------------------+------------------------------------------------+--------------+------------+----------------+--------+---------------------------+
|Address                                     |Date      |Entity Name                    |GeoLocation                                     |License Number|SHL Endorsed|Telephone Number|Time    |Type of Base               |
+--------------------------------------------+----------+-------------------------------+------------------------------------------------+--------------+------------+----------------+--------+---------------------------+
|{636, NEW YORK, 10001, NY, WEST   28 STREET}|08/15/2019|VIER-NY,LLC                    |{40.75273, (40.75273, -74.006408), -74.006408}  |B02865        |No          |6466657536      |18:03:31|BLACK CAR BASE             |
|{131, BRONX, 10468, NY, KINGSBRIDGE ROAD}   |08/15/2019|VETERANS RADIO DISPATCHER CORP.|{40.86927, (40.86927, -73.9

In [13]:
taxi_bases_df.printSchema()

root
 |-- Address: struct (nullable = true)
 |    |-- Building: string (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- Postcode: long (nullable = true)
 |    |-- State: string (nullable = true)
 |    |-- Street: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Entity Name: string (nullable = true)
 |-- GeoLocation: struct (nullable = true)
 |    |-- Latitude: double (nullable = true)
 |    |-- Location: string (nullable = true)
 |    |-- Longitude: double (nullable = true)
 |-- License Number: string (nullable = true)
 |-- SHL Endorsed: string (nullable = true)
 |-- Telephone Number: long (nullable = true)
 |-- Time: string (nullable = true)
 |-- Type of Base: string (nullable = true)



In [14]:
taxi_bases_schema = (
                    StructType
                    ([
                        StructField("License Number"         , StringType()    , True),
                        StructField("Entity Name"            , StringType()    , True),
                        StructField("Telephone Number"       , LongType()      , True),
                        StructField("SHL Endorsed"           , StringType()    , True),
                        StructField("Type of Base"           , StringType()    , True),

                        StructField("Address", 
                                        StructType
                                        ([
                                            StructField("Building"   , StringType(),   True),
                                            StructField("Street"     , StringType(),   True), 
                                            StructField("City"       , StringType(),   True), 
                                            StructField("State"      , StringType(),   True), 
                                            StructField("Postcode"   , StringType(),   True)
                                        ]),
                                    True
                                   ),
                        
                        StructField("GeoLocation", 
                                        StructType
                                        ([
                                            StructField("Latitude"   , StringType(),   True),
                                            StructField("Longitude"  , StringType(),   True), 
                                            StructField("Location"   , StringType(),   True)
                                        ]),
                                    True
                                   )  
                  ])
                )

In [15]:
taxi_bases_df = spark.read.option("multiline", True).schema(taxi_bases_schema).json("TaxiBases.json")
taxi_bases_df.show(truncate=False)

+--------------+--------------------------------------+----------------+------------+---------------------------+-----------------------------------------------------------+------------------------------------------------+
|License Number|Entity Name                           |Telephone Number|SHL Endorsed|Type of Base               |Address                                                    |GeoLocation                                     |
+--------------+--------------------------------------+----------------+------------+---------------------------+-----------------------------------------------------------+------------------------------------------------+
|B02865        |VIER-NY,LLC                           |6466657536      |No          |BLACK CAR BASE             |{636, WEST   28 STREET, NEW YORK, NY, 10001}               |{40.75273, -74.006408, (40.75273, -74.006408)}  |
|B02634        |VETERANS RADIO DISPATCHER CORP.       |7183647878      |No          |LIVERY BASE            

Was nimmst Du aus dieser Lektion mit?

Du kannst ein Schema automatisch erkennen, aber das dauert teils lange und kann auch zu nicht korrekten Ergebnissen führen. Deswegen ist es in der Praxis empfehleungswert Schemata manuell zu erzeugen und anzugeben.
