# Erzeugen von Dataframes

* aus einem RDD oder einer beliebigen Python-Collections
* aus anderen Dataframes
* aus Dateien/Datenbanken

Wir werden uns jede dieser Möglichkeiten anschauen.

## Initialisieren einer Sparksession

In [22]:
from pyspark.sql import SparkSession

from pyspark.sql.types import *
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("first-steps-with-dataframes").master("local[4]").getOrCreate()
sc = spark.sparkContext

In [23]:
spark

In [40]:
from IPython.display import *
display(HTML("<style>pre { white-space: pre !important; }</style>"))

## Erzeugen aus einem RDD

In [25]:
data = [
    [1, "Oliver", 10],
    [2, "Xiaofei", 20],
    [3, "Marc", 40]
]

dozenten_rdd = sc.parallelize(data)

In [4]:
dozenten_df = dozenten_rdd.toDF()

                                                                                

In [7]:
type(dozenten_rdd)

pyspark.rdd.RDD

In [8]:
dozenten_df.show()

+---+-------+---+
| _1|     _2| _3|
+---+-------+---+
|  1| Oliver| 10|
|  2|Xiaofei| 20|
|  3|   Marc| 40|
+---+-------+---+



                                                                                

In [26]:
dozenten_df = dozenten_df.toDF("ID", "Name", "Points")

In [27]:
dozenten_df.show()

+---+-------+------+
| ID|   Name|Points|
+---+-------+------+
|  1| Oliver|    10|
|  2|Xiaofei|    20|
|  3|   Marc|    40|
+---+-------+------+



In [28]:
dozenten_df.printSchema()

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Points: long (nullable = true)



## Erzeuge Dataframe aus einer Collection

In [15]:
dozenten_df = (
    spark.createDataFrame(
        data,
        "Id: long, Name: string, Pointy: long"
    )
)

In [16]:
dozenten_df.show()

+---+-------+------+
| Id|   Name|Pointy|
+---+-------+------+
|  1| Oliver|    10|
|  2|Xiaofei|    20|
|  3|   Marc|    40|
+---+-------+------+



In [17]:
dozenten_df.printSchema()

root
 |-- Id: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Pointy: long (nullable = true)



In [18]:
type(spark)

pyspark.sql.session.SparkSession

### Erzeugen aus einer Datei

In [30]:
yellow_taxis_df = spark.read.csv("YellowTaxis_202210.csv.gz")

In [31]:
yellow_taxis_df.show(4)

+--------+--------------------+--------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+--------------------+------------+--------------------+-----------+
|     _c0|                 _c1|                 _c2|            _c3|          _c4|       _c5|               _c6|         _c7|         _c8|         _c9|       _c10| _c11|   _c12|      _c13|        _c14|                _c15|        _c16|                _c17|       _c18|
+--------+--------------------+--------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+--------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_date...|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls

In [32]:
yellow_taxis_df = spark.read.option("header", True).csv("YellowTaxis_202210.csv.gz")

In [33]:
yellow_taxis_df.show(4)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1|2022-10-01T05:33:...| 2022-10-01T05:48:...|            1.0|          1.7|       1.0|                 N|         249|         107|           1|        9.5|  3.0|    0.5|      2.6

In [34]:
yellow_taxis_df.count()

                                                                                

3675412

In [21]:
yellow_taxis_df.show(2)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1|2022-10-01T05:33:...| 2022-10-01T05:48:...|            1.0|          1.7|       1.0|                 N|         249|         107|           1|        9.5|  3.0|    0.5|      2.6

In [35]:
yellow_taxis_df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- airport_fee: string (nullable = true)



*Nun aus einm TSV*

In [38]:
green_taxi_df = (
    spark.read
    .option("header", "true")
    .option("delimiter", "\t")
    .csv("GreenTaxis_202210.csv")
)

In [39]:
green_taxi_df.show(4)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorId|lpep_pickup_datetime|lpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2|2022-10-01T06:08:...| 2022-10-01T06:21:...|            1.0|         2.47|       1.0|                 N|         256|         225|         1.0|       11.5|  0.5|    0.5|      2.5

*Nun ein Json-File*

In [42]:
%less PaymentTypes.json

{"PaymentTypeID":1,"PaymentType":"Credit Card"}
{"PaymentTypeID":2,"PaymentType":"Cash"}
{"PaymentTypeID":3,"PaymentType":"No Charge"}
{"PaymentTypeID":4,"PaymentType":"Dispute"}
{"PaymentTypeID":5,"PaymentType":"Unknown"}
{"PaymentTypeID":6,"PaymentType":"Voided Trip"}


In [47]:
paymen_types_df = (
    spark.read.json("PaymentTypes.json")
)

In [49]:
paymen_types_df.show()

+-----------+-------------+
|PaymentType|PaymentTypeID|
+-----------+-------------+
|Credit Card|            1|
|       Cash|            2|
|  No Charge|            3|
|    Dispute|            4|
|    Unknown|            5|
|Voided Trip|            6|
+-----------+-------------+



# DAS kann man wohl löschen wenn es schon durch die anderen Kapitel abgedeckt ist

## Qualitätschecks

 * Entferne nicht vollständige Datensätze
 * Entferne Dublikate
 * Entferne nicht mehr gültige Daten
   

In [22]:
yellow_taxis_df.take(2)

[Row(VendorID='1', tpep_pickup_datetime='2022-10-01T05:33:41.000+05:30', tpep_dropoff_datetime='2022-10-01T05:48:39.000+05:30', passenger_count='1.0', trip_distance='1.7', RatecodeID='1.0', store_and_fwd_flag='N', PULocationID='249', DOLocationID='107', payment_type='1', fare_amount='9.5', extra='3.0', mta_tax='0.5', tip_amount='2.65', tolls_amount='0.0', improvement_surcharge='0.3', total_amount='15.95', congestion_surcharge='2.5', airport_fee='0.0'),
 Row(VendorID='2', tpep_pickup_datetime='2022-10-01T05:44:30.000+05:30', tpep_dropoff_datetime='2022-10-01T05:49:48.000+05:30', passenger_count='2.0', trip_distance='0.72', RatecodeID='1.0', store_and_fwd_flag='N', PULocationID='151', DOLocationID='238', payment_type='2', fare_amount='5.5', extra='0.5', mta_tax='0.5', tip_amount='0.0', tolls_amount='0.0', improvement_surcharge='0.3', total_amount='9.3', congestion_surcharge='2.5', airport_fee='0.0')]

In [23]:
yellow_taxis_analyzed_df = (
    yellow_taxis_df.describe("passenger_count", "trip_distance")
)

In [24]:
yellow_taxis_analyzed_df.show()

[Stage 12:>                                                         (0 + 1) / 1]

+-------+------------------+-----------------+
|summary|   passenger_count|    trip_distance|
+-------+------------------+-----------------+
|  count|           3542392|          3675412|
|   mean|1.3846934500755421|6.206976298167039|
| stddev|0.9302303297406955|640.8236808320255|
|    min|               0.0|              0.0|
|    max|               9.0|            99.89|
+-------+------------------+-----------------+



                                                                                

In [34]:
yellow_taxis_df = (
    yellow_taxis_df.where("passenger_count > 0 ").filter(col("trip_distance") > 0.0)
)

In [35]:
yellow_taxis_df.describe("passenger_count", "trip_distance").show()

[Stage 16:>                                                         (0 + 1) / 1]

+-------+------------------+------------------+
|summary|   passenger_count|     trip_distance|
+-------+------------------+------------------+
|  count|           3422296|           3422296|
|   mean|1.4132877460044369|3.6868047912858457|
| stddev|0.9196558392175687| 22.54992281754558|
|    min|               1.0|              0.01|
|    max|               9.0|             99.89|
+-------+------------------+------------------+



                                                                                

In [59]:
yellow_taxis_df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- airport_fee: string (nullable = true)



In [61]:
yellow_taxis_df.describe("tpep_pickup_datetime", "tpep_dropoff_datetime").show()

[Stage 40:>                                                         (0 + 1) / 1]

+-------+--------------------+---------------------+
|summary|tpep_pickup_datetime|tpep_dropoff_datetime|
+-------+--------------------+---------------------+
|  count|             3675412|              3675412|
|   mean|                null|                 null|
| stddev|                null|                 null|
|    min|2009-01-01T04:32:...| 2009-01-01T07:50:...|
|    max|2022-11-01T06:57:...| 2022-11-03T22:56:...|
+-------+--------------------+---------------------+



                                                                                

In [36]:
yellow_taxis_df.count()

                                                                                

3422296

### Drop Null Einträge

In [37]:
print("Vor dem droppen: " + str(yellow_taxis_df.count()))

[Stage 20:>                                                         (0 + 1) / 1]

Vor dem droppen: 3422296


                                                                                

In [38]:
yellow_taxis_df = yellow_taxis_df.na.drop('all')

In [39]:
print("Nach dem droppen: " + str(yellow_taxis_df.count()))

[Stage 22:>                                                         (0 + 1) / 1]

Nach dem droppen: 3422296


                                                                                

In [40]:
default_value_map = {
    'payment_type': 5,
    'RateCodeID': 1
}

In [41]:
yellow_taxis_df = yellow_taxis_df.na.fill(default_value_map)

In [42]:
yellow_taxis_df.count()

                                                                                

3422296

In [43]:
print("Before operation = " + str(yellow_taxis_df.count()))

[Stage 26:>                                                         (0 + 1) / 1]

Before operation = 3422296


                                                                                

In [44]:
yellow_taxis_df = yellow_taxis_df.dropDuplicates()

In [45]:
print("After operation = " + str(yellow_taxis_df.count()))



After operation = 3422295


                                                                                

### Schrenke den Zeitraum ein

In [46]:
print( "Before operation = " + str( yellow_taxis_df.count()))



Before operation = 3422295


                                                                                

In [47]:
yellow_taxis_df.show(2)

[Stage 34:>                                                         (0 + 1) / 1]

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2|2022-10-11T18:28:...| 2022-10-11T18:58:...|            2.0|        13.55|       3.0|                 N|         158|           1|           2|        0.0|  0.0|    0.0|       0.

                                                                                

In [48]:
yellow_taxis_df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = false)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- airport_fee: string (nullable = true)



In [49]:
yellow_taxis_df = yellow_taxis_df.where("lpep_pickup_datetime >= '2022-10-01' AND lpep_dropoff_datetime < '2022-11-01'")

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `lpep_pickup_datetime` cannot be resolved. Did you mean one of the following? [`tpep_pickup_datetime`, `tpep_dropoff_datetime`, `trip_distance`, `airport_fee`, `payment_type`].; line 1 pos 0;
'Filter (('lpep_pickup_datetime >= 2022-10-01) AND ('lpep_dropoff_datetime < 2022-11-01))
+- Deduplicate [DOLocationID#82, improvement_surcharge#89, tpep_dropoff_datetime#76, PULocationID#81, trip_distance#78, tolls_amount#88, RatecodeID#1568, VendorID#74, tip_amount#87, payment_type#1569, fare_amount#84, passenger_count#77, store_and_fwd_flag#80, extra#85, airport_fee#92, congestion_surcharge#91, total_amount#90, tpep_pickup_datetime#75, mta_tax#86]
   +- Project [VendorID#74, tpep_pickup_datetime#75, tpep_dropoff_datetime#76, passenger_count#77, trip_distance#78, coalesce(RatecodeID#79, cast(1 as string)) AS RatecodeID#1568, store_and_fwd_flag#80, PULocationID#81, DOLocationID#82, coalesce(payment_type#83, cast(5 as string)) AS payment_type#1569, fare_amount#84, extra#85, mta_tax#86, tip_amount#87, tolls_amount#88, improvement_surcharge#89, total_amount#90, congestion_surcharge#91, airport_fee#92]
      +- Filter atleastnnonnulls(1, VendorID#74, tpep_pickup_datetime#75, tpep_dropoff_datetime#76, passenger_count#77, trip_distance#78, RatecodeID#79, store_and_fwd_flag#80, PULocationID#81, DOLocationID#82, payment_type#83, fare_amount#84, extra#85, mta_tax#86, tip_amount#87, tolls_amount#88, improvement_surcharge#89, total_amount#90, congestion_surcharge#91, airport_fee#92)
         +- Filter (cast(trip_distance#78 as double) > 0.0)
            +- Filter (cast(passenger_count#77 as int) > 0)
               +- Relation [VendorID#74,tpep_pickup_datetime#75,tpep_dropoff_datetime#76,passenger_count#77,trip_distance#78,RatecodeID#79,store_and_fwd_flag#80,PULocationID#81,DOLocationID#82,payment_type#83,fare_amount#84,extra#85,mta_tax#86,tip_amount#87,tolls_amount#88,improvement_surcharge#89,total_amount#90,congestion_surcharge#91,airport_fee#92] csv


In [62]:
yellow_taxis_df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- airport_fee: string (nullable = true)



In [63]:
tmp = yellow_taxis_df.select("VendorId")

In [64]:
tmp.show(4)

+--------+
|VendorId|
+--------+
|       1|
|       2|
|       2|
|       1|
+--------+
only showing top 4 rows



In [75]:
tmp = yellow_taxis_df.select("VendorID", col("passenger_count").cast(IntegerType()),   col("trip_distance").alias("Tripper"))

In [76]:
tmp.show(2)

+--------+---------------+-------+
|VendorID|passenger_count|Tripper|
+--------+---------------+-------+
|       1|              1|    1.7|
|       2|              2|   0.72|
+--------+---------------+-------+
only showing top 2 rows



In [77]:
yellow_taxis_df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- airport_fee: string (nullable = true)



In [78]:
yellow_taxis_df.drop("airport_fee").printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)



In [79]:
yellow_taxis_df.explain(mode="extended")

== Parsed Logical Plan ==
Relation [VendorID#1970,tpep_pickup_datetime#1971,tpep_dropoff_datetime#1972,passenger_count#1973,trip_distance#1974,RatecodeID#1975,store_and_fwd_flag#1976,PULocationID#1977,DOLocationID#1978,payment_type#1979,fare_amount#1980,extra#1981,mta_tax#1982,tip_amount#1983,tolls_amount#1984,improvement_surcharge#1985,total_amount#1986,congestion_surcharge#1987,airport_fee#1988] csv

== Analyzed Logical Plan ==
VendorID: string, tpep_pickup_datetime: string, tpep_dropoff_datetime: string, passenger_count: string, trip_distance: string, RatecodeID: string, store_and_fwd_flag: string, PULocationID: string, DOLocationID: string, payment_type: string, fare_amount: string, extra: string, mta_tax: string, tip_amount: string, tolls_amount: string, improvement_surcharge: string, total_amount: string, congestion_surcharge: string, airport_fee: string
Relation [VendorID#1970,tpep_pickup_datetime#1971,tpep_dropoff_datetime#1972,passenger_count#1973,trip_distance#1974,RatecodeID

In [80]:
yellow_taxis_df.explain(mode="cost")

== Optimized Logical Plan ==
Relation [VendorID#1970,tpep_pickup_datetime#1971,tpep_dropoff_datetime#1972,passenger_count#1973,trip_distance#1974,RatecodeID#1975,store_and_fwd_flag#1976,PULocationID#1977,DOLocationID#1978,payment_type#1979,fare_amount#1980,extra#1981,mta_tax#1982,tip_amount#1983,tolls_amount#1984,improvement_surcharge#1985,total_amount#1986,congestion_surcharge#1987,airport_fee#1988] csv, Statistics(sizeInBytes=71.7 MiB)

== Physical Plan ==
FileScan csv [VendorID#1970,tpep_pickup_datetime#1971,tpep_dropoff_datetime#1972,passenger_count#1973,trip_distance#1974,RatecodeID#1975,store_and_fwd_flag#1976,PULocationID#1977,DOLocationID#1978,payment_type#1979,fare_amount#1980,extra#1981,mta_tax#1982,tip_amount#1983,tolls_amount#1984,improvement_surcharge#1985,total_amount#1986,congestion_surcharge#1987,airport_fee#1988] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/pupil/spark-course/course/02-Dataframes/YellowTaxis_202210...., 

In [81]:
less RateCodes.json

{"RateCodeID":1,"RateCode":"Standard rate"}
{"RateCodeID":2,"RateCode":"JFK"}
{"RateCodeID":3,"RateCode":"Newark"}
{"RateCodeID":},"RateCode":"Westchester"}
{"RateCodeID":5,"RateCode":"Negotiated fare"}
{"RateCodeID":6,"RateCode":"Group ride"}



In [83]:
rate_codes_csv = (
    spark.read.option("header", "true").option("mode", "PERMISSIVE").csv("RateCodes.csv")
)

In [85]:
rate_codes_csv.show(truncate=False)

+-----------+---------------+----------+
|RateCodeID |RateCode       |IsApproved|
+-----------+---------------+----------+
|1          |Standard Rate  |Yes       |
|2          |JFK            |Yes       |
|3          |Newark         |Yes       |
|4          |null           |null      |
|Westchester|Yes            |null      |
|5          |Negotiated fare|null      |
|6          |GroupRide      |Yes       |
+-----------+---------------+----------+



In [89]:
yellow_taxis_df.coalesce(4)

DataFrame[VendorID: string, tpep_pickup_datetime: string, tpep_dropoff_datetime: string, passenger_count: string, trip_distance: string, RatecodeID: string, store_and_fwd_flag: string, PULocationID: string, DOLocationID: string, payment_type: string, fare_amount: string, extra: string, mta_tax: string, tip_amount: string, tolls_amount: string, improvement_surcharge: string, total_amount: string, congestion_surcharge: string, airport_fee: string]

In [90]:
yellow_taxis_df.rdd.getNumPartitions()

1

In [97]:
yellow_taxis_df = (
    spark.read.option("header", "true").option("mode", "PERMISSIVE").csv("YellowTaxis_202210.csv.gz")
)

In [98]:
yellow_taxis_df.count()

                                                                                

3675412

In [99]:
yellow_taxis_df.rdd.getNumPartitions()

1

In [106]:
yellow_taxis_df = yellow_taxis_df.repartition(100)

In [107]:
yellow_taxis_df.rdd.getNumPartitions()

100

In [108]:
yellow_taxis_df.write.option("header", "true").option("dateFormat", "yyyy-MM-dd HH:mm:ss.S").mode("overwrite").csv("bla.csv")

                                                                                

In [109]:
yellow_taxis_df.write.option("header", True).option("dateFormat", "yyyy-MM-dd HH:mm:ss.S").mode("overwrite").parquet("foo.parquet")

                                                                                