# Dataframe Deep Dive (Part 1)

In [1]:
spark

In [2]:
sc

## Dataframe `Schema`

#### With `inferSchema` = `True`

In [3]:
# Full orders file; its first row holds the column names.
data_set = 's3://fcc-spark-example/dataset/2023/orders.csv'

# header     -> take column names from the first row
# inferSchema -> let Spark work out each column's type from the data
df = spark.read.options(header='true', inferSchema='true').csv(data_set)

                                                                                

In [4]:
df.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

In [5]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



#### With `inferSchema` = `True` and `samplingRatio` = `<some ratio>`

In [6]:
data_set = 's3://fcc-spark-example/dataset/2023/orders.csv'

# Same read as before, but schema inference only looks at a sample of the
# rows (samplingRatio is the fraction of rows used).
df = spark.read.csv(
    data_set,
    header='true',
    inferSchema='true',
    samplingRatio=0.01,
)

                                                                                

In [7]:
df.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

#### Enforcing the `schema`

In [8]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_1.csv'

In [9]:
# No header, no schema, no inference: columns come back named _c0, _c1, ...
# and every value is read as a string.
df_without_schema = spark.read.csv(data_set)

In [10]:
df_without_schema.show()

+---+--------------------+-----+---------------+
|_c0|                 _c1|  _c2|            _c3|
+---+--------------------+-----+---------------+
|  1|2013-07-25T00:00:...|11599|         CLOSED|
|  2|2013-07-25T00:00:...|  256|PENDING_PAYMENT|
|  3|2013-07-25T00:00:...|12111|       COMPLETE|
|  4|2013-07-25T00:00:...| 8827|         CLOSED|
|  5|2013-07-25T00:00:...|11318|       COMPLETE|
|  6|2013-07-25T00:00:...| 7130|       COMPLETE|
|  7|2013-07-25T00:00:...| 4530|       COMPLETE|
|  8|2013-07-25T00:00:...| 2911|     PROCESSING|
|  9|2013-07-25T00:00:...| 5657|PENDING_PAYMENT|
| 10|2013-07-25T00:00:...| 5648|PENDING_PAYMENT|
| 11|2013-07-25T00:00:...|  918| PAYMENT_REVIEW|
| 12|2013-07-25T00:00:...| 1837|         CLOSED|
| 13|2013-07-25T00:00:...| 9149|PENDING_PAYMENT|
| 14|2013-07-25T00:00:...| 9842|     PROCESSING|
| 15|2013-07-25T00:00:...| 2568|       COMPLETE|
| 16|2013-07-25T00:00:...| 7276|PENDING_PAYMENT|
| 17|2013-07-25T00:00:...| 2667|       COMPLETE|
| 18|2013-07-25T00:0

In [11]:
df_without_schema.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



There are two ways to define the schema:

#### Method 1

In [12]:
# Method 1: describe the schema as a DDL-style string of "name type" pairs.
orders_schema = 'order_id long, order_date date, order_customer_id long, order_status string'

df_with_schema = spark.read.schema(orders_schema).csv(data_set)

In [13]:
df_with_schema.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
|       6|2013-07-25|             7130|       COMPLETE|
|       7|2013-07-25|             4530|       COMPLETE|
|       8|2013-07-25|             2911|     PROCESSING|
|       9|2013-07-25|             5657|PENDING_PAYMENT|
|      10|2013-07-25|             5648|PENDING_PAYMENT|
|      11|2013-07-25|              918| PAYMENT_REVIEW|
|      12|2013-07-25|             1837|         CLOSED|
|      13|2013-07-25|             9149|PENDING_PAYMENT|
|      14|2013-07-25|             9842|     PROCESSING|
|      15|2013-07-25|             2568|       CO

In [14]:
df_with_schema.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [15]:
# What happens if the schema we supply does not match the data?
# Values that cannot be read as the declared type come back as NULL
# (order_status is deliberately declared as long here).

orders_schema = 'order_id long, order_date date, order_customer_id long, order_status long'

df_with_schema = spark.read.csv(data_set, schema=orders_schema)

In [16]:
df_with_schema.show()

+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|       1|2013-07-25|            11599|        null|
|       2|2013-07-25|              256|        null|
|       3|2013-07-25|            12111|        null|
|       4|2013-07-25|             8827|        null|
|       5|2013-07-25|            11318|        null|
|       6|2013-07-25|             7130|        null|
|       7|2013-07-25|             4530|        null|
|       8|2013-07-25|             2911|        null|
|       9|2013-07-25|             5657|        null|
|      10|2013-07-25|             5648|        null|
|      11|2013-07-25|              918|        null|
|      12|2013-07-25|             1837|        null|
|      13|2013-07-25|             9149|        null|
|      14|2013-07-25|             9842|        null|
|      15|2013-07-25|             2568|        null|
|      16|2013-07-25|             7276|       

#### Method 2 (using `StructType`)

In [17]:
# Import only the type classes this notebook uses rather than `import *`,
# so it is obvious where each name comes from.
from pyspark.sql.types import (
    DateType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)

data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_1.csv'

# Method 2: build the schema programmatically, one StructField per column.
# With no third argument each field is nullable.
orders_schema = StructType([
                            StructField('order_id', LongType()),
                            StructField('order_date', DateType()),
                            StructField('order_customer_id', IntegerType()),
                            StructField('order_status', StringType())
                        ])

df = (spark.read
           .format('csv')
           .schema(orders_schema)
           .load(data_set)
     )

df.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
|       6|2013-07-25|             7130|       COMPLETE|
|       7|2013-07-25|             4530|       COMPLETE|
|       8|2013-07-25|             2911|     PROCESSING|
|       9|2013-07-25|             5657|PENDING_PAYMENT|
|      10|2013-07-25|             5648|PENDING_PAYMENT|
|      11|2013-07-25|              918| PAYMENT_REVIEW|
|      12|2013-07-25|             1837|         CLOSED|
|      13|2013-07-25|             9149|PENDING_PAYMENT|
|      14|2013-07-25|             9842|     PROCESSING|
|      15|2013-07-25|             2568|       CO

In [18]:
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [19]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_1.csv'

# Same schema as the previous example, but every field is declared
# non-nullable. The flag is passed by keyword so its meaning is explicit.
orders_schema = StructType([
                            StructField('order_id', LongType(), nullable=False),
                            StructField('order_date', DateType(), nullable=False),
                            StructField('order_customer_id', IntegerType(), nullable=False),
                            StructField('order_status', StringType(), nullable=False)
                        ])

df = (spark.read
           .format('csv')
           .schema(orders_schema)
           .load(data_set)
     )

df.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
|       6|2013-07-25|             7130|       COMPLETE|
|       7|2013-07-25|             4530|       COMPLETE|
|       8|2013-07-25|             2911|     PROCESSING|
|       9|2013-07-25|             5657|PENDING_PAYMENT|
|      10|2013-07-25|             5648|PENDING_PAYMENT|
|      11|2013-07-25|              918| PAYMENT_REVIEW|
|      12|2013-07-25|             1837|         CLOSED|
|      13|2013-07-25|             9149|PENDING_PAYMENT|
|      14|2013-07-25|             9842|     PROCESSING|
|      15|2013-07-25|             2568|       CO

In PySpark, the nullable parameter in the schema definition using StructType does not enforce non-nullability in the DataFrame. It is primarily used as a hint for optimization purposes and does not restrict the presence of null values in the columns.

Even if you set nullable=False for the fields in the schema definition, it does not guarantee that the corresponding columns in the DataFrame will not contain null values.



In [20]:
import pyspark.sql.functions as F

data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_1.csv'

# Every column is declared non-nullable ...
orders_schema = StructType([
    StructField('order_id', LongType(), nullable=False),
    StructField('order_date', DateType(), nullable=False),
    StructField('order_customer_id', IntegerType(), nullable=False),
    StructField('order_status', StringType(), nullable=False),
])

df = spark.read.csv(data_set, schema=orders_schema)

# ... yet rows whose order_date is NULL are still present after the read.
df_filtered = df.where(F.col('order_date').isNull())
df_filtered.show()

+--------+----------+-----------------+------------+
|order_id|order_date|order_customer_id|order_status|
+--------+----------+-----------------+------------+
|   68884|      null|             null|    COMPLETE|
+--------+----------+-----------------+------------+



### `Date` Column

In [21]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_2.csv'

# Same four columns as before, with order_date declared as a DateType.
orders_schema = StructType([
    StructField('order_id', LongType()),
    StructField('order_date', DateType()),
    StructField('order_customer_id', IntegerType()),
    StructField('order_status', StringType()),
])

df = spark.read.schema(orders_schema).csv(data_set)


In [22]:
# This will error out: the dates in this file are written like 07-25-2013
# (see the inferSchema output further down), which presumably does not match
# the format Spark expects for a DateType column by default.
#df.show()

In [23]:
# Let Spark infer the types so we can inspect what the raw values look like.
df = spark.read.options(inferSchema=True).csv(data_set)

In [24]:
df.show()

+---+----------+-----+---------------+
|_c0|       _c1|  _c2|            _c3|
+---+----------+-----+---------------+
|  1|07-25-2013|11599|         CLOSED|
|  2|07-25-2013|  256|PENDING_PAYMENT|
|  3|07-25-2013|12111|       COMPLETE|
|  4|07-25-2013| 8827|         CLOSED|
|  5|07-25-2013|11318|       COMPLETE|
|  6|07-25-2013| 7130|       COMPLETE|
|  7|07-25-2013| 4530|       COMPLETE|
|  8|07-25-2013| 2911|     PROCESSING|
|  9|07-25-2013| 5657|PENDING_PAYMENT|
| 10|07-25-2013| 5648|PENDING_PAYMENT|
| 11|07-25-2013|  918| PAYMENT_REVIEW|
| 12|07-25-2013| 1837|         CLOSED|
| 13|07-25-2013| 9149|PENDING_PAYMENT|
| 14|07-25-2013| 9842|     PROCESSING|
| 15|07-25-2013| 2568|       COMPLETE|
| 16|07-25-2013| 7276|PENDING_PAYMENT|
| 17|07-25-2013| 2667|       COMPLETE|
| 18|07-25-2013| 1205|         CLOSED|
| 19|07-25-2013| 9488|PENDING_PAYMENT|
| 20|07-25-2013| 9198|     PROCESSING|
+---+----------+-----+---------------+
only showing top 20 rows



In [25]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)



Two ways to deal with this:

- Load the column as a `String` and convert it to a date afterwards
- Tell Spark the exact format in which the dates are written

#### Load using `String`

In [26]:
# Step 1: read order_date as a plain string so the raw value is kept as-is.

data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_2.csv'

orders_schema = StructType([
    StructField('order_id', LongType()),
    StructField('order_date', StringType()),
    StructField('order_customer_id', IntegerType()),
    StructField('order_status', StringType()),
])

df = spark.read.csv(data_set, schema=orders_schema)

df.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|07-25-2013|            11599|         CLOSED|
|       2|07-25-2013|              256|PENDING_PAYMENT|
|       3|07-25-2013|            12111|       COMPLETE|
|       4|07-25-2013|             8827|         CLOSED|
|       5|07-25-2013|            11318|       COMPLETE|
|       6|07-25-2013|             7130|       COMPLETE|
|       7|07-25-2013|             4530|       COMPLETE|
|       8|07-25-2013|             2911|     PROCESSING|
|       9|07-25-2013|             5657|PENDING_PAYMENT|
|      10|07-25-2013|             5648|PENDING_PAYMENT|
|      11|07-25-2013|              918| PAYMENT_REVIEW|
|      12|07-25-2013|             1837|         CLOSED|
|      13|07-25-2013|             9149|PENDING_PAYMENT|
|      14|07-25-2013|             9842|     PROCESSING|
|      15|07-25-2013|             2568|       CO

In [27]:
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [28]:
# Step 2: convert the string column into a real date, using the pattern
# the values are actually written in (month-day-year).

import pyspark.sql.functions as F

df_transformed = df.withColumn(
    'order_date',
    F.to_date('order_date', "MM-dd-yyyy"),
)

In [29]:
df_transformed.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
|       6|2013-07-25|             7130|       COMPLETE|
|       7|2013-07-25|             4530|       COMPLETE|
|       8|2013-07-25|             2911|     PROCESSING|
|       9|2013-07-25|             5657|PENDING_PAYMENT|
|      10|2013-07-25|             5648|PENDING_PAYMENT|
|      11|2013-07-25|              918| PAYMENT_REVIEW|
|      12|2013-07-25|             1837|         CLOSED|
|      13|2013-07-25|             9149|PENDING_PAYMENT|
|      14|2013-07-25|             9842|     PROCESSING|
|      15|2013-07-25|             2568|       CO

In [30]:
# Old df with string dataType
df.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|07-25-2013|            11599|         CLOSED|
|       2|07-25-2013|              256|PENDING_PAYMENT|
|       3|07-25-2013|            12111|       COMPLETE|
|       4|07-25-2013|             8827|         CLOSED|
|       5|07-25-2013|            11318|       COMPLETE|
|       6|07-25-2013|             7130|       COMPLETE|
|       7|07-25-2013|             4530|       COMPLETE|
|       8|07-25-2013|             2911|     PROCESSING|
|       9|07-25-2013|             5657|PENDING_PAYMENT|
|      10|07-25-2013|             5648|PENDING_PAYMENT|
|      11|07-25-2013|              918| PAYMENT_REVIEW|
|      12|07-25-2013|             1837|         CLOSED|
|      13|07-25-2013|             9149|PENDING_PAYMENT|
|      14|07-25-2013|             9842|     PROCESSING|
|      15|07-25-2013|             2568|       CO

In [31]:
# What if we parse with the WRONG pattern (day and month swapped)?
# The values do not match it, so order_date comes back as NULL.

data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_2.csv'

orders_schema = StructType([
    StructField('order_id', LongType()),
    StructField('order_date', StringType()),
    StructField('order_customer_id', IntegerType()),
    StructField('order_status', StringType()),
])

df = spark.read.schema(orders_schema).csv(data_set)

df_transformed = df.withColumn(
    'order_date',
    F.to_date('order_date', "dd-MM-yyyy"),
)

df_transformed.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|      null|            11599|         CLOSED|
|       2|      null|              256|PENDING_PAYMENT|
|       3|      null|            12111|       COMPLETE|
|       4|      null|             8827|         CLOSED|
|       5|      null|            11318|       COMPLETE|
|       6|      null|             7130|       COMPLETE|
|       7|      null|             4530|       COMPLETE|
|       8|      null|             2911|     PROCESSING|
|       9|      null|             5657|PENDING_PAYMENT|
|      10|      null|             5648|PENDING_PAYMENT|
|      11|      null|              918| PAYMENT_REVIEW|
|      12|      null|             1837|         CLOSED|
|      13|      null|             9149|PENDING_PAYMENT|
|      14|      null|             9842|     PROCESSING|
|      15|      null|             2568|       CO

#### Loading using `dateFormat`

In [32]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_2.csv'

orders_schema = StructType([
    StructField('order_id', LongType()),
    StructField('order_date', DateType()),
    StructField('order_customer_id', IntegerType()),
    StructField('order_status', StringType()),
])

# dateFormat tells the CSV reader how dates are written in the file,
# so order_date can be loaded directly as a DateType.
df = spark.read.csv(data_set, schema=orders_schema, dateFormat='MM-dd-yyyy')

df.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
|       6|2013-07-25|             7130|       COMPLETE|
|       7|2013-07-25|             4530|       COMPLETE|
|       8|2013-07-25|             2911|     PROCESSING|
|       9|2013-07-25|             5657|PENDING_PAYMENT|
|      10|2013-07-25|             5648|PENDING_PAYMENT|
|      11|2013-07-25|              918| PAYMENT_REVIEW|
|      12|2013-07-25|             1837|         CLOSED|
|      13|2013-07-25|             9149|PENDING_PAYMENT|
|      14|2013-07-25|             9842|     PROCESSING|
|      15|2013-07-25|             2568|       CO


Similarly, if a column contains both integers and strings and we load it as `IntegerType`, the values that are not integers are read as NULL. For example, given this data:

```python
+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|            aaaaa|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
|       6|2013-07-25|             bbbb|       COMPLETE|
|       7|2013-07-25|             4530|       COMPLETE|
+--------+----------+-----------------+---------------+
```
We will get:

```python
+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             null|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
|       6|2013-07-25|             null|       COMPLETE|
|       7|2013-07-25|             4530|       COMPLETE|
+--------+----------+-----------------+---------------+
```
   
We can change this behaviour, as we will see next.

## Modes of reading data

When reading data with Spark, there are different modes available to handle corrupt or malformed records encountered during the read process. These modes determine how Spark should behave when it encounters such records.

- **Failfast mode:** In this mode (mode="failfast"), Spark fails immediately upon encountering any corrupt or malformed record. It throws an exception and stops the read operation. No data is returned. This mode is useful when you want to ensure data integrity and immediately identify any issues with the data.

- **Permissive mode:** Permissive mode (mode="permissive", which is the default) allows Spark to continue reading the data even if it encounters corrupt or malformed records. When a corrupt record is encountered, Spark tries to parse and load as much data as possible. It sets the fields it could not parse to null and still includes the record in the resulting DataFrame; if the schema contains a corrupt-record column (`_corrupt_record` by default), the raw malformed line is stored there. This mode is helpful when you want to handle corrupt records separately or perform additional error handling.

- **Dropmalformed mode:** Dropmalformed mode (mode="dropmalformed") instructs Spark to drop any records that cannot be parsed correctly. When a malformed record is encountered, Spark excludes it from the resulting DataFrame entirely. This mode is useful when you want to discard any records that do not conform to the expected schema or format.

#### Permissive mode

In [33]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_3.csv/'

orders_schema = (StructType()
                 .add('order_id', LongType())
                 .add('order_date', StringType())
                 .add('order_customer_id', IntegerType())
                 .add('order_status', StringType())
                )

# No 'mode' option is set, so the reader's default mode applies.
df = spark.read.schema(orders_schema).csv(data_set)

In [34]:
df.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|   7/25/13|              256|PENDING_PAYMENT|
|       2|   7/25/13|            12111|       COMPLETE|
|       3|   7/25/13|             8827|         CLOSED|
|       4|   7/25/13|            11318|       COMPLETE|
|       5|   7/25/13|             null|       COMPLETE|
|       6|   7/25/13|             4530|       COMPLETE|
|       7|   7/25/13|             2911|     PROCESSING|
|       8|   7/25/13|             5657|PENDING_PAYMENT|
|       9|   7/25/13|             null|PENDING_PAYMENT|
|      10|   7/25/13|              918| PAYMENT_REVIEW|
|      11|   7/25/13|             1837|         CLOSED|
|      12|   7/25/13|             9149|PENDING_PAYMENT|
|      13|   7/25/13|             9842|     PROCESSING|
|      14|   7/25/13|             null|       COMPLETE|
|      15|   7/25/13|             7276|PENDING_P

#### Failfast mode

In [35]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_3.csv/'

orders_schema = StructType([
    StructField('order_id', LongType()),
    StructField('order_date', StringType()),
    StructField('order_customer_id', IntegerType()),
    StructField('order_status', StringType()),
])

# Same read as the permissive example, but with mode set to failfast.
df = spark.read.csv(data_set, schema=orders_schema, mode='failfast')

In [36]:
# This will error out: the read above uses failfast mode and this file
# contains records that do not fit the schema, so running an action such
# as show() raises an exception.
# df.show()

#### Dropmalformed mode

In [37]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_3.csv/'

orders_schema = (StructType()
                 .add('order_id', LongType())
                 .add('order_date', StringType())
                 .add('order_customer_id', IntegerType())
                 .add('order_status', StringType())
                )

# dropmalformed: records that cannot be parsed against the schema are left out.
df = spark.read.schema(orders_schema).options(mode='dropmalformed').csv(data_set)

df.show()

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|   7/25/13|              256|PENDING_PAYMENT|
|       2|   7/25/13|            12111|       COMPLETE|
|       3|   7/25/13|             8827|         CLOSED|
|       4|   7/25/13|            11318|       COMPLETE|
|       6|   7/25/13|             4530|       COMPLETE|
|       7|   7/25/13|             2911|     PROCESSING|
|       8|   7/25/13|             5657|PENDING_PAYMENT|
|      10|   7/25/13|              918| PAYMENT_REVIEW|
|      11|   7/25/13|             1837|         CLOSED|
|      12|   7/25/13|             9149|PENDING_PAYMENT|
|      13|   7/25/13|             9842|     PROCESSING|
|      15|   7/25/13|             7276|PENDING_PAYMENT|
|      16|   7/25/13|             2667|       COMPLETE|
|      17|   7/25/13|             1205|         CLOSED|
|      18|   7/25/13|             9488|PENDING_P