# Dataframe Deep Dive (Part 1)

In [1]:
# Echo the `spark` object as the cell output.
# NOTE(review): presumably the SparkSession pre-created by the notebook environment — confirm.
spark

In [2]:
# Echo the `sc` object as the cell output.
# NOTE(review): presumably the SparkContext pre-created by the notebook environment — confirm.
sc

## Dataframe `Schema`

#### With `inferSchema` = `True`

In [3]:
# Orders CSV whose first line holds the column names.
data_set = 's3://fcc-spark-example/dataset/2023/orders.csv'

# Read the file as CSV, use the first line as the header and let Spark
# infer every column's type from the data.
df = spark.read.load(
    data_set,
    format='csv',
    header='true',
    inferSchema='true',
)

                                                                                

In [4]:
# Preview the first 5 rows of the DataFrame.
df.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [5]:
# Print the column names and the types Spark inferred for them.
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



#### With `inferSchema` = `True` and `samplingRatio` = `<some ratio>`

In [6]:
data_set = 's3://fcc-spark-example/dataset/2023/orders.csv'

# Same read as before, plus `samplingRatio`: schema inference only looks at
# a fraction (here 1%) of the rows instead of the whole file.
df = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true', samplingRatio=0.01)
    .load(data_set)
)

                                                                                

In [7]:
# Preview the first 5 rows of the DataFrame read with `samplingRatio`.
df.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [8]:
# Print the schema inferred when `samplingRatio` is set.
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



#### Enforcing the `schema`

In [9]:
# Baseline: read the CSV with neither a schema nor any reader option.
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_1.csv'

df_without_schema = spark.read.load(data_set, format='csv')

In [10]:
# Preview 5 rows; with no header or schema the columns get generated names (_c0, _c1, ...).
df_without_schema.show(5)

+---+--------------------+-----+---------------+
|_c0|                 _c1|  _c2|            _c3|
+---+--------------------+-----+---------------+
|  1|2013-07-25T00:00:...|11599|         CLOSED|
|  2|2013-07-25T00:00:...|  256|PENDING_PAYMENT|
|  3|2013-07-25T00:00:...|12111|       COMPLETE|
|  4|2013-07-25T00:00:...| 8827|         CLOSED|
|  5|2013-07-25T00:00:...|11318|       COMPLETE|
+---+--------------------+-----+---------------+
only showing top 5 rows



In [11]:
# Print the schema; every column is reported as string when no schema is supplied.
df_without_schema.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



There are two ways to define/enforce the `schema`: with a DDL-style string (Method 1) or with a `StructType` object (Method 2).

#### Method 1

In [12]:
# DDL style: the schema is a single string of comma-separated
# "<column name> <type>" pairs.
orders_schema = (
    'order_id long, '
    'order_date timestamp, '
    'order_customer_id long, '
    'order_status string'
)

# `data_set` is the path assigned in the previous load cell.
df_with_schema = spark.read.load(data_set, format='csv', schema=orders_schema)

In [13]:
# Preview 5 rows; the columns now carry the names given in the DDL schema.
df_with_schema.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [14]:
# Print the schema to confirm the enforced types.
df_with_schema.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [15]:
# What happens when the declared schema does not match the data?
# The values that cannot be converted are filled in as NULL.

# Declare `order_status` as LONG even though the file holds text there.
orders_schema = (
    'order_id long, '
    'order_date timestamp, '
    'order_customer_id long, '
    'order_status long'
)

df_with_schema = spark.read.load(data_set, format='csv', schema=orders_schema)

In [16]:
# Preview 5 rows; `order_status` comes back as null because its text values do not fit LONG.
df_with_schema.show(5)

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|       1|2013-07-25 00:00:00|            11599|        null|
|       2|2013-07-25 00:00:00|              256|        null|
|       3|2013-07-25 00:00:00|            12111|        null|
|       4|2013-07-25 00:00:00|             8827|        null|
|       5|2013-07-25 00:00:00|            11318|        null|
+--------+-------------------+-----------------+------------+
only showing top 5 rows



In [17]:
# Print the schema; `order_status` is reported with the declared (long) type.
df_with_schema.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: long (nullable = true)
 |-- order_status: long (nullable = true)



#### Method 2 (using `StructType`)

In [18]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql import types as T

data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_1.csv'

# Programmatic schema: one StructField (column name + type object) per column.
orders_schema = StructType(fields=[
    StructField('order_id', T.LongType()),
    StructField('order_date', T.DateType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

df = spark.read.load(data_set, format='csv', schema=orders_schema)

# Preview the first 5 rows read with the StructType schema.
df.show(5)

[Stage 10:>                                                         (0 + 1) / 1]

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
+--------+----------+-----------------+---------------+
only showing top 5 rows



                                                                                

In [19]:
# Print the schema produced by the StructType definition.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



### `Date` Column

In [None]:
# First, let's look at the 'orders_1' and 'orders_2' files

In [20]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_2.csv'

# Same four columns as before, with `order_date` declared as a DateType.
orders_schema = StructType([
    StructField('order_id', T.LongType()),
    StructField('order_date', T.DateType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

df = spark.read.load(data_set, format='csv', schema=orders_schema)


In [22]:
# Triggering an action on this DataFrame is expected to fail: the file holds
# dates such as '07-25-2013', which do not fit the declared DateType
# (reported error: The value '07-25-2013' of the type "STRING" cannot be cast
# to "DATE" because it is malformed).
# The exception is caught and summarised so the error is actually shown and
# the notebook still runs from top to bottom.
try:
    df.show()
except Exception as exc:  # broad on purpose: the concrete class may differ between PySpark versions
    # Keep only the first line of the message; the rest is typically a long stack trace.
    first_line = (str(exc).splitlines() or [''])[0]
    print(f'{type(exc).__name__}: {first_line}')

Two ways to deal with this:
- Load the column as a `String` and convert it to a date afterwards
- Tell Spark the exact format of the dates in the file (the `dateFormat` reader option)

#### 1. Load using `String`

In [26]:
import pyspark.sql.functions as F

data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_2.csv'

# Step 1: read `order_date` as a plain string so the load cannot fail on it.
orders_schema = StructType([
    StructField('order_id', T.LongType()),
    StructField('order_date', T.StringType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

df = spark.read.load(data_set, format='csv', schema=orders_schema)

# Step 2: parse the string into a date, spelling out the pattern used in the file.
df_transformed = df.withColumn(
    'order_date',
    F.to_date(F.col('order_date'), format="MM-dd-yyyy"),
)
df_transformed.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [27]:
# Preview 5 rows of the converted DataFrame; `order_date` is now a date.
df_transformed.show(5)

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
+--------+----------+-----------------+---------------+
only showing top 5 rows



In [28]:
# The original `df` from step 1 is unchanged: `order_date` is still the raw string,
# because `withColumn` produced a separate DataFrame (`df_transformed`).
df.show(5)

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|07-25-2013|            11599|         CLOSED|
|       2|07-25-2013|              256|PENDING_PAYMENT|
|       3|07-25-2013|            12111|       COMPLETE|
|       4|07-25-2013|             8827|         CLOSED|
|       5|07-25-2013|            11318|       COMPLETE|
+--------+----------+-----------------+---------------+
only showing top 5 rows



In [29]:
# Print the schema of the original `df`; `order_date` is still a string here.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [30]:
# What if the pattern passed to `to_date` does not match the data?

data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_2.csv'

# Step 1: read `order_date` as a plain string.
orders_schema = (
    StructType()
    .add('order_id', T.LongType())
    .add('order_date', T.StringType())
    .add('order_customer_id', T.IntegerType())
    .add('order_status', T.StringType())
)

df = spark.read.load(data_set, format='csv', schema=orders_schema)

# Step 2: convert with a deliberately WRONG pattern (day and month swapped).
df_transformed = df.withColumn(
    'order_date',
    F.to_date(F.col('order_date'), format="dd-MM-yyyy"),
)

# The dates that do not fit the pattern show up as null.
df_transformed.show(5)

[Stage 16:>                                                         (0 + 1) / 1]

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|      null|            11599|         CLOSED|
|       2|      null|              256|PENDING_PAYMENT|
|       3|      null|            12111|       COMPLETE|
|       4|      null|             8827|         CLOSED|
|       5|      null|            11318|       COMPLETE|
+--------+----------+-----------------+---------------+
only showing top 5 rows



                                                                                

#### 2. Loading using `dateFormat`

In [31]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_2.csv'

# `order_date` is declared as a DateType straight away ...
orders_schema = StructType([
    StructField('order_id', T.LongType()),
    StructField('order_date', T.DateType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

# ... and the `dateFormat` option tells the CSV reader how the dates are written.
df = spark.read.load(
    data_set,
    format='csv',
    schema=orders_schema,
    dateFormat='MM-dd-yyyy',
)

df.show(5)

[Stage 17:>                                                         (0 + 1) / 1]

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|            11599|         CLOSED|
|       2|2013-07-25|              256|PENDING_PAYMENT|
|       3|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|             8827|         CLOSED|
|       5|2013-07-25|            11318|       COMPLETE|
+--------+----------+-----------------+---------------+
only showing top 5 rows



                                                                                

#### Similarly, if a column contains both integers and strings and we load it as an `IntegerType()`, the values that are not integers come back as `NULL`

In [32]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_3.csv'

# `order_customer_id` is declared as an integer although the file also holds
# non-numeric values in that column; those values come back as NULL.
orders_schema = (StructType()
                 .add(StructField('order_id', T.LongType()))
                 .add(StructField('order_date', T.DateType()))
                 .add(StructField('order_customer_id', T.IntegerType()))
                 .add(StructField('order_status', T.StringType()))
                )

df = (spark.read
           .format('csv')
           .schema(orders_schema)
           # The dates in orders_3.csv are written like 2013-07-25, so the pattern
           # must be yyyy-MM-dd; the previous 'M/dd/yyyy' did not describe these values.
           .option('dateFormat', 'yyyy-MM-dd')
           .load(data_set)
     )

# Preview the first 5 rows; note the nulls in `order_customer_id`.
df.show(5)

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|              256|PENDING_PAYMENT|
|       2|2013-07-25|            12111|       COMPLETE|
|       3|2013-07-25|             null|         CLOSED|
|       4|2013-07-25|            11318|       COMPLETE|
|       5|2013-07-25|             null|       COMPLETE|
+--------+----------+-----------------+---------------+
only showing top 5 rows



#### We can change this behaviour, as we will see next.

## Modes of reading data

When reading data with Spark, there are different modes available to handle corrupt or malformed records encountered during the read process. These modes determine how Spark should behave when it encounters such records.

- **Permissive mode:** [`DEFAULT`] Permissive mode (mode=`permissive`, which is the default) allows Spark to continue reading the data even if it encounters corrupt or malformed records. When a corrupt record is encountered, Spark tries to parse and load as much data as possible. It sets the fields it could not parse to `null` and keeps the malformed records in the resulting DataFrame. (The raw text of a malformed record can also be captured by adding a string column, named by the `columnNameOfCorruptRecord` option — `_corrupt_record` by default — to the schema.) This mode is helpful when you want to handle corrupt records separately or perform additional error handling.

- **Failfast mode:** In this mode (mode=`failfast`), Spark fails immediately upon encountering any corrupt or malformed record. It throws an exception and stops the read operation. No data is returned. This mode is useful when you want to ensure data integrity and immediately identify any issues with the data.

- **Dropmalformed mode:** Dropmalformed mode (mode=`dropmalformed`) instructs Spark to drop any records that cannot be parsed correctly. When a malformed record is encountered, Spark excludes it from the resulting DataFrame entirely. This mode is useful when you want to discard any records that do not conform to the expected schema or format.

#### Permissive mode (`default` mode)

In [38]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_3.csv/'

# `order_customer_id` is declared as an integer although the column also
# contains strings. No `mode` option is set, so the default mode applies.
orders_schema = (
    StructType()
    .add('order_id', T.LongType())
    .add('order_date', T.StringType())
    .add('order_customer_id', T.IntegerType())
    .add('order_status', T.StringType())
)

df = spark.read.load(data_set, format='csv', schema=orders_schema)

In [39]:
# Display the rows; records with a non-integer `order_customer_id` are kept, with null in that column.
df.show()

[Stage 22:>                                                         (0 + 1) / 1]

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|              256|PENDING_PAYMENT|
|       2|2013-07-25|            12111|       COMPLETE|
|       3|2013-07-25|             null|         CLOSED|
|       4|2013-07-25|            11318|       COMPLETE|
|       5|2013-07-25|             null|       COMPLETE|
|       6|2013-07-25|             4530|       COMPLETE|
|       7|2013-07-25|             2911|     PROCESSING|
|       8|2013-07-25|             5657|PENDING_PAYMENT|
|       9|2013-07-25|             null|PENDING_PAYMENT|
|      10|2013-07-24|              918| PAYMENT_REVIEW|
|      11|2013-07-24|             1837|         CLOSED|
|      12|2013-07-24|             9149|PENDING_PAYMENT|
|      13|2013-07-24|             9842|     PROCESSING|
|      14|2013-07-24|             null|       COMPLETE|
|      15|2013-07-24|             7276|PENDING_P

                                                                                

#### Failfast mode

In [40]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_3.csv/'

# Same schema as in the permissive example.
orders_schema = (
    StructType()
    .add('order_id', T.LongType())
    .add('order_date', T.StringType())
    .add('order_customer_id', T.IntegerType())
    .add('order_status', T.StringType())
)

# Ask the reader to use the `failfast` mode for malformed records.
df = spark.read.load(
    data_set,
    format='csv',
    schema=orders_schema,
    mode='failfast',
)

In [42]:
# With mode='failfast', displaying the DataFrame is expected to raise because
# the file contains records that do not fit the schema.
# The exception is caught and summarised so the error is actually shown and
# the notebook still runs from top to bottom.
try:
    df.show()
except Exception as exc:  # broad on purpose: the concrete class may differ between PySpark versions
    # Keep only the first line of the message; the rest is typically a long stack trace.
    first_line = (str(exc).splitlines() or [''])[0]
    print(f'{type(exc).__name__}: {first_line}')

#### Dropmalformed mode

In [43]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_3.csv/'

# Same schema as in the permissive and failfast examples.
orders_schema = StructType([
    StructField('order_id', T.LongType()),
    StructField('order_date', T.StringType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

# Ask the reader to use the `dropmalformed` mode for malformed records.
df = spark.read.load(
    data_set,
    format='csv',
    schema=orders_schema,
    mode='dropmalformed',
)

# The records that could not be parsed are absent from the output.
df.show()

[Stage 24:>                                                         (0 + 1) / 1]

+--------+----------+-----------------+---------------+
|order_id|order_date|order_customer_id|   order_status|
+--------+----------+-----------------+---------------+
|       1|2013-07-25|              256|PENDING_PAYMENT|
|       2|2013-07-25|            12111|       COMPLETE|
|       4|2013-07-25|            11318|       COMPLETE|
|       6|2013-07-25|             4530|       COMPLETE|
|       7|2013-07-25|             2911|     PROCESSING|
|       8|2013-07-25|             5657|PENDING_PAYMENT|
|      10|2013-07-24|              918| PAYMENT_REVIEW|
|      11|2013-07-24|             1837|         CLOSED|
|      12|2013-07-24|             9149|PENDING_PAYMENT|
|      13|2013-07-24|             9842|     PROCESSING|
|      15|2013-07-24|             7276|PENDING_PAYMENT|
|      16|2013-07-24|             2667|       COMPLETE|
|      17|2013-07-24|             1205|         CLOSED|
|      18|2013-07-24|             9488|PENDING_PAYMENT|
|      19|2013-07-24|             9198|     PROC

                                                                                