# Dataframe Deep Dive (Part 1)

In [None]:
# Display the `spark` object (presumably the SparkSession pre-created by the notebook kernel - confirm).
spark

In [None]:
# Display the `sc` object (presumably the SparkContext provided by the notebook kernel - confirm).
sc

## Dataframe `Schema`

#### With inferSchema = `True`

In [None]:
# Read the orders CSV, treating the first line as a header and asking
# Spark to infer the column types from the data.
data_set = 's3://fcc-spark-example/dataset/2023/orders.csv'

df = spark.read.load(
    data_set,
    format='csv',
    header='true',
    inferSchema='true',
)

In [None]:
# Print the first 5 rows of the DataFrame read with inferSchema.
df.show(5)

In [None]:
# Print the column names and the data types Spark inferred for them.
df.printSchema()

#### With `inferSchema` = `True` and `samplingRatio` = `<some ratio>`

In [None]:
# Same read as before, but with `samplingRatio` set so that schema
# inference looks at a fraction (0.01) of the rows instead of all of them.
data_set = 's3://fcc-spark-example/dataset/2023/orders.csv'

df = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true', samplingRatio=0.01)
    .load(data_set)
)

In [None]:
# Print the first 5 rows of the DataFrame read with a sampling ratio.
df.show(5)

In [None]:
# Print the schema inferred from the sampled rows.
df.printSchema()

#### Enforcing the `schema`

In [None]:
# Path of the single orders file that the following reads load via `data_set`.
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_1.csv'

In [None]:
# Plain CSV read: no schema and no reader options are supplied,
# so Spark falls back to its defaults for column names and types.
df_without_schema = spark.read.load(data_set, format='csv')

In [None]:
# Print the first 5 rows of the DataFrame read without any schema.
df_without_schema.show(5)

In [None]:
# Print the schema Spark assigned when none was provided.
df_without_schema.printSchema()

There are 2 ways we can define/enforce the `schema`: 

#### Method 1

In [None]:
# Method 1: describe the schema as a DDL-style string
# ("<column> <type>, ...") and hand it to the reader.
orders_schema = 'order_id long, order_date timestamp, order_customer_id long, order_status string'

df_with_schema = spark.read.load(
    data_set,
    format='csv',
    schema=orders_schema,
)

In [None]:
# Print the first 5 rows of the DataFrame read with the enforced schema.
df_with_schema.show(5)

In [None]:
# Print the schema; it should reflect the types declared in `orders_schema`.
df_with_schema.printSchema()

In [None]:
# What if we enforce a schema that doesn't match the data?
# Spark will fill the mismatching column with NULLs.

# Declare order_status as LONG even though the earlier schema declared it as string.
orders_schema = 'order_id long, order_date timestamp, order_customer_id long, order_status long'

df_with_schema = spark.read.load(
    data_set,
    format='csv',
    schema=orders_schema,
)

In [None]:
# Print the first 5 rows; order_status is expected to come back as NULL
# because it was declared as long in the schema.
df_with_schema.show(5)

In [None]:
# Print the schema; order_status should now be reported with the declared long type.
df_with_schema.printSchema()

#### Method 2 (using `StructType`)

In [None]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql import types as T

data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_1.csv'

# Method 2: build the schema programmatically, one StructField per column.
orders_schema = (
    StructType()
    .add(StructField('order_id', T.LongType()))
    .add(StructField('order_date', T.DateType()))
    .add(StructField('order_customer_id', T.IntegerType()))
    .add(StructField('order_status', T.StringType()))
)

df = spark.read.load(data_set, format='csv', schema=orders_schema)

# Preview the first 5 rows read with the StructType schema.
df.show(5)

In [None]:
# Print the schema of the DataFrame read with the StructType definition.
df.printSchema()

In [None]:
# Let's try using `IntegerType` for the `order_status` column,
# which the previous schemas declared as a string.

from pyspark.sql import types as T

data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_1.csv'

orders_schema = (
    StructType()
    .add(StructField('order_id', T.LongType()))
    .add(StructField('order_date', T.DateType()))
    .add(StructField('order_customer_id', T.IntegerType()))
    .add(StructField('order_status', T.IntegerType()))
)

df = spark.read.load(data_set, format='csv', schema=orders_schema)

# Preview the first 5 rows to see how the mismatching column is read.
df.show(5)

In [None]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_1.csv'

# Every field is declared with nullable=False (third StructField argument).
orders_schema = (
    StructType()
    .add(StructField('order_id', T.LongType(), nullable=False))
    .add(StructField('order_date', T.DateType(), nullable=False))
    .add(StructField('order_customer_id', T.IntegerType(), nullable=False))
    .add(StructField('order_status', T.StringType(), nullable=False))
)

df = spark.read.load(data_set, format='csv', schema=orders_schema)

# Preview the first 5 rows read with the non-nullable schema.
df.show(5)

In PySpark, the `nullable` parameter in the schema definition using `StructType` **does not enforce** `non-nullability` in the DataFrame. It is primarily used as a hint for optimization purposes and does not restrict the presence of null values in the columns.

Even if you set `nullable=False` for the fields in the schema definition, **it does not guarantee that the corresponding columns in the DataFrame will not contain null values.**



In [None]:
# Let's see an example: declare every column as non-nullable,
# then look for rows whose order_date is NULL anyway.
import pyspark.sql.functions as F

data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_1.csv'

orders_schema = (
    StructType()
    .add(StructField('order_id', T.LongType(), nullable=False))
    .add(StructField('order_date', T.DateType(), nullable=False))
    .add(StructField('order_customer_id', T.IntegerType(), nullable=False))
    .add(StructField('order_status', T.StringType(), nullable=False))
)

df = spark.read.load(data_set, format='csv', schema=orders_schema)

# Keep only the rows where order_date is NULL and display them.
df_filtered = df.filter(F.col('order_date').isNull())
df_filtered.show()

### `Date` Column

In [None]:
# First, let's look at the 'orders_1' and 'orders_2' files

In [None]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_2.csv'

# order_date is declared as DateType without telling Spark the date format.
orders_schema = StructType([
    StructField('order_id', T.LongType()),
    StructField('order_date', T.DateType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

df = spark.read.load(data_set, format='csv', schema=orders_schema)


In [None]:
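# Left commented out on purpose: showing this DataFrame errors out,
# presumably because the dates in orders_2 are not in the format Spark expects for DateType.
# df.show()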

Two ways to deal with this:
- Load using `String` and later on change it 
- Tell Spark the exact format of the date column (using the `dateFormat` option)

#### 1. Load using `String`

In [None]:
import pyspark.sql.functions as F

data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_2.csv'

# Step 1: read order_date as a plain string so that no date parsing happens at load time.
orders_schema = StructType([
    StructField('order_id', T.LongType()),
    StructField('order_date', T.StringType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

df = spark.read.load(data_set, format='csv', schema=orders_schema)

# Step 2: replace the string column with a real date, parsed using the month-day-year pattern.
df_transformed = df.withColumn(
    'order_date',
    F.to_date('order_date', 'MM-dd-yyyy'),
)

In [None]:
# Print the first 5 rows after order_date has been converted with to_date.
df_transformed.show(5)

In [None]:
# The original `df` is unchanged: order_date is still the string column
# loaded in step 1 (withColumn returned a new DataFrame in step 2).
df.show(5)

In [None]:
# What happens if we parse the dates with the WRONG format?

data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_2.csv'

# Step 1: read order_date as a plain string.
orders_schema = StructType([
    StructField('order_id', T.LongType()),
    StructField('order_date', T.StringType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

df = spark.read.load(data_set, format='csv', schema=orders_schema)

# Step 2: convert using day-month-year, i.e. day and month swapped
# relative to the pattern used in the previous example.
df_transformed = df.withColumn(
    'order_date',
    F.to_date('order_date', 'dd-MM-yyyy'),
)

# Preview the first 5 rows produced by the wrong pattern.
df_transformed.show(5)

#### 2. Loading using `dateFormat`

In [None]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_2.csv'

# order_date is declared as DateType again; this time the reader is told how to parse it.
orders_schema = StructType([
    StructField('order_id', T.LongType()),
    StructField('order_date', T.DateType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

df = spark.read.load(
    data_set,
    format='csv',
    schema=orders_schema,
    dateFormat='MM-dd-yyyy',  # pattern the reader uses for DateType columns
)

# Preview the first 5 rows parsed with the explicit date format.
df.show(5)

#### Similarly, if a column contains both integers and strings and we load it as `IntegerType()`, the values that are not integers are read as `NULL`

In [None]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_3.csv'

# NOTE(review): orders_3 presumably holds some non-integer values in a column
# declared as IntegerType below - confirm against the file.
orders_schema = StructType([
    StructField('order_id', T.LongType()),
    StructField('order_date', T.DateType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

df = spark.read.load(
    data_set,
    format='csv',
    schema=orders_schema,
    dateFormat='M/dd/yyyy',
)

# Preview the first 5 rows.
df.show(5)

#### We can change this behaviour, as we will see next.

## Modes of reading data

When reading data with Spark, there are different modes available to handle corrupt or malformed records encountered during the read process. These modes determine how Spark should behave when it encounters such records.

- **Permissive mode:** [`DEFAULT`] Permissive mode (mode=`permissive`, which is the default) allows Spark to continue reading the data even if it encounters corrupt or malformed records. When a corrupt record is encountered, Spark tries to parse and load as much data as possible: it inserts `null` for the fields it cannot parse and keeps the malformed record in the resulting DataFrame. This mode is helpful when you want to handle corrupt records separately or perform additional error handling.

- **Failfast mode:** In this mode (mode=`failfast`), Spark fails immediately upon encountering any corrupt or malformed record. It throws an exception and stops the read operation. No data is returned. This mode is useful when you want to ensure data integrity and immediately identify any issues with the data.

- **Dropmalformed mode:** Dropmalformed mode (mode=`dropmalformed`) instructs Spark to drop any records that cannot be parsed correctly. When a malformed record is encountered, Spark excludes it from the resulting DataFrame entirely. This mode is useful when you want to discard any records that do not conform to the expected schema or format.

#### Permissive mode (`default` mode)

In [None]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_3.csv/'

# No `mode` option is set, so the read runs in the default (permissive) mode.
orders_schema = StructType([
    StructField('order_id', T.LongType()),
    StructField('order_date', T.StringType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

df = spark.read.load(data_set, format='csv', schema=orders_schema)

In [None]:
# Print the rows read in the default (permissive) mode.
df.show()

#### Failfast mode

In [None]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_3.csv/'

orders_schema = StructType([
    StructField('order_id', T.LongType()),
    StructField('order_date', T.StringType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

# Same read as the permissive example, but with mode set to failfast.
df = spark.read.load(
    data_set,
    format='csv',
    schema=orders_schema,
    mode='failfast',
)

In [None]:
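# Left commented out on purpose: in failfast mode, showing this DataFrame
# raises an exception as soon as Spark hits a malformed record.
# df.show()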

#### Dropmalformed mode

In [None]:
data_set = 's3://fcc-spark-example/dataset/2023/orders/orders_3.csv/'

orders_schema = StructType([
    StructField('order_id', T.LongType()),
    StructField('order_date', T.StringType()),
    StructField('order_customer_id', T.IntegerType()),
    StructField('order_status', T.StringType()),
])

# Same read again, this time with mode set to dropmalformed.
df = spark.read.load(
    data_set,
    format='csv',
    schema=orders_schema,
    mode='dropmalformed',
)

# Display the rows that remain after the read.
df.show()