<a href="https://colab.research.google.com/github/elbyvaz/data_engineering/blob/main/spark/corrupted_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# How should corrupted data be handled?
# That is, what to do when the value read differs from what was expected...
# https://www.linkedin.com/posts/rahul-patidar-603334163_spark-bigdata-dataengineering-activity-7049812091712524288-7BiP/?utm_source=share&utm_medium=member_ios

from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the Spark session bound to `spark`.
spark = (
    SparkSession.builder
    .appName("Apache Spark - Lidando com dados corrompidos")
    .getOrCreate()
)

# Last expression of the cell, so the notebook echoes the Spark version.
spark.version

In [None]:
# SOLUTION 1: PERMISSIVE
# Expected layout of each JSON record.
schema = "id INT, name STRING, age INT"

# Load the JSON file with the parser mode explicitly set to PERMISSIVE.
df = (
    spark.read
    .schema(schema)
    .option("mode", "PERMISSIVE")
    .json("pessoas.json")
)

# Print the resulting rows.
df.show()

# The problem ends up somewhat hidden!

In [None]:
# SOLUTION 2: KEEP THE CORRUPTED RECORD
from pyspark.sql import SparkSession

# Obtain a SparkSession through getOrCreate() so this cell can run on its own.
spark = (
    SparkSession.builder
    .appName("columnNameOfCorruptRecord_example")
    .getOrCreate()
)

# The data columns plus one extra string column, "corrupt_record", whose name
# matches the value given to the columnNameOfCorruptRecord option below.
schema = "id INT, name STRING, age INT, corrupt_record STRING"

# Load the JSON file, naming "corrupt_record" as the column for corrupt records.
df = (
    spark.read
    .schema(schema)
    .option("columnNameOfCorruptRecord", "corrupt_record")
    .json("pessoas.json")
)

# Print the rows without truncating long cell values.
df.show(truncate=False)

In [None]:
# SOLUTION 3: FAIL FAST
# Only the real data columns are declared. This read sets no
# columnNameOfCorruptRecord option, so a "corrupt_record" field in the schema
# would be treated as an ordinary data column and only add a misleading extra
# column to the output.
schema = "id INT, name STRING, age INT"

# Load the JSON file with the parser mode set to FAILFAST, which is meant to
# raise an exception on a corrupted record instead of hiding or dropping it.
df = (
    spark.read
    .schema(schema)
    .option("mode", "FAILFAST")
    .json("pessoas.json")
)

# Print the rows without truncating long cell values. If "pessoas.json" holds
# a corrupted record, the FAILFAST exception is expected to surface here.
df.show(truncate=False)

In [None]:
# SOLUTION 4: THROW THE CORRUPTED RECORD AWAY

# Expected layout of each JSON record.
schema = "id INT, name STRING, age INT"

# Load the JSON file with the parser mode set to DROPMALFORMED.
df = (
    spark.read
    .schema(schema)
    .option("mode", "DROPMALFORMED")
    .json("pessoas.json")
)

# Print the rows without truncating long cell values.
df.show(truncate=False)

In [None]:
# SOLUTION 5: STORE THE CORRUPTED RECORD ON DISK
# Databricks only:
# https://stackoverflow.com/questions/75126205/spark-badrecordspath-is-not-writing-records-to-the-path-as-expected
# Expected layout of each JSON record.
schema = "id INT, name STRING, age INT"

# Load the JSON file, passing the location for bad records via badRecordsPath.
# NOTE(review): the path below is an absolute location on one specific Windows
# machine -- confirm/adjust it for the environment where this actually runs.
# NOTE(review): the target looks like a file name; verify whether the option
# expects a directory.
df = (
    spark.read
    .schema(schema)
    .option(
        "badRecordsPath",
        "C:/Users/pedro.guerra/Downloads/AULAS/XPE/engenharia-dados/big-data-spark/aula2/registros_corrompidos.json",
    )
    .json("pessoas.json")
)

# Print the rows without truncating long cell values.
df.show(truncate=False)