In [0]:
spark

-------- Read The CSV Data File

In [0]:
read_csv_df = spark.read.format("csv")\
                        .option("header", "false")\
                        .option("infraschema", "false")\
                        .option("mode", "FAILFAST")\
                        .load("/FileStore/tables/simple_zipcodes-2.csv")
read_csv_df.show(5)

+------------+-------+-------------------+-------+-----+
|         _c0|    _c1|                _c2|    _c3|  _c4|
+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|               City|Zipcode|State|
|           1|     US|        PARC PARQUE|    704|   PR|
|           2|     US|PASEO COSTA DEL SUR|    704|   PR|
|          10|     US|       BDA SAN LUIS|    709|   PR|
|       49347|     US|               HOLT|  32564|   FL|
+------------+-------+-------------------+-------+-----+
only showing top 5 rows



Print The Default Schema

In [0]:
read_csv_df = spark.read.format("csv")\
                        .option("header", "true")\
                        .option("infraschema", "true")\
                        .option("mode", "FAILFAST")\
                        .load("/FileStore/tables/simple_zipcodes-2.csv")
read_csv_df.printSchema()

root
 |-- RecordNumber: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- State: string (nullable = true)



Create Our Own Schema

In [0]:
# import libraries
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

country_schema = StructType([
    StructField("RecordNumber", IntegerType(), True),
    StructField("Country", StringType(), True),
    StructField("City", StringType(), True),
    StructField("Zipcode", IntegerType(), True),
    StructField("State", StringType(), True)
])

# add the own schema here...
read_csv_df = spark.read.format("csv")\
                        .option("header", "true")\
                        .option("infraschema", "false")\
                        .option("mode", "FAILFAST")\
                        .schema(country_schema)\
                        .load("/FileStore/tables/simple_zipcodes-2.csv")
read_csv_df.printSchema()

root
 |-- RecordNumber: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- State: string (nullable = true)



Check Different Modes

In [0]:
# create the another schema
schema = "Id int, Name string, Country string"
data = ([1, "John", "US"], [2, "Sem", "US, NewYork"], [3, "Nikel", None], [4, "Fdk", "India, Bangalore"])

test_df = spark.createDataFrame(data, schema)
test_df.collect()
test_df.show()

# write the df into specific pat
test_df.write.format('csv').mode('overwrite').option('header', "true").save('/FileStore/tables/simple_file1.csv')
                                                                            
# check the different mode

+---+-----+----------------+
| Id| Name|         Country|
+---+-----+----------------+
|  1| John|              US|
|  2|  Sem|     US, NewYork|
|  3|Nikel|            null|
|  4|  Fdk|India, Bangalore|
+---+-----+----------------+



Check File Data Corrupted Or Not

In [0]:
%fs
ls /FileStore/tables/

path,name,size,modificationTime
dbfs:/FileStore/tables/badrecords/,badrecords/,0,0
dbfs:/FileStore/tables/multiline_zipcode-1.json,multiline_zipcode-1.json,239,1725625566000
dbfs:/FileStore/tables/multiline_zipcode-2.json,multiline_zipcode-2.json,239,1725625726000
dbfs:/FileStore/tables/multiline_zipcode.json,multiline_zipcode.json,239,1725625362000
dbfs:/FileStore/tables/simple_file1.csv/,simple_file1.csv/,0,0
dbfs:/FileStore/tables/simple_file2.csv,simple_file2.csv,139,1725879650000
dbfs:/FileStore/tables/simple_zipcodes-1.csv,simple_zipcodes-1.csv,590,1725625566000
dbfs:/FileStore/tables/simple_zipcodes-2.csv,simple_zipcodes-2.csv,590,1725626333000
dbfs:/FileStore/tables/simple_zipcodes.csv,simple_zipcodes.csv,590,1725625362000


In [0]:
# use the different modes for that need to add schema with _corrupt_record
# ----- FAILFAST----------
res_test_df = spark.read.format('csv')\
                        .option('header', "true")\
                        .option('mode', 'FAILFAST')\
                        .schema('Id int, Name string, Country string, _Corrupt_Record string')\
                        .load('/FileStore/tables/simple_file1.csv')

res_test_df.display()

# ----- DROPMALFORMED----------
res_test_df1 = spark.read.format('csv')\
                         .option('header', "true")\
                         .option('delimeter', ',')\
                         .option('mode', 'DROPMALFORMED')\
                         .schema('Id int, Name string, Country string, Nominee string, _corrupt_record string')\
                         .load('/FileStore/tables/simple_file2.csv')

res_test_df1.display()

# ----- PERMISSIVE----------
res_test_df1 = spark.read.format('csv')\
                         .option('header', "true")\
                         .option('delimeter', ',')\
                         .option('mode', 'PERMISSIVE')\
                         .schema('Id int, Name string, Country string, Nominee string, _corrupt_record string')\
                         .load('/FileStore/tables/simple_file2.csv')

res_test_df1.display()

Id,Name,Country,Nominee,_corrupt_record
1,John,US,nominee1,
2,Sem,US,newyork,"2, Sem, US, newyork, nominee2"
3,Nikel,,nominee3,
4,Fdk,India,Bangalore,"4, Fdk, India, Bangalore, nominee4"


Save/Corrupted The Corrupted Record

In [0]:
# store the corrupted records in the given file path as json format
res_test_df1 = spark.read.format('csv')\
                         .option('header', "true")\
                         .option('delimeter', ',')\
                         .schema('Id int, Name string, Country string, Nominee string, _corrupt_record string')\
                         .option('badRecordsPath', '/FileStore/tables/badrecords')\
                         .load('/FileStore/tables/simple_file2.csv')

res_test_df1.display()

Id,Name,Country,Nominee,_corrupt_record
1,John,US,nominee1,
3,Nikel,,nominee3,


------------ Read The Json Data File

In [0]:
%fs
ls /FileStore/tables/badrecords/20240910T054501/bad_records

path,name,size,modificationTime
dbfs:/FileStore/tables/badrecords/20240910T054501/bad_records/part-00000-7557b3eb-b5de-41d3-97cd-d48f56b93db5,part-00000-7557b3eb-b5de-41d3-97cd-d48f56b93db5,446,1725947104000


In [0]:
# read the corrupted record from bad record file path..
bad_record_df = spark.read.format('json').load('/FileStore/tables/badrecords/20240910T054501/bad_records/part-00000-7557b3eb-b5de-41d3-97cd-d48f56b93db5')
bad_record_df.show(truncate = False)

+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------+----------------------------------+
|path                                   |reason                                                                                                                 |record                            |
+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------+----------------------------------+
|dbfs:/FileStore/tables/simple_file2.csv|org.apache.spark.SparkRuntimeException: [MALFORMED_CSV_RECORD] Malformed CSV record: 2, Sem, US, newyork, nominee2     |2, Sem, US, newyork, nominee2     |
|dbfs:/FileStore/tables/simple_file2.csv|org.apache.spark.SparkRuntimeException: [MALFORMED_CSV_RECORD] Malformed CSV record: 4, Fdk, India, Bangalore, nominee4|4, Fdk, India, Bangalore, nominee4|
+--------------

Read The Different Json File.
Here line delimeter json is faster than multiline json

In [0]:
%fs
ls /FileStore/tables

path,name,size,modificationTime
dbfs:/FileStore/tables/Multi_line_correct.json,Multi_line_correct.json,310,1726033887000
dbfs:/FileStore/tables/Multi_line_incorrect.json,Multi_line_incorrect.json,304,1726033888000
dbfs:/FileStore/tables/badrecords/,badrecords/,0,0
dbfs:/FileStore/tables/corrupted_json.json,corrupted_json.json,218,1726033889000
dbfs:/FileStore/tables/line_delimited_json.json,line_delimited_json.json,219,1726033889000
dbfs:/FileStore/tables/multiline_nested_json.json,multiline_nested_json.json,610,1726048295000
dbfs:/FileStore/tables/multiline_zipcode-2.json,multiline_zipcode-2.json,239,1725625726000
dbfs:/FileStore/tables/simple_file1.csv/,simple_file1.csv/,0,0
dbfs:/FileStore/tables/simple_file2.csv,simple_file2.csv,139,1725879650000
dbfs:/FileStore/tables/simple_zipcodes-2.csv,simple_zipcodes-2.csv,590,1725626333000


In [0]:
%fs
rm dbfs:/FileStore/tables/simple_zipcodes-1.csv

In [0]:
# read the line delimiter json
line_delimiter_df = spark.read.format('json')\
                              .option('header', 'true')\
                              .option('mode', 'PERMISSIVE')\
                              .load('/FileStore/tables/line_delimited_json.json').show()

# read the line delimeter data with extra filed
line_delimiter_json_with_extra_field_df = spark.read.format('json')\
                              .option('header', 'true')\
                              .option('mode', 'PERMISSIVE')\
                              .load('/FileStore/tables/single_file_json_with_extra_fields.json').show()

# read the line delimeter corrupted data
corrupted_json_df = spark.read.format('json')\
                              .option('header', 'true')\
                              .option('mode', 'PERMISSIVE')\
                              .load('/FileStore/tables/corrupted_json.json').show()

# read the multiline correct data
multiline_correct_json_df = spark.read.format('json')\
                              .option('header', 'true')\
                              .option('multiline', 'true')\
                              .option('mode', 'PERMISSIVE')\
                              .load('/FileStore/tables/Multi_line_correct.json').show()

# read the multiline incorrect data
multiline_incorrect_json_df = spark.read.format('json')\
                              .option('header', 'true')\
                              .option('multiline', 'true')\
                              .option('mode', 'PERMISSIVE')\
                              .load('/FileStore/tables/Multi_line_incorrect.json').show()


# read the multiline nested file data
multiline_nested_json_df = spark.read.format('json')\
                              .option('header', 'true')\
                              .option('multiline', 'true')\
                              .option('mode', 'PERMISSIVE')\
                              .load('/FileStore/tables/multiline_nested_json.json')
multiline_nested_json_df.show(truncate=False)
multiline_nested_json_df.printSchema()


+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+

+---+------+--------+------+
|age|gender|    name|salary|
+---+------+--------+------+
| 20|  null|  Manish| 20000|
| 25|  null|  Nikita| 21000|
| 16|  null|  Pritam| 22000|
| 35|  null|Prantosh| 25000|
| 67|     M|  Vikash| 40000|
+---+------+--------+------+

+--------------------+----+--------+------+
|     _corrupt_record| age|    name|salary|
+--------------------+----+--------+------+
|                null|  20|  Manish| 20000|
|                null|  25|  Nikita| 21000|
|                null|  16|  Pritam| 22000|
|                null|  35|Prantosh| 25000|
|{"name":"Vikash",...|null|    null|  null|
+--------------------+----+--------+------+

+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prant