In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for this notebook, pointed at the
# standalone master. Resource settings: 2 cores and 2g of memory per executor,
# with at most 4 cores in total for the application.
# NOTE(review): the app name is "ManualSchema" although the cells below read
# JSON with inferred schemas - confirm the name is intentional.
spark = (
    SparkSession.builder
    .appName("ManualSchema")
    .master("spark://spark-master:7077")
    .config("spark.executor.cores", "2")
    .config("spark.cores.max", "4")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/01 04:16:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Reading JSON - Line Delimited
#
# The JSON source infers the schema by itself whenever no explicit schema is
# supplied. 'inferSchema' is a CSV reader option that the JSON reader does not
# recognise, so it is deliberately not set here.
json_ld_df = (
    spark.read.format('json')
    .option('mode', 'PERMISSIVE')  # keep malformed records instead of failing the read
    .load('/opt/spark-data/input/line_delimited_json.json')
)

# Preview the first 5 rows.
json_ld_df.show(5)

                                                                                

+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+



                                                                                

In [3]:
# Reading JSON - Line Delimited + Non Similar Structure
#
# The records do not all carry the same fields. The inferred schema contains
# every field seen (here: an extra 'gender' column), and records that lack a
# field show NULL for it.
# No 'inferSchema' option: it is a CSV reader option that the JSON reader does
# not recognise; JSON schemas are always inferred when none is supplied.
json_ld_nss_df = (
    spark.read.format('json')
    .option('mode', 'PERMISSIVE')  # keep malformed records instead of failing the read
    .load('/opt/spark-data/input/line_delimited_irregular_fields_json.json')
)

# Preview the first 5 rows.
json_ld_nss_df.show(5)

+---+------+--------+------+
|age|gender|    name|salary|
+---+------+--------+------+
| 20|  NULL|  Manish| 20000|
| 25|  NULL|  Nikita| 21000|
| 16|  NULL|  Pritam| 22000|
| 35|  NULL|Prantosh| 25000|
| 67|     M|  Vikash| 40000|
+---+------+--------+------+



In [5]:
# Reading JSON - Multiline - correct data
#
# 'multiline' = true lets a single JSON record span several lines instead of
# requiring one record per line.
# No 'inferSchema' option: it is a CSV reader option that the JSON reader does
# not recognise; JSON schemas are always inferred when none is supplied.
json_ml_c_df = (
    spark.read.format('json')
    .option('mode', 'PERMISSIVE')  # keep malformed records instead of failing the read
    .option('multiline', 'true')
    .load('/opt/spark-data/input/multiline_correct_json.json')
)

# Preview the first 5 rows.
json_ml_c_df.show(5)

+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+



In [8]:
# Reading JSON - Multiline - incorrect data
#
# No 'inferSchema' option: it is a CSV reader option that the JSON reader does
# not recognise; JSON schemas are always inferred when none is supplied.
json_ml_ic_df = (
    spark.read.format('json')
    .option('mode', 'PERMISSIVE')  # keep malformed records instead of failing the read
    .option('multiline', 'true')
    .load('/opt/spark-data/input/multiline_incorrect_json.json')
)

# Only 1 row comes back and no error is raised!
# NOTE(review): presumably the file holds several top-level objects that are
# not wrapped in a JSON array, so in multiline mode only the first document is
# parsed and the rest is silently dropped - verify against the input file.
json_ml_ic_df.show(5)

+---+------+------+
|age|  name|salary|
+---+------+------+
| 20|Manish| 20000|
+---+------+------+



In [9]:
# Reading JSON - Corrupted
#
# In PERMISSIVE mode a line that cannot be parsed is not dropped: its raw text
# lands in the '_corrupt_record' column and the regular columns are NULL for
# that row.
# No 'inferSchema' option: it is a CSV reader option that the JSON reader does
# not recognise; JSON schemas are always inferred when none is supplied.
json_corrupt_df = (
    spark.read.format('json')
    .option('mode', 'PERMISSIVE')
    .load('/opt/spark-data/input/corrupted_json.json')
)

# truncate=False so the full text of the corrupt record stays visible.
json_corrupt_df.show(5, truncate=False)

+----------------------------------------+----+--------+------+
|_corrupt_record                         |age |name    |salary|
+----------------------------------------+----+--------+------+
|NULL                                    |20  |Manish  |20000 |
|NULL                                    |25  |Nikita  |21000 |
|NULL                                    |16  |Pritam  |22000 |
|NULL                                    |35  |Prantosh|25000 |
|{"name":"Vikash","age":67,"salary":40000|NULL|NULL    |NULL  |
+----------------------------------------+----+--------+------+



In [10]:
# Reading JSON - Different dtypes
#
# No 'inferSchema' option: it is a CSV reader option that the JSON reader does
# not recognise; JSON schemas are always inferred when none is supplied.
json_dtype_var_df = (
    spark.read.format('json')
    .option('mode', 'PERMISSIVE')  # keep malformed records instead of failing the read
    .load('/opt/spark-data/input/line_delimited_dtype_variation_json.json')
)

# The rows look the same as in the other reads, but show() prints every value
# as text, so this output cannot reveal which types were inferred.
# NOTE(review): a field whose type varies between records is presumably widened
# to string by the inference - confirm with json_dtype_var_df.printSchema().
json_dtype_var_df.show(5, truncate=False)

+---+--------+------+
|age|name    |salary|
+---+--------+------+
|20 |Manish  |20000 |
|25 |Nikita  |21000 |
|16 |Pritam  |22000 |
|35 |Prantosh|25000 |
|67 |Vikash  |40000 |
+---+--------+------+



In [12]:
# describe() builds a summary-statistics DataFrame. The repr printed for it
# lists every column as string; that is the type of the summary output, not
# necessarily the type inferred for the source columns.
# To check the inferred schema use json_dtype_var_df.printSchema() or .dtypes.
json_dtype_var_df.describe()

DataFrame[summary: string, age: string, name: string, salary: string]

In [13]:
# describe() builds a summary-statistics DataFrame. The repr printed for it
# lists every column as string; that is the type of the summary output, not
# necessarily the type inferred for the source columns.
# To check the inferred schema use json_ld_nss_df.printSchema() or .dtypes.
json_ld_nss_df.describe()

DataFrame[summary: string, age: string, gender: string, name: string, salary: string]