In [1]:
run ./00_Load_Demo_Data.ipynb

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)

root
 |-- department_id: string (nullable = true)
 |-- department_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- budget: string (nullable = true)



In [9]:
'''
    Reading a CSV file with spark.read:
      - Spark runs a job that reads the first row of the file to identify the number of columns.
      - option("header", "True"): the first row is loaded as the column names; every column is typed as string.
      - option("inferSchema", True): triggers two jobs - the first reads the first row and uses it as the headers,
        the second scans the data to identify the data type of each column.
'''

df = (
    spark.read
    .format("csv")
    .option("header", "True")
    .option("inferSchema", True)
    .load("/home/jovyan/data/emp.csv")
)

In [10]:
# Print the column names and data types of the DataFrame read with inferSchema.
df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: timestamp (nullable = true)



In [13]:
# Declare the column types up front with a DDL-style schema string instead of
# asking Spark to infer them from the data.
schema = (
    "employee_id integer, department_id integer, name string, "
    "age integer, gender string, salary integer, hire_date timestamp"
)

# Read the same CSV file, applying the explicit schema; the first row is still
# treated as the header.
df_schema = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", True)
    .load("/home/jovyan/data/emp.csv")
)


In [14]:
# Display rows of the DataFrame that was read with the explicit schema.
df_schema.show()

+-----------+-------------+-------------+---+------+------+-------------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|
+-----------+-------------+-------------+---+------+------+-------------------+
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01 00:00:00|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|
|          5|          103|    Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00:00:00|
|          7|          101|James Johnson| 42|  Male| 70000|2012-03-15 00:00:00|
|          8|          102|     Kate Kim| 29|Female| 51000|2019-10-01 00:00:00|
|          9|          103|      Tom Tan| 33|  Male| 58000|2016-06-01 00:00:00|
|         10|          104|     Lisa Lee

In [20]:
'''
    Read mode (used together with an explicit schema): what happens to a row that does not match the schema.
      - PERMISSIVE (default): the row is kept and its raw text is placed in the corrupt-record column.
        That column (_corrupt_record, or the name given through columnNameOfCorruptRecord) must exist
        in the DataFrame schema; the read then fills it.
      - DROPMALFORMED: the row is dropped.
      - FAILFAST: the read fails when it finds a row that does not match the schema.
'''

# Same columns as before plus a trailing string column that receives the raw
# text of rows that do not match the schema.
schema_modes = (
    "employee_id integer, department_id integer, name string, age integer, "
    "gender string, salary integer, hire_date timestamp, _corrupt_record string"
)

# No mode option is set here, so the default read mode applies.
df_schema_modes = (
    spark.read
    .format("csv")
    .schema(schema_modes)
    .option("header", True)
    .load("/home/jovyan/data/emp_new.csv")
)

In [21]:
# Display rows read with the schema that includes the _corrupt_record column;
# a row that does not match the schema has its raw text shown in that column.
df_schema_modes.show()

+-----------+-------------+-------------+---+------+------+-------------------+--------------------+
|employee_id|department_id|         name|age|gender|salary|          hire_date|     _corrupt_record|
+-----------+-------------+-------------+---+------+------+-------------------+--------------------+
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01 00:00:00|                null|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15 00:00:00|                null|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01 00:00:00|                null|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30 00:00:00|                null|
|          5|          103|    Jack Chan| 40|  Male| 60000|2013-04-01 00:00:00|                null|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01 00:00:00|                null|
|          7|          101|James Johnson| 42|  Male|  null|2012-03-15 00:00:00|007,101,Jame

In [24]:
'''
    Pass the reader options as a dictionary.
'''

_options = {
    "header": "true",
    "inferSchema": "true",
    # Fixed typo: this was "FALFAST". Spark does not recognise that mode name
    # (it falls back to the default PERMISSIVE mode with a warning), so the
    # read never ran in fail-fast mode.
    "mode": "FAILFAST",
}

# NOTE(review): with inferSchema the column types are derived from the data, so
# a value such as 'Low' in salary may simply make that column a string rather
# than trigger a FAILFAST error -- confirm on re-run. Supply an explicit schema
# to see FAILFAST reject mismatching rows.
spark.read.format("csv").options(**_options).load("/home/jovyan/data/emp_new.csv").show()

+-----------+-------------+-------------+---+------+------+----------+
|employee_id|department_id|         name|age|gender|salary| hire_date|
+-----------+-------------+-------------+---+------+------+----------+
|          1|          101|     John Doe| 30|  Male| 50000|2015-01-01|
|          2|          101|   Jane Smith| 25|Female| 45000|2016-02-15|
|          3|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|
|          4|          102|    Alice Lee| 28|Female| 48000|2017-09-30|
|          5|          103|    Jack Chan| 40|  Male| 60000|2013-04-01|
|          6|          103|    Jill Wong| 32|Female| 52000|2018-07-01|
|          7|          101|James Johnson| 42|  Male|   Low|2012-03-15|
|          8|          102|     Kate Kim| 29|Female| 51000|2019-10-01|
|          9|          103|      Tom Tan| 33|  Male| 58000|2016-06-01|
|         10|          104|     Lisa Lee| 27|Female| 47000|2018-08-01|
|         11|          104|   David Park| 38|  Male| 65000|   no date|
|     