# Dataframe Deep Dive (Part 3)

In [1]:
spark

In [2]:
sc

## Creating a Dataframe using JSON with nested `schema`

#### Using `DDL schema`

In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

data_set = 's3://fcc-spark-example/dataset/2023/players.json'

# Define the schema of the JSON data
schema = 'country string, player_details struct<name:string, age:integer, role:string>, player_id integer, team_id integer'

df = (spark.read
           .format('json')
           .schema(schema)
           .load(data_set)
     )

In [4]:
df.show(truncate=False)

[Stage 0:>                                                          (0 + 1) / 1]

+-------+--------------------------+---------+-------+
|country|player_details            |player_id|team_id|
+-------+--------------------------+---------+-------+
|IND    |{R Sharma, 33, Batsman}   |101      |11     |
|IND    |{S Iyer, 25, Batsman}     |102      |15     |
|NZ     |{T Boult, 30, Bowler}     |103      |13     |
|IND    |{MS Dhoni, 38, WKeeper}   |104      |14     |
|AUS    |{S Watson, 39, Allrounder}|105      |12     |
|WI     |{S Hetmyer, 23, Batsman}  |106      |16     |
+-------+--------------------------+---------+-------+



                                                                                

In [5]:
df.printSchema()

root
 |-- country: string (nullable = true)
 |-- player_details: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: integer (nullable = true)
 |    |-- role: string (nullable = true)
 |-- player_id: integer (nullable = true)
 |-- team_id: integer (nullable = true)



#### Using `StructType`

In [6]:
schema = StructType([
                    StructField('country', StringType()),
                    StructField('player_details', StructType([
                        StructField('name', StringType()),
                        StructField('age', IntegerType()),
                        StructField('role', StringType())
                    ])),
                    StructField('player_id', IntegerType()),
                    StructField('team_id', StringType())
                ])

df = spark.read.format('json').schema(schema).load(data_set)

In [7]:
df.show(truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+-------+--------------------------+---------+-------+
|country|player_details            |player_id|team_id|
+-------+--------------------------+---------+-------+
|IND    |{R Sharma, 33, Batsman}   |101      |11     |
|IND    |{S Iyer, 25, Batsman}     |102      |15     |
|NZ     |{T Boult, 30, Bowler}     |103      |13     |
|IND    |{MS Dhoni, 38, WKeeper}   |104      |14     |
|AUS    |{S Watson, 39, Allrounder}|105      |12     |
|WI     |{S Hetmyer, 23, Batsman}  |106      |16     |
+-------+--------------------------+---------+-------+



                                                                                

In [8]:
df.printSchema()

root
 |-- country: string (nullable = true)
 |-- player_details: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: integer (nullable = true)
 |    |-- role: string (nullable = true)
 |-- player_id: integer (nullable = true)
 |-- team_id: string (nullable = true)

