## StructType & StructField

In [0]:
# Restart the Python process to remove existing Python state; some libraries
# might not work without calling this command.
dbutils.library.restartPython()

#### Load libraries

In [0]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import IntegerType, DateType, StringType, StructType, StructField

#### Create Spark session

In [0]:
# Build the session used by every cell below (getOrCreate() reuses an
# already-active session instead of starting a second one).
spark = (
  SparkSession.builder
  .appName('StructType & StructField')
  .getOrCreate()
)

#### Defining Schema with StructType

In [0]:
# Sample rows, one tuple per person:
# (firstname, middlename, lastname, id, gender, salary)
data = [
  ('John', '', 'Smith', '36636', 'M', 2500),
  ('Jane', '', 'Doe', '42114', 'F', 500),
  ('Richard', 'Laurence', 'Marquette', '97086', 'M', 1500),
  ('Israel', '', 'Israeli', '', 'M', 3000),
  ('Edward', 'III', '', 'SL4', 'M', 5000)
]

# Every column is a nullable string except the trailing nullable integer salary.
stringColumns = ['firstname', 'middlename', 'lastname', 'id', 'gender']
schema = StructType(
  [StructField(columnName, StringType(), True) for columnName in stringColumns]
  + [StructField('salary', IntegerType(), True)]
)

# Apply the explicit schema instead of letting Spark infer one from the tuples.
df = spark.createDataFrame(data, schema)
df.printSchema()
df.show(truncate=False)

#### Defining a nested StructType schema

In [0]:
# Sample rows where the three name parts are grouped into an inner tuple:
# ((firstname, middlename, lastname), zip, gender, salary)
dataNested = [
  (('John', '', 'Smith'), '36636', 'M', 2500),
  (('Jane', '', 'Doe'), '42114', 'F', 500),
  (('Richard', 'Laurence', 'Marquette'), '97086', 'M', 1500),
  (('Israel', '', 'Israeli'), '', 'M', 3000),
  (('Edward', 'III', ''), 'SL4', 'M', 5000)
]

# Inner struct that describes the fields of the 'name' column.
nameStruct = StructType([
  StructField('firstname', StringType(), True),
  StructField('middlename', StringType(), True),
  StructField('lastname', StringType(), True)
])

# Outer schema: a StructField whose data type is itself a StructType.
schemaNested = StructType([
  StructField('name', nameStruct),  # nullable is left at its default
  StructField('zip', StringType(), True),
  StructField('gender', StringType(), True),
  StructField('salary', IntegerType(), True)
])

dfNested = spark.createDataFrame(dataNested, schemaNested)
dfNested.printSchema()
dfNested.show(truncate=False)

#### Adding to & changing the struct of a DataFrame

In [0]:
from pyspark.sql.functions import col, struct, when

# Regroup the flat zip/gender/salary columns of dfNested into a single
# 'OtherInfo' struct column and derive a salary grade alongside them.

# 'salary' is already IntegerType in schemaNested, so it is compared directly
# without an extra cast.
salaryCol = col('salary')

salaryGrade = (
  # A null salary makes every `<` comparison evaluate to null, which would
  # otherwise fall through to otherwise('High'). Keep the grade null instead
  # (this makes the salary_grade field nullable).
  when(salaryCol.isNull(), None)
  .when(salaryCol < 2000, 'Low')
  .when(salaryCol < 4000, 'Medium')
  .otherwise('High')
  .alias('salary_grade')
)

updatedDF = dfNested.withColumn(
  'OtherInfo',
  struct(
    # Copy the columns from one structure to another ...
    col('zip').alias('identifier'),
    col('gender').alias('gender'),
    salaryCol.alias('salary'),
    # ... and add a new derived column
    salaryGrade
  )
).drop('zip', 'gender', 'salary')  # drop the old top-level columns

updatedDF.printSchema()
updatedDF.show(truncate=False)

#### Creating a StructType schema from JSON

In [0]:
# JSON text describing a struct schema: a nested `name` struct (firstname,
# middlename, lastname) followed by zip, gender and salary. Every field is
# declared nullable with empty metadata; salary is an integer, the rest strings.
schemaJSON = '''
{
  "type" : "struct",
  "fields" : [ {
    "name" : "name",
    "type" : {
      "type" : "struct",
      "fields" : [ {
        "name" : "firstname",
        "type" : "string",
        "nullable" : true,
        "metadata" : { }
      }, {
        "name" : "middlename",
        "type" : "string",
        "nullable" : true,
        "metadata" : { }
      }, {
        "name" : "lastname",
        "type" : "string",
        "nullable" : true,
        "metadata" : { }
      } ]
    },
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "zip",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "gender",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "salary",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  } ]
}'''

In [0]:
import json

# Parse the JSON text into a dict, then rebuild a StructType from that dict.
schemaDict = json.loads(schemaJSON)
schemaFromJson = StructType.fromJson(schemaDict)

# Reuse the nested sample rows with the JSON-derived schema.
df3 = spark.createDataFrame(data=dataNested, schema=schemaFromJson)
df3.printSchema()
df3.show(truncate=False)

In [0]:
# A DataFrame's schema can also be printed as JSON text
print(dfNested.schema.json())

#### Checking if a Column Exists in a DataFrame

In [0]:
# Check by column name only.
print('firstname' in df.schema.fieldNames())

# Check for a field equal to this full StructField definition.
firstnameField = StructField('firstname', StringType(), True)
print(firstnameField in df.schema)

#### The end of the notebook