- How to manually create an **empty PySpark DataFrame/RDD**, **with or without a schema** (column names), in several different ways

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType
import pyspark.sql.functions as F

##### 1) Create Empty DataFrame with / without Schema

In [0]:
# An empty StructType describes a schema that has no columns at all
schema = StructType()

# Build a DataFrame from zero rows and that column-less schema
df_empty = spark.createDataFrame(data=[], schema=schema)

# Show the (empty) DataFrame, then print its schema tree
display(df_empty)
df_empty.printSchema()

root



**Method 01:** Add fields using .add() method
     
     # Create an empty StructType
     schema = StructType()

     # Add fields to the schema
     schemaList = schema.add("name", StringType(), True)
     schemaList = schema.add("age", IntegerType(), True)

     print(schema)

**Method 02:** Add multiple fields at once by chaining .add()

     schema = StructType().add("id", IntegerType()).add("email", StringType())
     print(schema)

**Method 03:** Create StructFields separately and build StructType

     fields = [
         StructField("username", StringType(), True),
         StructField("score", IntegerType(), False)
     ]

     schema = StructType(fields)
     print(schema)


In [0]:
# Start from an empty StructType and append the two nullable columns.
# add() extends the schema it is called on and hands it back, so calls can be chained.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
)

# Alias kept for the cells that follow; it refers to the same object as `schema`
schemaList = schema

print(schema)

StructType([StructField('name', StringType(), True), StructField('age', IntegerType(), True)])


**Output:**

     StructType([
         StructField('name', StringType(), True),
         StructField('age', IntegerType(), True)
     ])


In [0]:
# No rows, but the columns come from the schema built in the previous cell
df_empty_data = spark.createDataFrame(data=[], schema=schemaList)

# Show the empty result, then print the schema tree
display(df_empty_data)
df_empty_data.printSchema()

name,age


root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



##### 2) Add fields one by one using .add()

In [0]:
# Build the two-column schema by chaining add() on an empty StructType
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
)
# Same object as `schema`, under the name used when creating the DataFrame
schemaList = schema

# Sample rows as (name, age) tuples, in schema column order
data = [
    ("Harish", 30),
    ("Ramesh", 25),
    ("Swapna", 29),
    ("Swetha", 35),
    ("Anand", 26),
]

# Apply the explicit schema instead of letting Spark infer one
df_add = spark.createDataFrame(data=data, schema=schemaList)
display(df_add)

name,age
Harish,30
Ramesh,25
Swapna,29
Swetha,35
Anand,26


##### 3) Add multiple fields using a list of StructFields

In [0]:
# Describe each column as its own StructField, then wrap them in a StructType
fields = [
    StructField(name="name", dataType=StringType(), nullable=True),
    StructField(name="age", dataType=IntegerType(), nullable=True),
]

schemaList = StructType(fields)

# Sample rows as (name, age) tuples, in schema column order
data = [
    ("Harish", 30),
    ("Ramesh", 25),
    ("Swapna", 29),
    ("Swetha", 35),
    ("Anand", 26),
]

df_add_mltpl = spark.createDataFrame(data=data, schema=schemaList)

# Show the rows, then print the schema tree
display(df_add_mltpl)
df_add_mltpl.printSchema()

name,age
Harish,30
Ramesh,25
Swapna,29
Swetha,35
Anand,26


root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



##### 4) Add nested structures

In [0]:
# Schema of the nested "address" struct; all three fields are nullable
address_schema = StructType([
    StructField(name="street", dataType=StringType(), nullable=True),
    StructField(name="city", dataType=StringType(), nullable=True),
    StructField(name="pincode", dataType=IntegerType(), nullable=True),
])

# Top-level columns: two scalars, an array of strings and two maps
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("languagesAtSchool", ArrayType(StringType()))
    .add("languagesAtWork", MapType(StringType(), StringType(), True))
    .add("Properties", MapType(StringType(), IntegerType(), True))
)
# Same object as `schema`, under the name used in the next step
schemaList = schema

# Attach the nested struct as the last column. add() extends the schema it is
# called on, so `schema` and `schemaList` also include "address" afterwards.
schemaList_nested = schemaList.add("address", address_schema, True)

# Each row: (name, age, languagesAtSchool, languagesAtWork, Properties, address);
# the address struct is supplied as a (street, city, pincode) tuple
data = [
    (
        "Harish", 30,
        ["Spark", "Java", "C++"],
        {"Domain": "Gas", "Branch": "IT", "Designation": "DE"},
        {"Age": 25, "emp_id": 768954, "Exp": 5},
        ("27th Main", "Bangalore", 5132109),
    ),
    (
        "Ramesh", 25,
        ["Java", "Scala", "C++"],
        {"Domain": "DS", "Branch": "CSC", "Designation": "DE"},
        {"Age": 30, "emp_id": 768956, "Exp": 2},
        ("3rd cross", "Hyderabad", 5674321),
    ),
    (
        "Swapna", 29,
        ["Devops", "VB"],
        {"Domain": "Trade", "Branch": "EEE", "Designation": "DE"},
        {"Age": 28, "emp_id": 798954, "Exp": 8},
        ("4th cross", "Chennai", 49087654),
    ),
    (
        "Swetha", 35,
        ["CSharp", "VB", "Python"],
        {"Domain": "Sales", "Branch": "AI", "Designation": "DE"},
        {"Age": 35, "emp_id": 788956, "Exp": 6},
        ("4th Avenue", "Delhi", 4532167),
    ),
    (
        "Anand", 26,
        ["PySpark", "SQL"],
        {"Domain": "TELE", "Branch": "ECE", "Designation": "DE"},
        {"Age": 21, "emp_id": 769954, "Exp": 9},
        ("5th Avenue", "Mumbai", 5760981),
    ),
]

df_nested = spark.createDataFrame(data=data, schema=schemaList_nested)
display(df_nested)

name,age,languagesAtSchool,languagesAtWork,Properties,address
Harish,30,"List(Spark, Java, C++)","Map(Designation -> DE, Domain -> Gas, Branch -> IT)","Map(Exp -> 5, Age -> 25, emp_id -> 768954)","List(27th Main, Bangalore, 5132109)"
Ramesh,25,"List(Java, Scala, C++)","Map(Designation -> DE, Domain -> DS, Branch -> CSC)","Map(Exp -> 2, Age -> 30, emp_id -> 768956)","List(3rd cross, Hyderabad, 5674321)"
Swapna,29,"List(Devops, VB)","Map(Designation -> DE, Domain -> Trade, Branch -> EEE)","Map(Exp -> 8, Age -> 28, emp_id -> 798954)","List(4th cross, Chennai, 49087654)"
Swetha,35,"List(CSharp, VB, Python)","Map(Designation -> DE, Domain -> Sales, Branch -> AI)","Map(Exp -> 6, Age -> 35, emp_id -> 788956)","List(4th Avenue, Delhi, 4532167)"
Anand,26,"List(PySpark, SQL)","Map(Designation -> DE, Domain -> TELE, Branch -> ECE)","Map(Exp -> 9, Age -> 21, emp_id -> 769954)","List(5th Avenue, Mumbai, 5760981)"
