In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pyspark.sql.functions as F

**1) Create Empty DataFrame without Schema (no columns)**

In [0]:
# Create empty DatFrame with no schema (no columns)
df3 = spark.createDataFrame([], StructType([]))
display(df3)
df3.printSchema()

root



**2) Create Empty DataFrame with Schema**

In [0]:
# Create Schema
schema = StructType([
  StructField('FirstName', StringType(), True),
  StructField('Age', IntegerType(), True),
  StructField('Experience', IntegerType(), True),
  StructField('Label_Type', StringType(), True),
  StructField('Last_transaction_date', StringType(), True),
  StructField('last_timestamp', StringType(), True),
  StructField('Sensex_Category', StringType(), True)
  ])

In [0]:
# Create empty DataFrame directly.
emp_df = spark.createDataFrame([], schema)
display(emp_df)

FirstName,Age,Experience,Label_Type,Last_transaction_date,last_timestamp,Sensex_Category


**3) Creating empty DataFrame with NULL placeholders**

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
# Define schema
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Sales", IntegerType(), True),
])

# Just placeholders with all nulls
empty_data = [(None, None, None, None),
              (None, None, None, None),
              (None, None, None, None),
              (None, None, None, None),
              (None, None, None, None)]
              
df_null_placeholder = spark.createDataFrame(empty_data, schema)
display(df_null_placeholder)

id,Name,Age,Sales
,,,
,,,
,,,
,,,
,,,


In [0]:
# Define schema
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Sales", IntegerType(), True),
])

# Just placeholders with all nulls
empty_data = [(1, None, None, None),
              (2, "Neol", 25, 150),
              (3, None, None, None),
              (4, "Niroop", 35, 350),
              (5, None, None, None)]
              
df_null_placeholder = spark.createDataFrame(empty_data, schema)
display(df_null_placeholder)

id,Name,Age,Sales
1,,,
2,Neol,25.0,150.0
3,,,
4,Niroop,35.0,350.0
5,,,


##### 3) How to union existing dataframe to empty dataframe

**⚠️ Things to Avoid**

- **Different schemas:** Will throw an error.
- **Mismatched column names or types:** Columns must match exactly in name, type, and order.

In [0]:
empty_df = spark.createDataFrame([], emp_df.schema)
display(empty_df)

FirstName,Age,Experience,Label_Type,Last_transaction_date,last_timestamp,Sensex_Category


In [0]:
# Sample DataFrame to union
data = [("Kalmesh", 15, 9, "Medium", "23/11/2025", "2025-09-27T19:45:35", "Admin"),
        ("Rohan", 18, 5, "Small", "20/12/2024", "2023-11-29T19:55:35", "Sales"),
        ("Kiran", 19, 3, "Average", "15/06/2023", "2024-09-27T19:35:35", "Marketing"),
        ("Asha", 29, 8, "Short", "13/03/2021", "2021-09-27T19:25:35", "IT"),
        ("Amir", 32, 6, "Long", "17/09/2020", "2018-05-21T15:49:39", "Maintenance"),
        ("Rupesh", 36, 11, "Less", "19/02/2022", "2016-06-27T19:45:35", "Logistics"),
        ("Krupa", 50, 7, "Medium", "12/06/2018", "2019-08-17T22:25:45", "Supplychain"),
        ("Vishnu", 55, 8, "Short", "19/08/2019", "2014-09-27T23:55:45", "Transport"),
        ("Radha", 58, 9, "Long", "26/04/2016", "2015-05-13T15:35:25", "Safety"),
        ]
        
df = spark.createDataFrame(data, schema)
display(df)

FirstName,Age,Experience,Label_Type,Last_transaction_date,last_timestamp,Sensex_Category
Kalmesh,15,9,Medium,23/11/2025,2025-09-27T19:45:35,Admin
Rohan,18,5,Small,20/12/2024,2023-11-29T19:55:35,Sales
Kiran,19,3,Average,15/06/2023,2024-09-27T19:35:35,Marketing
Asha,29,8,Short,13/03/2021,2021-09-27T19:25:35,IT
Amir,32,6,Long,17/09/2020,2018-05-21T15:49:39,Maintenance
Rupesh,36,11,Less,19/02/2022,2016-06-27T19:45:35,Logistics
Krupa,50,7,Medium,12/06/2018,2019-08-17T22:25:45,Supplychain
Vishnu,55,8,Short,19/08/2019,2014-09-27T23:55:45,Transport
Radha,58,9,Long,26/04/2016,2015-05-13T15:35:25,Safety


In [0]:
# Union operation
result_df = empty_df.union(df)
display(result_df)

FirstName,Age,Experience,Label_Type,Last_transaction_date,last_timestamp,Sensex_Category
Kalmesh,15,9,Medium,23/11/2025,2025-09-27T19:45:35,Admin
Rohan,18,5,Small,20/12/2024,2023-11-29T19:55:35,Sales
Kiran,19,3,Average,15/06/2023,2024-09-27T19:35:35,Marketing
Asha,29,8,Short,13/03/2021,2021-09-27T19:25:35,IT
Amir,32,6,Long,17/09/2020,2018-05-21T15:49:39,Maintenance
Rupesh,36,11,Less,19/02/2022,2016-06-27T19:45:35,Logistics
Krupa,50,7,Medium,12/06/2018,2019-08-17T22:25:45,Supplychain
Vishnu,55,8,Short,19/08/2019,2014-09-27T23:55:45,Transport
Radha,58,9,Long,26/04/2016,2015-05-13T15:35:25,Safety


**⚠️ Common Mistake**
- You must always pass a **schema** if the DataFrame is **empty**.

In [0]:
# This will raise an error because schema is not provided or mismatched
empty_df = spark.createDataFrame([])

# Error on this line:
result = empty_df.union(df)

[0;31m---------------------------------------------------------------------------[0m
[0;31mValueError[0m                                Traceback (most recent call last)
File [0;32m<command-3178736082817283>, line 2[0m
[1;32m      1[0m [38;5;66;03m# This will raise an error because schema is not provided or mismatched[39;00m
[0;32m----> 2[0m empty_df [38;5;241m=[39m spark[38;5;241m.[39mcreateDataFrame([])
[1;32m      4[0m [38;5;66;03m# Error on this line:[39;00m
[1;32m      5[0m result [38;5;241m=[39m empty_df[38;5;241m.[39munion(df)

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:47[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     45[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     46[0m [38;5;28;01mtry[39;00m:
[0;32m---> 47[0m     res [38;5;241m=[39m func([38;5;241m*[39margs, [38;5;241m*[39m[38;5;241m*[39mkwargs)
[1;32m     48[0m     logger[38;5;241m.[39m

**4) Using loop with empty DataFrame for accumulation**

In [0]:
# Create an empty DataFrame with the same schema
df_loop = spark.createDataFrame([], df.schema)

for i in range(3):
    # Filter the original DataFrame based on Age
    filtered_df = df.filter(df.Age > i * 20)

    # Accumulate the results into df_loop
    df_loop = df_loop.union(filtered_df)

# Display the final accumulated DataFrame
display(df_loop)

FirstName,Age,Experience,Label_Type,Last_transaction_date,last_timestamp,Sensex_Category
Kalmesh,15,9,Medium,23/11/2025,2025-09-27T19:45:35,Admin
Rohan,18,5,Small,20/12/2024,2023-11-29T19:55:35,Sales
Kiran,19,3,Average,15/06/2023,2024-09-27T19:35:35,Marketing
Asha,29,8,Short,13/03/2021,2021-09-27T19:25:35,IT
Amir,32,6,Long,17/09/2020,2018-05-21T15:49:39,Maintenance
Rupesh,36,11,Less,19/02/2022,2016-06-27T19:45:35,Logistics
Krupa,50,7,Medium,12/06/2018,2019-08-17T22:25:45,Supplychain
Vishnu,55,8,Short,19/08/2019,2014-09-27T23:55:45,Transport
Radha,58,9,Long,26/04/2016,2015-05-13T15:35:25,Safety
Asha,29,8,Short,13/03/2021,2021-09-27T19:25:35,IT


**5) Chaining multiple unions with empty DataFrame as starting point**

In [0]:
# Assume df1, df2, df3 all have the same schema
df1 = spark.createDataFrame([(1, "Naresh", 28, "Medium"),
                             (2, "Mohan", 25, "Low"),
                             (3, "Hitesh", 29, "High"),
                             (4, "Vedita", 32, "Less"),
                             (5, "Sushmita", 35, "Higher")],
                             ["id", "Name", "Age", "Type"])

df2 = spark.createDataFrame([(1, "Neha", 21, "Medium"),
                             (2, "Mohin", 26, "Low"),
                             (3, "Hritik", 27, "High"),
                             (4, "Vasu", 35, "Less"),
                             (5, "Susi", 37, "Higher")],
                             ["id", "Name", "Age", "Type"])
df3 = spark.createDataFrame([(1, "Druv", 31, "Medium"),
                             (2, "Eashwar", 33, "Low"),
                             (3, "Guru", 29, "High"),
                             (4, "Vishwak", 39, "Less"),
                             (5, "Sophia", 41, "Higher")],
                             ["id", "Name", "Age", "Type"])

# Start with an empty DataFrame
final_df = spark.createDataFrame([], df1.schema)
display(final_df)

id,Name,Age,Type


In [0]:
# Union multiple DataFrames
final_df = final_df.union(df1).union(df2).union(df3)
display(final_df)

id,Name,Age,Type
1,Naresh,28,Medium
2,Mohan,25,Low
3,Hitesh,29,High
4,Vedita,32,Less
5,Sushmita,35,Higher
1,Neha,21,Medium
2,Mohin,26,Low
3,Hritik,27,High
4,Vasu,35,Less
5,Susi,37,Higher
