## union multiple DataFrames together in PySpark

- union()
- unionByName()

In [0]:
from pyspark.sql.types import StructType , StructField , StringType , IntegerType

# Define schema for df_1 & df_2 (Customers)
schema_customers = StructType([
    StructField("id", IntegerType(), True),
    StructField("customer_name", StringType(), True),
    StructField("city", StringType(), True),
    StructField("country", StringType(), True)
])

# Data for df_1: customer master data
data_customers = [
    (201, "Amit Verma", "Delhi","India"),
    (202, "Neha Sharma", "Mumbai","India"),
    (203, "Rahul Mehta", "Bangalore","India"),
    (204, "Pooja Singh", "Hyderabad","India"),
    (205, "Karan Malhotra", "Chennai","India")
]

# Create Customers DataFrame
df_1 = spark.createDataFrame(data_customers, schema=schema_customers)
df_1.show()


# Data for df_2: new customer master data
data_customers_new = [
    (206, "Sanjay Patel", "Ahmedabad", "India"),
    (207, "Meena Iyer", "Pune", "India"),
    (208, "Vikram Desai", "Kolkata", "India"),
    (209, "Anita Nair", "Kochi", "India"),
    (210, "Rohit Kapoor", "Jaipur", "India")
]

# Create Customers DataFrame with new records
df_2 = spark.createDataFrame(data_customers_new, schema=schema_customers)
df_2.show()



# Define schema for df_3 (Customers)
schema_customers_df_3 = StructType([
    StructField("id", IntegerType(), True),
    StructField("customer_name", StringType(), True),
    StructField("city", StringType(), True)
])

# Data for df_3: new customer master data
data_customers_new = [
    (206, "Sanjay Patel", "Ahmedabad"),
    (207, "Meena Iyer", "Pune"),
    (208, "Vikram Desai", "Kolkata"),
    (209, "Anita Nair", "Kochi"),
    (210, "Rohit Kapoor", "Jaipur")
]

# Create Customers DataFrame with new records
df_3 = spark.createDataFrame(data_customers_new, schema=schema_customers_df_3)
df_3.show()


+---+--------------+---------+-------+
| id| customer_name|     city|country|
+---+--------------+---------+-------+
|201|    Amit Verma|    Delhi|  India|
|202|   Neha Sharma|   Mumbai|  India|
|203|   Rahul Mehta|Bangalore|  India|
|204|   Pooja Singh|Hyderabad|  India|
|205|Karan Malhotra|  Chennai|  India|
+---+--------------+---------+-------+

+---+-------------+---------+-------+
| id|customer_name|     city|country|
+---+-------------+---------+-------+
|206| Sanjay Patel|Ahmedabad|  India|
|207|   Meena Iyer|     Pune|  India|
|208| Vikram Desai|  Kolkata|  India|
|209|   Anita Nair|    Kochi|  India|
|210| Rohit Kapoor|   Jaipur|  India|
+---+-------------+---------+-------+

+---+-------------+---------+
| id|customer_name|     city|
+---+-------------+---------+
|206| Sanjay Patel|Ahmedabad|
|207|   Meena Iyer|     Pune|
|208| Vikram Desai|  Kolkata|
|209|   Anita Nair|    Kochi|
|210| Rohit Kapoor|   Jaipur|
+---+-------------+---------+



## union ()

- union() is a transformation
- It appends rows
- It works positionally, NOT by column name
- Number of columns must match

In [0]:
df_1.union(df_2).display()

id,customer_name,city,country
201,Amit Verma,Delhi,India
202,Neha Sharma,Mumbai,India
203,Rahul Mehta,Bangalore,India
204,Pooja Singh,Hyderabad,India
205,Karan Malhotra,Chennai,India
206,Sanjay Patel,Ahmedabad,India
207,Meena Iyer,Pune,India
208,Vikram Desai,Kolkata,India
209,Anita Nair,Kochi,India
210,Rohit Kapoor,Jaipur,India


###  Union df_1 and df_3 (Failure Case)

In [0]:
df_1.union(df_3).display()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-5101178971645604>, line 1[0m
[0;32m----> 1[0m df_1[38;5;241m.[39munion(df_3)[38;5;241m.[39mdisplay()

File [0;32m/databricks/python_shell/lib/dbruntime/monkey_patches.py:72[0m, in [0;36mapply_dataframe_display_patch.<locals>.df_display[0;34m(df, *args, **kwargs)[0m
[1;32m     68[0m [38;5;28;01mdef[39;00m [38;5;21mdf_display[39m(df, [38;5;241m*[39margs, [38;5;241m*[39m[38;5;241m*[39mkwargs):
[1;32m     69[0m [38;5;250m    [39m[38;5;124;03m"""[39;00m
[1;32m     70[0m [38;5;124;03m    df.display() is an alias for display(df). Run help(display) for more information.[39;00m
[1;32m     71[0m [38;5;124;03m    """[39;00m
[0;32m---> 72[0m     display(df, [38;5;241m*[39margs, [38;5;241m*[39m[38;5;241m*[39mkwargs)

File [0;32m/databricks/python_shell/lib/dbru

## Forcing union() Using Column Position

In [0]:
df_1.union(df_3.select("id","customer_name","city","city")).display()

id,customer_name,city,country
201,Amit Verma,Delhi,India
202,Neha Sharma,Mumbai,India
203,Rahul Mehta,Bangalore,India
204,Pooja Singh,Hyderabad,India
205,Karan Malhotra,Chennai,India
206,Sanjay Patel,Ahmedabad,Ahmedabad
207,Meena Iyer,Pune,Pune
208,Vikram Desai,Kolkata,Kolkata
209,Anita Nair,Kochi,Kochi
210,Rohit Kapoor,Jaipur,Jaipur


## unionByName()

In [0]:
df_1.unionByName(df_3).display()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-5101178971645590>, line 1[0m
[0;32m----> 1[0m df_1[38;5;241m.[39munionByName(df_3)[38;5;241m.[39mdisplay()

File [0;32m/databricks/python_shell/lib/dbruntime/monkey_patches.py:72[0m, in [0;36mapply_dataframe_display_patch.<locals>.df_display[0;34m(df, *args, **kwargs)[0m
[1;32m     68[0m [38;5;28;01mdef[39;00m [38;5;21mdf_display[39m(df, [38;5;241m*[39margs, [38;5;241m*[39m[38;5;241m*[39mkwargs):
[1;32m     69[0m [38;5;250m    [39m[38;5;124;03m"""[39;00m
[1;32m     70[0m [38;5;124;03m    df.display() is an alias for display(df). Run help(display) for more information.[39;00m
[1;32m     71[0m [38;5;124;03m    """[39;00m
[0;32m---> 72[0m     display(df, [38;5;241m*[39margs, [38;5;241m*[39m[38;5;241m*[39mkwargs)

File [0;32m/databricks/python_shell/li

In [0]:
df_1.unionByName(df_3 , allowMissingColumns=True).display()

id,customer_name,city,country
201,Amit Verma,Delhi,India
202,Neha Sharma,Mumbai,India
203,Rahul Mehta,Bangalore,India
204,Pooja Singh,Hyderabad,India
205,Karan Malhotra,Chennai,India
206,Sanjay Patel,Ahmedabad,
207,Meena Iyer,Pune,
208,Vikram Desai,Kolkata,
209,Anita Nair,Kochi,
210,Rohit Kapoor,Jaipur,
