- Title: Union DataFrames in Spark
- Slug: spark-dataframe-union
- Date: 2019-12-20
- Category: Computer Science
- Tags: programming, Scala, Spark, DataFrame, union
- Author: Ben Du

In [None]:
%%classpath add mvn
org.apache.spark spark-core_2.11 2.3.1
org.apache.spark spark-sql_2.11 2.3.1

In [None]:
import org.apache.spark.sql.SparkSession

// Local two-thread session used by the cells that follow.
val spark = {
    val builder = SparkSession.builder().master("local[2]").appName("Spark DataFrame Union")
    builder.getOrCreate()
}
// Brings `toDF` and the `$"..."` column syntax into scope.
import spark.implicits._

In [None]:
// Four-row, four-column example frame built from local tuples.
val df1 = Seq(
    (1L, "a", "foo", 3.0),
    (2L, "b", "bar", 4.0),
    (3L, "c", "foo", 5.0),
    (4L, "d", "bar", 7.0)
).toDF("col1", "col2", "col3", "col4")
// Print the frame that was just defined.
df1.show()

In [None]:
// Keep only the rows whose col1 is at most 2.
val df2 = df1.where($"col1" <= 2)
df2.show()

In [None]:
// Append the rows of df2 to those of df1 and print the result.
(df1 union df2).show()

In [None]:
// Keep only the rows whose col1 is greater than 2.
val df3 = df1.where($"col1" > 2)
df3.show()

In [None]:
// Union any number of DataFrames by folding `union` pairwise over the sequence.
Seq(df1, df2, df3).reduce((acc, next) => acc.union(next)).show()

In [2]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Local two-thread session used by the cells that follow.
val spark = {
    val builder = SparkSession.builder()
        .master("local[2]")
        .appName("Spark Union Example")
        .config("spark.some.config.option", "some-value")
    builder.getOrCreate()
}

// Brings `toDF` into scope for local Seqs.
import spark.implicits._

org.apache.spark.sql.SparkSession$implicits$@3fcbf20

In [3]:
// Two-row frame whose column order is (freq, word).
val df1 = Seq(1 -> "hello", 2 -> "world").toDF("freq", "word")
df1.show()

+----+-----+
|freq| word|
+----+-----+
|   1|hello|
|   2|world|
+----+-----+



null

In [4]:
// Three-row frame with the same column names as df1 but in the opposite order: (word, freq).
val df2 = Seq("how" -> 1000, "are" -> 300, "you" -> 100).toDF("word", "freq")
df2.show()

+----+----+
|word|freq|
+----+----+
| how|1000|
| are| 300|
| you| 100|
+----+----+



null

In [6]:
// Union by column name, so the differing column order of df1 and df2 does not matter.
Seq(df1, df2).reduce((acc, next) => acc.unionByName(next)).show()

+----+-----+
|freq| word|
+----+-----+
|   1|hello|
|   2|world|
|1000|  how|
| 300|  are|
| 100|  you|
+----+-----+



In [7]:
// With a single element, reduce returns that element unchanged.
Seq(df1).reduce((acc, next) => acc.unionByName(next)).show()

+----+-----+
|freq| word|
+----+-----+
|   1|hello|
|   2|world|
+----+-----+



## Comment 

`union` relies on column order rather than column names. 
This is the same as in SQL.
For columns whose types do not match, 
the super type is used.
This is really dangerous if you are not careful. 
It is suggested that you define a function called `unionByName` to handle this
(Spark 2.3+ already provides `Dataset.unionByName`, as used in the cells above).
```
def unionByName(df1, df2):
    ...
```

In [5]:
// Positional union: df2's (word, freq) values end up under df1's (freq, word) headers.
val df3 = df1 union df2
df3.show()

+----+-----+
|freq| word|
+----+-----+
|   1|hello|
|   2|world|
| how| 1000|
| are|  300|
| you|  100|
+----+-----+



null

In [6]:
// Inspect the column types of the positional union stored in df3.
df3.schema

[[StructField(freq,StringType,true), StructField(word,StringType,true)]]

In [8]:
// NOTE(review): SparkSession does not appear to offer a `union` method, which would explain
// the <console> error printed below; to union a list of DataFrames, use
// `Seq(df1, df2).reduce(_ union _)` instead. Confirm against the SparkSession API docs.
spark.union(Seq(df1, df2))

<console>: 104

## Tips

A way to avoid the ordering issue is to select columns
to make sure that columns of the 2 DataFrames have the same ordering.

In [15]:
// Re-select df2's columns in df1's column order so a positional union lines up.
df2.select(df1.columns.map(col): _*).show()

+----+----+
|freq|word|
+----+----+
|1000| how|
| 300| are|
| 100| you|
+----+----+



## Comment

An exception is raised if the numbers of columns of the 2 DataFrames do not match.

In [25]:
// Two-column frame: (freq, word).
val df1 = Seq(1 -> "hello", 2 -> "world").toDF("freq", "word")
df1.show()

+----+-----+
|freq| word|
+----+-----+
|   1|hello|
|   2|world|
+----+-----+



df1 = [freq: int, word: string]


[freq: int, word: string]

In [26]:
// Three-column frame: (word, freq, group) -- one more column than df1.
val df2 = List(
    ("how", 1000, 0),
    ("are", 300, 0),
    ("you", 100, 0)
).toDF("word", "freq", "group")
df2.show()

+----+----+-----+
|word|freq|group|
+----+----+-----+
| how|1000|    0|
| are| 300|    0|
| you| 100|    0|
+----+----+-----+



df2 = [word: string, freq: int ... 1 more field]


[word: string, freq: int ... 1 more field]

In [27]:
// Deliberately unions frames with 2 and 3 columns to show the resulting AnalysisException.
df1 union df2

Name: org.apache.spark.sql.AnalysisException
Message: Union can only be performed on tables with the same number of columns, but the first table has 2 columns and the second table has 3 columns;;
'Union
:- Project [_1#156 AS freq#159, _2#157 AS word#160]
:  +- LocalRelation [_1#156, _2#157]
+- Project [_1#171 AS word#175, _2#172 AS freq#176, _3#173 AS group#177]
   +- LocalRelation [_1#171, _2#172, _3#173]

StackTrace: 'Union
:- Project [_1#156 AS freq#159, _2#157 AS word#160]
:  +- LocalRelation [_1#156, _2#157]
+- Project [_1#171 AS word#175, _2#172 AS freq#176, _3#173 AS group#177]
   +- LocalRelation [_1#171, _2#172, _3#173]
  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:39)
  at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:91)
  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$13.apply(CheckAnalysis.scala:318)
  at org.apache.spark.sql.catalyst.analysis.C

## References

https://stackoverflow.com/questions/37612622/spark-unionall-multiple-dataframes

https://spark.apache.org/docs/latest/api/java/index.html?org/apache/spark/sql/Dataset.html

https://spark.apache.org/docs/latest/api/java/index.html?org/apache/spark/sql/functions.html

https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/Row.html