# Joining and Appending DataFrames

In [1]:
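# Optional: uncomment the next two lines if `import pyspark` fails because
# Spark is not on the Python path (requires the findspark package).
# import findspark
# findspark.init()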

import pyspark 
from pyspark.sql import SparkSession

# Get the existing SparkSession or create a new one, registered under this
# notebook's application name. The bare `spark` expression on the last line
# makes the notebook display the session summary.
spark = (
    SparkSession.builder
    .appName("JoinAndAggregatingData")
    .getOrCreate()
)
spark

## Generate play data

In [2]:
# Toy rows of (name, id, eats_plants). "human" (id 4) is the only animal that
# also appears in the meat-eater data, so it is the only row the joins match.
valuesP = [
    ("koala", 1, "yes"),
    ("caterpillar", 2, "yes"),
    ("deer", 3, "yes"),
    ("human", 4, "yes"),
]
eats_plants = spark.createDataFrame(
    valuesP,
    schema=["name", "id", "eats_plants"],
)
eats_plants.show()

+-----------+---+-----------+
|       name| id|eats_plants|
+-----------+---+-----------+
|      koala|  1|        yes|
|caterpillar|  2|        yes|
|       deer|  3|        yes|
|      human|  4|        yes|
+-----------+---+-----------+



In [3]:
# Toy rows of (name, id, eats_meat). "human" (id 4) is shared with the
# plant-eater data; the other three animals appear only here.
valuesM = [
    ("shark", 5, "yes"),
    ("lion", 6, "yes"),
    ("tiger", 7, "yes"),
    ("human", 4, "yes"),
]
eats_meat = spark.createDataFrame(
    valuesM,
    schema=["name", "id", "eats_meat"],
)
eats_meat.show()

+-----+---+---------+
| name| id|eats_meat|
+-----+---+---------+
|shark|  5|      yes|
| lion|  6|      yes|
|tiger|  7|      yes|
|human|  4|      yes|
+-----+---+---------+



## Appends

In [4]:
# Append using the union function: union() stacks the rows of the second
# DataFrame under the rows of the first and does NOT remove duplicates.
# new_df is just a second reference to eats_plants (not a copy), so every
# row of eats_plants appears twice in df_concat.
# NOTE: per the PySpark docs, union() matches columns by position, not by
# name; unionByName() is the name-based variant.
new_df = eats_plants
df_concat = eats_plants.union(new_df)

In [5]:
# Show the original frame, then print a (label, row count, column count)
# tuple as a baseline to compare against the appended frame.
eats_plants.show()
print((
    "eats_plants df counts:",
    eats_plants.count(),
    len(eats_plants.columns),
))

+-----------+---+-----------+
|       name| id|eats_plants|
+-----------+---+-----------+
|      koala|  1|        yes|
|caterpillar|  2|        yes|
|       deer|  3|        yes|
|      human|  4|        yes|
+-----------+---+-----------+

('eats_plants df counts:', 4, 3)


In [6]:
# Show the appended frame, then print a (label, row count, column count)
# tuple: the row count doubles while the column count stays the same.
df_concat.show()
print((
    "df_concat counts:",
    df_concat.count(),
    len(df_concat.columns),
))

+-----------+---+-----------+
|       name| id|eats_plants|
+-----------+---+-----------+
|      koala|  1|        yes|
|caterpillar|  2|        yes|
|       deer|  3|        yes|
|      human|  4|        yes|
|      koala|  1|        yes|
|caterpillar|  2|        yes|
|       deer|  3|        yes|
|      human|  4|        yes|
+-----------+---+-----------+

('df_concat counts:', 8, 3)


## Inner Join

An inner join keeps only the rows whose join keys (here `name` and `id`) appear in both tables.

In [7]:
# Keep only the rows whose (name, id) pair exists in both DataFrames.
# Joining on a list of column names yields a single copy of each key column.
inner_join = eats_plants.join(
    eats_meat,
    on=["name", "id"],
    how="inner",
)
inner_join.show()

+-----+---+-----------+---------+
| name| id|eats_plants|eats_meat|
+-----+---+-----------+---------+
|human|  4|        yes|      yes|
+-----+---+-----------+---------+



## Left Joins

A left join keeps every row from the left table and adds the right table's columns; rows with no match in the right table get `null` in those columns.

In [8]:
# Keep every row of eats_plants; the eats_meat column is null wherever the
# (name, id) pair has no match in eats_meat.
left_join = eats_plants.join(
    eats_meat,
    on=["name", "id"],
    how="left",
)
left_join.show()

+-----------+---+-----------+---------+
|       name| id|eats_plants|eats_meat|
+-----------+---+-----------+---------+
|       deer|  3|        yes|     null|
|      human|  4|        yes|      yes|
|      koala|  1|        yes|     null|
|caterpillar|  2|        yes|     null|
+-----------+---+-----------+---------+



## Conditional Joins

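A conditional join applies additional filtering logic on top of the underlying join. Here we perform a left join and then keep only the rows that found no match in the right table, i.e. the animals that eat plants but not meat.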

In [9]:
# Left join, then keep only the rows that found no partner in eats_meat:
# for unmatched rows the right-hand side's key column is null, so filtering
# on eats_meat.name being null leaves the plant eaters that do not eat meat.
conditional_join = (
    eats_plants
    .join(eats_meat, on=["name", "id"], how="left")
    .filter(eats_meat.name.isNull())
)
conditional_join.show()

+-----------+---+-----------+---------+
|       name| id|eats_plants|eats_meat|
+-----------+---+-----------+---------+
|       deer|  3|        yes|     null|
|      koala|  1|        yes|     null|
|caterpillar|  2|        yes|     null|
+-----------+---+-----------+---------+



## Right Join

A right join keeps every row from the right table and adds the left table's columns; rows with no match in the left table get `null` in those columns.

In [10]:
# Keep every row of eats_meat; the eats_plants column is null wherever the
# (name, id) pair has no match in eats_plants.
right_join = eats_plants.join(
    eats_meat,
    on=["name", "id"],
    how="right",
)
right_join.show()

+-----+---+-----------+---------+
| name| id|eats_plants|eats_meat|
+-----+---+-----------+---------+
|shark|  5|       null|      yes|
|human|  4|        yes|      yes|
|tiger|  7|       null|      yes|
| lion|  6|       null|      yes|
+-----+---+-----------+---------+



## Full Outer Joins

A full outer join keeps all rows from both tables, filling in `null` wherever one side has no match.

In [11]:
# Keep every row from both DataFrames; whichever side has no match for a
# (name, id) pair contributes null for its own column.
full_outer_join = eats_plants.join(
    eats_meat,
    on=["name", "id"],
    how="full",
)
full_outer_join.show()

+-----------+---+-----------+---------+
|       name| id|eats_plants|eats_meat|
+-----------+---+-----------+---------+
|       deer|  3|        yes|     null|
|      shark|  5|       null|      yes|
|      human|  4|        yes|      yes|
|      tiger|  7|       null|      yes|
|       lion|  6|       null|      yes|
|      koala|  1|        yes|     null|
|caterpillar|  2|        yes|     null|
+-----------+---+-----------+---------+

