# Merge

In [2]:
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable


# Build the Spark configuration for a Delta-enabled session that talks to an
# S3-compatible object store through the S3A connector and to a Hive metastore.
conf = SparkConf()

conf.setAppName("Sample Merge")

# The endpoint and credentials can be overridden through environment variables
# so that real secrets do not have to be committed with the notebook. The
# fallbacks are the original sample values, so the notebook still runs as-is.
conf.set("spark.hadoop.fs.s3a.endpoint", os.environ.get("S3A_ENDPOINT", "http://172.21.121.140:9000"))
conf.set("spark.hadoop.fs.s3a.access.key", os.environ.get("S3A_ACCESS_KEY", "chapolin"))
conf.set("spark.hadoop.fs.s3a.secret.key", os.environ.get("S3A_SECRET_KEY", "mudar@123"))
# SparkConf values are strings; pass "true" explicitly instead of relying on
# str(True) being accepted downstream.
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')

# Register the Delta Lake SQL extension and catalog.
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Hive metastore endpoint used by enableHiveSupport() below.
conf.set("hive.metastore.uris", "thrift://metastore:9083")

# getOrCreate() reuses an already-running session if one exists.
spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

## Create dataframes

### Data for insert

In [3]:
# Seed rows as (product_name, price) pairs; these become the table's initial content.
data_insert = [("Product A", 100), ("Product B", 200), ("Product C", 700)]

### Data for update

In [4]:
# Change set as (product_name, price) pairs: "Product A" is also present in
# data_insert (with a different price), "Product D" is not.
data_update = [("Product A", 170), ("Product D", 777)]

### Add schema

In [6]:
# Two nullable columns: the product name (string) and its price (integer).
schema = (
    StructType()
    .add("product_name", StringType(), True)
    .add("price", IntegerType(), True)
)

In [10]:
# Turn both row lists into DataFrames that share the same schema.
df_insert, df_update = (
    spark.createDataFrame(rows, schema) for rows in (data_insert, data_update)
)

### DataFrame to insert

In [11]:
# Print the rows held in df_insert.
df_insert.show()

+------------+-----+
|product_name|price|
+------------+-----+
|   Product A|  100|
|   Product B|  200|
|   Product C|  700|
+------------+-----+



In [None]:
### DataFrame with the updates

In [12]:
# Print the rows held in df_update.
df_update.show()

+------------+-----+
|product_name|price|
+------------+-----+
|   Product A|  170|
|   Product D|  777|
+------------+-----+



In [13]:
# Write the seed rows to the Delta table location.
# NOTE: append mode adds these rows on every execution, so re-running this
# cell duplicates the seed data.
df_insert.write.save('s3a://bronze/tb_merge', format="delta", mode="append")

In [15]:
# Read the table back and print it. The DataFrame is bound to `df` before
# calling show(): show() only prints and returns None, so assigning its result
# would leave `df` unusable.
df = spark.read.format("delta").load('s3a://bronze/tb_merge')
df.show()

+------------+-----+
|product_name|price|
+------------+-----+
|   Product C|  700|
|   Product A|  100|
|   Product B|  200|
+------------+-----+



In [16]:
# Location of the Delta table written above.
table_path = 's3a://bronze/tb_merge'
# DeltaTable handle for that path; the merge below is issued through it.
delta_table = DeltaTable.forPath(spark, table_path)

## Apply merge

In [18]:
# Upsert df_update into the Delta table, pairing rows on product_name.
merge_condition = "target.product_name = source.product_name"
price_update = {"price": "source.price"}
new_row_values = {"product_name": "source.product_name", "price": "source.price"}

upsert = delta_table.alias("target").merge(df_update.alias("source"), merge_condition)
# Products already in the table only get their price overwritten...
upsert = upsert.whenMatchedUpdate(set=price_update)
# ...while products missing from the table are inserted as new rows.
upsert = upsert.whenNotMatchedInsert(values=new_row_values)
upsert.execute()

In [19]:
# Re-read the table to inspect the result of the merge. The DataFrame is bound
# to `df` before calling show(): show() only prints and returns None.
# table_path is the same 's3a://bronze/tb_merge' location defined earlier.
df = spark.read.format("delta").load(table_path)
df.show()

+------------+-----+
|product_name|price|
+------------+-----+
|   Product C|  700|
|   Product B|  200|
|   Product A|  170|
|   Product D|  777|
+------------+-----+

