In [0]:
%sql
-- cretae one db name as avd
-- but the flow is Metastore → Catalog → Schema → (Tables, Functions, Volumes)
-- we have cretaed volume name as raw_data inside this avd
-- and uploaded two files there

CREATE DATABASE IF NOT EXISTS avd;

In [0]:
%sql

-- Create a Delta table directly from the files we have uploaded into volume
-- Previously, we used a Volume → read into DataFrame → df.write.saveAsTable().
-- Now, we are creating the Delta table directly using SQL.

CREATE TABLE IF NOT EXISTS avd.emp(
  id INT,
  name STRING,
  city STRING,
  salary DOUBLE
)
USING DELTA

In [0]:
%sql 

-- Load data from the volume (employees_2.csv) into the avd.emp Delta table using COPY INTO.
-- Even though the table schema is already defined, CSV files store all values as strings.
-- COPY INTO does not automatically convert these string values to the table's data types.
-- Therefore, we explicitly cast each column (like id::integer, salary::double) 
-- to ensure the data matches the Delta table schema before loading.

COPY INTO avd.emp
FROM (
  SELECT
        id::integer,
        name::string,
        city::string,
        salary::double
  FROM "/Volumes/avd_workspace1/avd/raw_data/employees_2.csv")
FILEFORMAT = CSV
FORMAT_OPTIONS('header'='true')

num_affected_rows,num_inserted_rows,num_skipped_corrupt_files
0,0,0


In [0]:
%sql

select * from avd.emp

id,name,city,salary
1,A,DELHI,10000.0
2,B,MUMBAI,20000.0
3,C,CHENNAI,30000.0
4,D,PUNE,40000.0
5,E,BBSR,50000.0


In [0]:

%sql

-- Running the COPY INTO command again will not insert any new records.
-- This behavior is known as *idempotency*.
-- Idempotency means that even if you run the same command multiple times,
-- the result remains the same and no duplicate data is loaded.

COPY INTO avd.emp
FROM (
  SELECT
        id::integer,
        name::string,
        city::string,
        salary::double
  FROM "/Volumes/avd_workspace1/avd/raw_data/employees_2.csv")
FILEFORMAT = CSV
FORMAT_OPTIONS('header'='true')

num_affected_rows,num_inserted_rows,num_skipped_corrupt_files
0,0,0


In [0]:
%sql
-- what we have done above same commands we are doing for stating table 
-- like creat table add data into that tables 

CREATE TABLE IF NOT EXISTS avd.emp_stage(
  id INT,
  name STRING,
  city STRING,
  salary DOUBLE
)
USING DELTA

In [0]:
%sql

COPY INTO avd.emp_stage
FROM (
  SELECT
        id::integer,
        name::string,
        city::string,
        salary::double
  FROM "/Volumes/avd_workspace1/avd/raw_data/employees_staging.csv")
FILEFORMAT = CSV
FORMAT_OPTIONS('header'='true')

num_affected_rows,num_inserted_rows,num_skipped_corrupt_files
0,0,0


In [0]:
%sql 

select * from avd.emp_stage

id,name,city,salary
1,A,DELHI,10000.0
2,B,MUMBAI,20000.0
5,E,BBSR,50000.0
3,C,PUNE,50000.0
6,F,BBSR,60000.0


In [0]:
"""
In Databricks (and Delta Lake), MERGE INTO is a powerful command used to upsert data — meaning it can INSERT, UPDATE, or DELETE records in a Delta table based on a matching condition.

✅ What is MERGE INTO?

MERGE INTO is an ACID-compliant operation that allows you to combine data from a source dataset with a target Delta table.

It’s commonly used for:
    
Upserts (update existing + insert new)
Slowly Changing Dimensions (SCD) — Type 1 & Type 2
Incremental data loads
Change Data Capture (CDC) workflows
Deduplication
"""

'\nIn Databricks (and Delta Lake), MERGE INTO is a powerful command used to upsert data — meaning it can INSERT, UPDATE, or DELETE records in a Delta table based on a matching condition.\n\n✅ What is MERGE INTO?\n\nMERGE INTO is an ACID-compliant operation that allows you to combine data from a source dataset with a target Delta table.\n\nIt’s commonly used for:\n    \nUpserts (update existing + insert new)\nSlowly Changing Dimensions (SCD) — Type 1 & Type 2\nIncremental data loads\nChange Data Capture (CDC) workflows\nDeduplication\n'

In [0]:
"""
We have two tables:
1. emp        → target table  
2. emp_stage  → source table  

The source table receives new data every day.  
Our job is to use this daily data to update the target table.

Based on the id column, we decide whether to insert, update, or delete:

1. If an id exists in the source but not in the target → insert it into the target.
2. If an id exists in both source and target → update the target with the latest data from the source.
3. If an id exists in the target but not in the source → delete it from the target.

now for this we use MERGE INTO command
"""



'\nWe have two tables:\n1. emp        → target table  \n2. emp_stage  → source table  \n\nThe source table receives new data every day.  \nOur job is to use this daily data to update the target table.\n\nBased on the id column, we decide whether to insert, update, or delete:\n\n1. If an id exists in the source but not in the target → insert it into the target.\n2. If an id exists in both source and target → update the target with the latest data from the source.\n3. If an id exists in the target but not in the source → delete it from the target.\n'

In [0]:
%sql

MERGE INTO avd.emp as target 
USING avd.emp_stage as source
ON target.id = source.id

-- 2. If an id exists in both source and target → update the target with the latest data from the source.
--  for this we use WHEN MATCHED THEN UPDATE
WHEN MATCHED THEN
UPDATE 
      SET
        target.name = source.name,
        target.city = source.city,
        target.salary = source.salary

-- 1. If an id exists in the source but not in the target → insert it into the target.
-- for this we used WHEN NOT MATCHED THEN INSERT
WHEN NOT MATCHED THEN
INSERT (target.id,target.name,target.city,target.salary)
VALUES(source.id,source.name,source.city,source.salary)

--3. If an id exists in the target but not in the source → delete it from the target.
-- for this we use WHEN NOT MATCHED BY SOURCE THEN DELETE
WHEN NOT MATCHED BY SOURCE THEN 
DELETE

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
6,4,1,1


In [0]:
%sql
-- now we are seeing the changes
-- id 6 wast inserted 
-- id 4 was deleted 
-- and remaining is updated

select * from avd.emp

id,name,city,salary
1,A,DELHI,10000.0
2,B,MUMBAI,20000.0
5,E,BBSR,50000.0
3,C,PUNE,50000.0
6,F,BBSR,60000.0


In [0]:
"""
SCD — Slowly Changing Dimensions  
This means handling data that changes over time.

There are two types:
1. SCD Type 1
2. SCD Type 2

SCD Type 1:
- We do NOT maintain any history of old values.
- Whatever new data comes simply overwrites the existing data.
- In a MERGE command, SCD Type 1 is basically the same as normal upsert,
  but we REMOVE the 3rd step (the delete step).
  → Meaning: we do NOT delete records that are missing in the source table.
- Only insert and update happen.
- No history, no versioning — just keep the latest values.
- when we update table we lost the lod data -
"""

'\nSCD — Slowly Changing Dimensions  \nThis means handling data that changes over time.\n\nThere are two types:\n1. SCD Type 1\n2. SCD Type 2\n\nSCD Type 1:\n- We do NOT maintain any history of old values.\n- Whatever new data comes simply overwrites the existing data.\n- In a MERGE command, SCD Type 1 is basically the same as normal upsert,\n  but we REMOVE the 3rd step (the delete step).\n  → Meaning: we do NOT delete records that are missing in the source table.\n- Only insert and update happen.\n- No history, no versioning — just keep the latest values.\n'

In [0]:
%sql
-- we droping this table becasue we have to create same table once again for scdtype1 - in which we upload new data
drop table avd.emp_stage

In [0]:
%sql
-- what we have done above same commands we are doing for stating table 
-- like creat table add data into that tables 

CREATE TABLE IF NOT EXISTS avd.emp_stage(
  id INT,
  name STRING,
  city STRING,
  salary DOUBLE
)
USING DELTA

In [0]:
%sql

COPY INTO avd.emp_stage
FROM (
  SELECT
        id::integer,
        name::string,
        city::string,
        salary::double
-- here we use new data file
  FROM "/Volumes/avd_workspace1/avd/raw_data/employees_staging_SCD1.csv")
FILEFORMAT = CSV
FORMAT_OPTIONS('header'='true')

num_affected_rows,num_inserted_rows,num_skipped_corrupt_files
2,2,0


In [0]:
%sql 
-- drop table
drop table avd.emp

In [0]:
%sql
-- create emp table
CREATE TABLE IF NOT EXISTS avd.emp(
  id INT,
  name STRING,
  city STRING,
  salary DOUBLE
)
USING DELTA

In [0]:
%sql
-- load data into that table
COPY INTO avd.emp
FROM (
  SELECT
        id::integer,
        name::string,
        city::string,
        salary::double
  FROM "/Volumes/avd_workspace1/avd/raw_data/employees_2.csv")
FILEFORMAT = CSV
FORMAT_OPTIONS('header'='true')

num_affected_rows,num_inserted_rows,num_skipped_corrupt_files
5,5,0


In [0]:
%sql 
--now we impelement the SCD Type one 

MERGE INTO avd.emp as target
USING avd.emp_stage as source
ON target.id = source.id

WHEN MATCHED THEN 
UPDATE 
      SET
        target.name = source.name,
        target.city = source.city,
        target.salary = source.salary

WHEN NOT MATCHED THEN
INSERT(target.id,target.name,target.city,target.salary)
VALUES(source.id,source.name,source.city,source.salary)

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
2,1,0,1


In [0]:
%sql

select * from avd.emp

id,name,city,salary
1,A,DELHI,10000.0
2,B,MUMBAI,20000.0
4,D,PUNE,40000.0
5,E,BBSR,50000.0
3,C,PUNE,50000.0
6,F,BBSR,60000.0


In [0]:
%sql 
-- Our source and target tables are unchanged.
-- When we run the SCD Type 1 MERGE script again,
-- the output shows num_update_rows = 2.
-- But in reality, no data in the table has actually changed.
-- This means it is reporting “false updates” even though the values are the same.

MERGE INTO avd.emp as target
USING avd.emp_stage as source
ON target.id = source.id

WHEN MATCHED THEN 
UPDATE 
      SET
        target.name = source.name,
        target.city = source.city,
        target.salary = source.salary

WHEN NOT MATCHED THEN
INSERT(target.id,target.name,target.city,target.salary)
VALUES(source.id,source.name,source.city,source.salary)

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
2,2,0,0


In [0]:
%sql

-- To avoid false updates, we add a condition inside the MATCHED clause.
-- This ensures that an update happens ONLY when at least one column value
-- (name, city, or salary) is actually different between source and target.
-- now in output we are see all then values are zero - num of updated rows

MERGE INTO avd.emp AS target
USING avd.emp_stage AS source
ON target.id = source.id

-- Update only when IDs match AND any of the fields are different.
WHEN MATCHED AND (
        target.name   != source.name OR
        target.city   != source.city OR
        target.salary != source.salary
) THEN 
UPDATE SET
        target.name   = source.name,
        target.city   = source.city,
        target.salary = source.salary

-- Insert new rows when the ID does not exist in the target table.
WHEN NOT MATCHED THEN
INSERT (id, name, city, salary)
VALUES (source.id, source.name, source.city, source.salary);


num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
0,0,0,0


In [0]:
# in above coe we have only three columns that is why we can do comparision but if number of colums is more 
# then we can not do compaririosn 
# for that one optimization we can do that 
# we create on has key using multiple colum

In [0]:
"""
SCD Type2 
"""

In [0]:
%sql 

drop table avd.target_tbl

In [0]:
%sql 

drop table avd.source_tbl

In [0]:
%sql
-- Step 1: Create Target Table
CREATE OR REPLACE TABLE avd.target_tbl (
  id INT,
  name STRING,
  city STRING,
  salary INT,
  start_date TIMESTAMP,
  end_date TIMESTAMP,
  active_status STRING
) USING DELTA;

In [0]:
%sql
-- Load initial target data
INSERT INTO avd.target_tbl (id, name, city, salary, start_date, end_date, active_status)
VALUES
  (1, 'A', 'DELHI', 10000, current_timestamp(), TIMESTAMP('9999-12-31 00:00:00'), 'Y'),
  (2, 'B', 'MUMBAI', 20000, current_timestamp(), TIMESTAMP('9999-12-31 00:00:00'), 'Y'),
  (3, 'C', 'CHENNAI', 30000, current_timestamp(), TIMESTAMP('9999-12-31 00:00:00'), 'Y'),
  (4, 'D', 'PUNE', 40000, current_timestamp(), TIMESTAMP('9999-12-31 00:00:00'), 'Y'),
  (5, 'E', 'BBSR', 50000, current_timestamp(), TIMESTAMP('9999-12-31 00:00:00'), 'Y');

num_affected_rows,num_inserted_rows
5,5


In [0]:
%sql
-- Step 2: Create Source Table
CREATE OR REPLACE TABLE avd.source_tbl (
  id INT,
  name STRING,
  city STRING,
  salary INT
) USING DELTA;

In [0]:
%sql
-- Load source data
INSERT INTO avd.source_tbl (id, name, city, salary)
VALUES
  (3, 'C', 'PUNE', 50000),
  (6, 'F', 'BBSR', 60000);

num_affected_rows,num_inserted_rows
2,2


In [0]:
%sql

-- applying SCD type 2

MERGE INTO avd.target_tbl AS target
USING avd.source_tbl as source
on target.id = source.id

WHEN MATCHED THEN
UPDATE
      SET
        target.end_date = current_timestamp,
        target.active_status = "N"

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
1,1,0,0


In [0]:
%sql

select * from avd.target_tbl

id,name,city,salary,start_date,end_date,active_status
1,A,DELHI,10000,2025-12-04T11:05:22.103097Z,9999-12-31T00:00:00Z,Y
2,B,MUMBAI,20000,2025-12-04T11:05:22.103097Z,9999-12-31T00:00:00Z,Y
4,D,PUNE,40000,2025-12-04T11:05:22.103097Z,9999-12-31T00:00:00Z,Y
5,E,BBSR,50000,2025-12-04T11:05:22.103097Z,9999-12-31T00:00:00Z,Y
3,C,CHENNAI,30000,2025-12-04T11:05:22.103097Z,2025-12-04T11:05:44.612108Z,N


In [0]:
%sql

MERGE INTO avd.target_tbl AS target
USING avd.source_tbl as source
on target.id = source.id and target.active_status = "Y"

WHEN NOT MATCHED THEN 
INSERT (id,name,city,salary,start_date, end_date, active_status)
VALUES (source.id,source.name,source.city,source.salary,current_timestamp(), timestamp('9999-12-31 00:00:00'), "Y")

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
2,0,0,2


In [0]:
%sql

select * from avd.target_tbl

id,name,city,salary,start_date,end_date,active_status
1,A,DELHI,10000,2025-12-04T11:05:22.103097Z,9999-12-31T00:00:00Z,Y
2,B,MUMBAI,20000,2025-12-04T11:05:22.103097Z,9999-12-31T00:00:00Z,Y
4,D,PUNE,40000,2025-12-04T11:05:22.103097Z,9999-12-31T00:00:00Z,Y
5,E,BBSR,50000,2025-12-04T11:05:22.103097Z,9999-12-31T00:00:00Z,Y
3,C,CHENNAI,30000,2025-12-04T11:05:22.103097Z,2025-12-04T11:05:44.612108Z,N
3,C,PUNE,50000,2025-12-04T11:06:14.370256Z,9999-12-31T00:00:00Z,Y
6,F,BBSR,60000,2025-12-04T11:06:14.370256Z,9999-12-31T00:00:00Z,Y


In [0]:
%sql

--above code written below in signle code

MERGE INTO avd.target_tbl AS target
USING avd.source_tbl AS source
ON target.id = source.id

WHEN MATCHED 
     AND target.active_status = "Y"
     AND (
           target.name   <> source.name
        OR target.city   <> source.city
        OR target.salary <> source.salary
     )
THEN 
    UPDATE SET
        target.end_date = current_timestamp(),
        target.active_status = "N"

WHEN NOT MATCHED 
THEN INSERT (
        id,
        name,
        city,
        salary,
        start_date,
        end_date,
        active_status
     )
     VALUES (
        source.id,
        source.name,
        source.city,
        source.salary,
        current_timestamp(),
        timestamp('9999-12-31 00:00:00'),
        "Y"
     );


Example 1 - SCD Type 2

In [0]:
%sql
CREATE OR REPLACE TABLE avd.stg_cust (
  id INT,
  name STRING,
  city STRING,
  salary INT
) USING DELTA;

In [0]:
%sql
INSERT INTO avd.stg_cust (id, name, city, salary) VALUES
(101, 'Raj',  'Delhi',  50000),
(202, 'Amit', 'Pune',   35000),
(303, 'Neha', 'Nashik', 42000);


num_affected_rows,num_inserted_rows
3,3


In [0]:
%sql
-- Step 1: Create Target Table
CREATE OR REPLACE TABLE avd.dim_customer (
  id INT,
  name STRING,
  city STRING,
  salary INT,
  start_date TIMESTAMP,
  end_date TIMESTAMP,
  active_status STRING
) USING DELTA;

In [0]:
%sql
INSERT INTO avd.dim_customer 
(id, name, city, salary, start_date, end_date, active_status) 
VALUES
(101, 'Raj',  'Mumbai', 50000, '2023-01-01', '9999-12-31', 'Y'),
(202, 'Amit', 'Pune',   35000, '2023-05-01', '9999-12-31', 'Y');


num_affected_rows,num_inserted_rows
2,2


In [0]:
%sql

MERGE INTO avd.dim_customer as t
USING avd.stg_cust as s 
on t.id = s.id 

WHEN MATCHED AND t.active_status = "Y" AND
( t.name <> s.name OR 
  t.city <> s.city OR
  t.salary <> s.salary
)
THEN UPDATE 
      SET
          t.end_date = current_timestamp(),
          t.active_status = "N"
when not matched then
insert (
  t.id, 
  t.name, 
  t.city, 
  t.salary, 
  t.start_date, 
  t.end_date, 
  t.active_status
)
values (
  s.id, 
  s.name, 
  s.city, 
  s.salary, 
  current_timestamp(), 
  timestamp('9999-12-31 00:00:00'), 
  "Y"
)

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
2,1,0,1


In [0]:
%sql 

select * from avd.dim_customer

id,name,city,salary,start_date,end_date,active_status
202,Amit,Pune,35000,2023-05-01T00:00:00Z,9999-12-31T00:00:00Z,Y
101,Raj,Mumbai,50000,2023-01-01T00:00:00Z,2025-12-04T12:29:20.411997Z,N
303,Neha,Nashik,42000,2025-12-04T12:29:20.411997Z,9999-12-31T00:00:00Z,Y
