- To find duplicate records
- To check for record mismatches between the bronze, silver & gold tables

In [0]:
# List the files and folders under the DBFS FileStore tables directory.
display(dbutils.fs.ls("dbfs:/FileStore/tables/"))

path,name,size,modificationTime
dbfs:/FileStore/tables/Emp_Hash-1.csv,Emp_Hash-1.csv,3312,1733110041000
dbfs:/FileStore/tables/Emp_Hash-2.csv,Emp_Hash-2.csv,6365,1733125960000
dbfs:/FileStore/tables/Emp_Hash-3.csv,Emp_Hash-3.csv,6385,1733126482000
dbfs:/FileStore/tables/Emp_Hash.csv,Emp_Hash.csv,3310,1733108841000
dbfs:/FileStore/tables/Flatten Nested Array.json,Flatten Nested Array.json,3756,1718618620000
dbfs:/FileStore/tables/Generate_Random_Data/,Generate_Random_Data/,0,0
dbfs:/FileStore/tables/InterviewQuestions/,InterviewQuestions/,0,0
dbfs:/FileStore/tables/MarketPrice.csv,MarketPrice.csv,19528,1719656208000
dbfs:/FileStore/tables/MultiLineJSON.json/,MultiLineJSON.json/,0,0
dbfs:/FileStore/tables/MultiLineJSON01.json/,MultiLineJSON01.json/,0,0


In [0]:
# Read the initial-load CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
full_data_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("dbfs:/FileStore/tables/initload.csv")
)

# Preview only the first 10 rows instead of rendering the whole DataFrame.
display(full_data_df.limit(10))

Company_Name,Cust_Id,Cust_Name,Category,Start_Date,Start_Cust_Date,End_Date,Updated_Date,Cust_Value,Cust_Type,Exchange,Location,Last_Date_UTC,Cust_Category,Index
Sony,20,Naresh,Standard,3-Feb-23,1730000000000.0,1730000000000.0,1730000000000.0,30,STD,EUR,IND,1720000000000.0,SETTL,True
Sony,21,kamal,Standard,6-Feb-23,1730000000000.0,1730000000000.0,1730000000000.0,25,STD,EUR,IND,1720000000000.0,TOI,False
Sony,22,kajal,Standard,9-Feb-23,1730000000000.0,1730000000000.0,1730000000000.0,28,STD,EUR,IND,1720000000000.0,TOI,False
Sony,23,kiran,Standard,3-Jan-24,1730000000000.0,1730000000000.0,1730000000000.0,31,STD,EUR,IND,1720000000000.0,TOI,False
Sony,24,sam,Standard,8-Jan-24,1730000000000.0,1730000000000.0,1730000000000.0,34,STD,EUR,IND,1720000000000.0,TOI,False
Sony,25,sourab,Standard,9-Jan-24,1730000000000.0,1740000000000.0,1730000000000.0,37,STD,EUR,IND,1720000000000.0,TOI,True
Sony,26,jai,Upper,3-Mar-23,1730000000000.0,1740000000000.0,1730000000000.0,40,STD,EUR,IND,1720000000000.0,TOI,True
BPL,27,sree,Upper,6-Mar-23,1730000000000.0,1730000000000.0,1730000000000.0,43,STD,EUR,IND,1720000000000.0,SETTL,True
BPL,28,sreenath,Upper,9-Mar-23,1730000000000.0,1740000000000.0,1730000000000.0,46,STD,EUR,IND,1720000000000.0,SETTL,True
BPL,29,kamaesh,Upper,3-Jan-25,1740000000000.0,1740000000000.0,1730000000000.0,49,STD,EUR,IND,1720000000000.0,SETTL,False


In [0]:
# Write full_data_df in Delta format to the bronze path, replacing any
# existing data, and register it under the table name employee_bronze.
(
    full_data_df.write
    .format("delta")
    .mode("overwrite")
    .option("path", "/user/hive/warehouse/bronze")
    .saveAsTable("employee_bronze")
)

In [0]:
# Write full_data_df in Delta format to the silver path, replacing any
# existing data, and register it under the table name employee_silver.
(
    full_data_df.write
    .format("delta")
    .mode("overwrite")
    .option("path", "/user/hive/warehouse/silver")
    .saveAsTable("employee_silver")
)

In [0]:
# Write full_data_df in Delta format to the gold path, replacing any
# existing data, and register it under the table name employee_gold.
(
    full_data_df.write
    .format("delta")
    .mode("overwrite")
    .option("path", "/user/hive/warehouse/gold")
    .saveAsTable("employee_gold")
)

**Method 01**

In [0]:
%sql
-- Total number of rows in the bronze table.
SELECT
  COUNT(*)
FROM employee_bronze;

count(1)
50


In [0]:
%sql
-- Total number of rows in the silver table.
SELECT
  COUNT(*)
FROM employee_silver;

count(1)
50


In [0]:
%sql
-- Total number of rows in the gold table.
SELECT
  COUNT(*)
FROM employee_gold;

count(1)
50


**Method 02**

     SELECT COUNT(*) FROM CityTable;
        => Counts all rows, including rows with NULL values and duplicate rows.

     SELECT COUNT(City) FROM CityTable;
        => All records, including duplicates, but excluding NULL values.
        => Counts all non-NULL values of City, including duplicates.
        => Ignores only the NULL values.

     SELECT COUNT(DISTINCT City) FROM CityTable;
        => Unique records only, excluding NULL values.
        => Counts only unique (distinct) non-NULL values of City.
        => Removes duplicates before counting.
        => Also ignores NULL values.

In [0]:
%sql
-- Bronze: all rows, non-NULL Cust_Id values, and distinct non-NULL Cust_Id values.
-- A distinct count lower than the non-NULL count means some Cust_Id values repeat.
SELECT
  COUNT(*),
  COUNT(Cust_Id),
  COUNT(DISTINCT Cust_Id)
FROM employee_bronze;

count(1),count(Cust_Id),count(DISTINCT Cust_Id)
50,50,49


In [0]:
%sql
-- Silver: all rows, non-NULL Cust_Id values, and distinct non-NULL Cust_Id values.
-- A distinct count lower than the non-NULL count means some Cust_Id values repeat.
SELECT
  COUNT(*),
  COUNT(Cust_Id),
  COUNT(DISTINCT Cust_Id)
FROM employee_silver;

count(1),count(Cust_Id),count(DISTINCT Cust_Id)
50,50,49


In [0]:
%sql
-- Gold: all rows, non-NULL Cust_Id values, and distinct non-NULL Cust_Id values.
-- A distinct count lower than the non-NULL count means some Cust_Id values repeat.
SELECT
  COUNT(*),
  COUNT(Cust_Id),
  COUNT(DISTINCT Cust_Id)
FROM employee_gold;

count(1),count(Cust_Id),count(DISTINCT Cust_Id)
50,50,49


**Method 03**

     SELECT COUNT(*) FROM azure_dev.bronze_internal.products;

          azure_dev        =>  Name of the catalog
          bronze_internal  =>  Name of the schema (one per layer: bronze / silver / gold)
          products         =>  Name of the table

In [0]:
%sql
-- Row counts of the three layer tables, returned side by side in one row.
-- Each CTE is a global aggregate and therefore yields exactly one row,
-- so the cross joins produce a single combined row.
WITH bronze AS (
  SELECT COUNT(*) AS bronze_count FROM employee_bronze
),
silver AS (
  SELECT COUNT(*) AS silver_count FROM employee_silver
),
gold AS (
  SELECT COUNT(*) AS gold_count FROM employee_gold
)
SELECT
  bronze_count,
  silver_count,
  gold_count
FROM bronze
CROSS JOIN silver
CROSS JOIN gold;

bronze_count,silver_count,gold_count
50,50,50


**Method 04**

In [0]:
%sql
-- One summary row per layer: total rows, non-NULL Cust_Id values,
-- and distinct non-NULL Cust_Id values.

-- Bronze
SELECT
  'Bronze' AS Category,
  COUNT(*) AS Total,
  COUNT(Cust_Id) AS Cust_Id_Count,
  COUNT(DISTINCT Cust_Id) AS Distinct_Cust_Id_Count
FROM employee_bronze

UNION ALL

-- Silver
SELECT
  'Silver' AS Category,
  COUNT(*) AS Total,
  COUNT(Cust_Id) AS Cust_Id_Count,
  COUNT(DISTINCT Cust_Id) AS Distinct_Cust_Id_Count
FROM employee_silver

UNION ALL

-- Gold
SELECT
  'Gold' AS Category,
  COUNT(*) AS Total,
  COUNT(Cust_Id) AS Cust_Id_Count,
  COUNT(DISTINCT Cust_Id) AS Distinct_Cust_Id_Count
FROM employee_gold

Category,Total,Cust_Id_Count,Distinct_Cust_Id_Count
Bronze,50,50,49
Silver,50,50,49
Gold,50,50,49
