#### Unit Test

**How to count Null's, Non-Null's, Blank, Non-Blank values**

     a) Count of NULL values
     b) Count of non-NULL values
     c) Count of blank (empty string) values
     d) Count of non-null and non-blank values
     e) Check for any of the specified columns is null's
     f) How to find the number of rows where all columns are NULL?
     g) How to count the number of NULL values in a column?
     h) All in one summary
     i) All in one summary using SUM(CASE … END)
     j) Grouping by status

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("OBJECT_ID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("department", StringType(), True),
    StructField("Customer_ID", IntegerType(), True),
    StructField("Change_Date", StringType(), True),
    StructField("Load_Date", StringType(), True),
    StructField("Status", StringType(), True),
    StructField("description", StringType(), True),
    StructField("start_date_source", StringType(), True),
    StructField("start_date_bronze", StringType(), True)
])


In [0]:
data = [(583069, "Harish", None, 13681832, None, '2025-06-02', None, 'E-Mail', 1724256609000, None),
        (510102, "", "HR", 40685884, '2025-04-02T04:15:05Z', '2025-06-02', 'Finished', 'Internet', 1724256609000, None),
        (506654, "Basha", "", None, '2025-04-02T04:15:05Z', '2025-06-02', 'Not Relevant', 'Social Media', 1724256609000, None),
        (583195, None, "Finance", 12619703, None, '2025-06-02', 'Started', 'Messaging', 1724256609000, None),
        (470450, "Venky", "IT", 8541938, '2025-04-02T07:59:14Z', '2025-06-02', 'Not Relevant', 'IoT', 1724256609000, None),
        (558253, "", None, 2269299, None, '2025-06-02', 'Open', None, 1724256609000, None),
        (None, "Krishna", "Sales", None, '2025-04-02T06:12:18Z', '2025-06-02', None, 'Manual data entry', 1724256609000, None),
        (583181, "Kiran", "Marketing", 39714449, None, '2025-06-02', 'Finished', 'Other', 1724256609000, None),
        (583119, "Hitesh", None, 10183510, '2025-04-02T04:15:13Z', None, 'Open', 'Telephony', 1724256609000, None),
        (577519, "", "Accounts", None, '2025-04-02T08:27:50Z', '2025-06-02', 'Not Relevant', None, 1724256609000, None),
        (583151, "Sushma", "Accounts", 40442877, None, '2025-06-02', 'Open', 'Fax', 1724256609000, None),
        (583167, None, "Admin", 16474490, '2025-04-02T09:07:27Z', None, 'Not Relevant', 'Feedback', 1724256609000, None),
        (583162, "Buvan", "IT", 7447339, '2025-04-02T16:46:07Z', None, 'Finished', 'WorkZone', 1724256609000, None),
        (575216, "Mohan", "Admin", 17258071, '2025-04-02T01:51:03Z', '2025-06-02', 'Open', 'IOT', 1724256609000, None),
        (None, None, None, None, None, None, None, None, None, None),
        (583173, "Lohith", "Finance", 15113750, None, '2025-06-02', 'Finished', None, 1724256609000, None),
        (583099, "Loba", "Testing", 40505376, '2025-04-02T19:54:50Z', None, 'Started', None, 1724256609000, None)
       ]

df_dev = spark.createDataFrame(data, schema)
display(df_dev)

OBJECT_ID,Name,department,Customer_ID,Change_Date,Load_Date,Status,description,start_date_source,start_date_bronze
583069.0,Harish,,13681832.0,,2025-06-02,,E-Mail,1724256609000.0,
510102.0,,HR,40685884.0,2025-04-02T04:15:05Z,2025-06-02,Finished,Internet,1724256609000.0,
506654.0,Basha,,,2025-04-02T04:15:05Z,2025-06-02,Not Relevant,Social Media,1724256609000.0,
583195.0,,Finance,12619703.0,,2025-06-02,Started,Messaging,1724256609000.0,
470450.0,Venky,IT,8541938.0,2025-04-02T07:59:14Z,2025-06-02,Not Relevant,IoT,1724256609000.0,
558253.0,,,2269299.0,,2025-06-02,Open,,1724256609000.0,
,Krishna,Sales,,2025-04-02T06:12:18Z,2025-06-02,,Manual data entry,1724256609000.0,
583181.0,Kiran,Marketing,39714449.0,,2025-06-02,Finished,Other,1724256609000.0,
583119.0,Hitesh,,10183510.0,2025-04-02T04:15:13Z,,Open,Telephony,1724256609000.0,
577519.0,,Accounts,,2025-04-02T08:27:50Z,2025-06-02,Not Relevant,,1724256609000.0,


In [0]:
df_dev.write \
      .format("delta") \
      .mode("overwrite") \
      .option("path", "/user/hive/warehouse/bronze_Nulls") \
      .saveAsTable("tbl_NonNull_Nulls_Blank")

#### a) PySpark

**Null_Count:** Number of actual nulls

**NotNull_Count:** includes both blanks and real values

**Blank_Count:** number of empty strings ("")

**NonNull_And_Not_Blank** = NotNull_Count - Blank_Count

In [0]:
from pyspark.sql.functions import lit, count, when, col

def count_values(df, column):
    return df_dev.select(
        count(when(col(column).isNull(), column)).alias("Null_Count"),
        count(when(col(column).isNotNull(), column)).alias("NotNull_Count"),
        count(when(col(column) == "", column)).alias("Blank_Count"),
        count(lit(1)).alias("Total_Rows")
        )

In [0]:
display(count_values(df_dev, "Name"))
display(count_values(df_dev, "department"))

Null_Count,NotNull_Count,Blank_Count,Total_Rows
3,14,3,17


Null_Count,NotNull_Count,Blank_Count,Total_Rows
4,13,1,17


#### b) SQL

**a) How to check for NULL values in a column?**
- Use the `IS NULL` or `IS NOT NULL` condition in the WHERE clause.

In [0]:
%sql
SELECT COUNT(*)
FROM tbl_NonNull_Nulls_Blank
WHERE OBJECT_ID IS NULL;

count(1)
2


In [0]:
%sql
SELECT * FROM tbl_NonNull_Nulls_Blank
WHERE OBJECT_ID IS NULL;

OBJECT_ID,Name,department,Customer_ID,Change_Date,Load_Date,Status,description,start_date_source,start_date_bronze
,Krishna,Sales,,2025-04-02T06:12:18Z,2025-06-02,,Manual data entry,1724256609000.0,
,,,,,,,,,


In [0]:
%sql
SELECT COUNT(*) AS null_count
FROM tbl_NonNull_Nulls_Blank
WHERE Name IS NULL;

null_count
3


**b) How to check for Non-NULL values in a column?**

In [0]:
%sql
SELECT COUNT(*) AS non_null_count
FROM tbl_NonNull_Nulls_Blank
WHERE Name IS NOT NULL;

non_null_count
14


- COUNT(Name) **ignores NULLs**, so this gives count of **non-NULL** values.

In [0]:
%sql
SELECT COUNT(Name) AS non_null_count
FROM tbl_NonNull_Nulls_Blank;

non_null_count
14


**c) Count of blank (empty string) values**

**Note:**
- This applies to **string/text** columns.
- Also, in some databases (like **Oracle**), **empty strings** are treated as **NULL**, so this may not apply the same way.

In [0]:
%sql
SELECT COUNT(*) AS blank_count
FROM tbl_NonNull_Nulls_Blank
WHERE Name = '';

blank_count
3


**d) Count of non-null and non-blank values**

In [0]:
%sql
SELECT COUNT(*) AS non_null_non_blank_count
FROM tbl_NonNull_Nulls_Blank
WHERE Name IS NOT NULL AND Name <> '';

non_null_non_blank_count
11


**e) Check for any of the specified columns is null's**
- This query returns all rows from the table non_null_non_blank_count where **any of the specified columns** (id, name, department, or Age) have a **NULL** value.

- The **OR** operators mean that if **even one of these columns** is **NULL** in a row, that row will be selected.

In [0]:
%sql
SELECT * FROM tbl_NonNull_Nulls_Blank
WHERE OBJECT_ID IS NULL OR
      Name IS NULL OR
      department IS NULL OR
      Customer_ID IS NULL OR
      Change_Date IS NULL OR
      Load_Date IS NULL OR
      Status IS NULL OR
      description IS NULL;

OBJECT_ID,Name,department,Customer_ID,Change_Date,Load_Date,Status,description,start_date_source,start_date_bronze
,Krishna,Sales,,2025-04-02T06:12:18Z,2025-06-02,,Manual data entry,1724256609000.0,
583181.0,Kiran,Marketing,39714449.0,,2025-06-02,Finished,Other,1724256609000.0,
506654.0,Basha,,,2025-04-02T04:15:05Z,2025-06-02,Not Relevant,Social Media,1724256609000.0,
583195.0,,Finance,12619703.0,,2025-06-02,Started,Messaging,1724256609000.0,
583069.0,Harish,,13681832.0,,2025-06-02,,E-Mail,1724256609000.0,
583151.0,Sushma,Accounts,40442877.0,,2025-06-02,Open,Fax,1724256609000.0,
583167.0,,Admin,16474490.0,2025-04-02T09:07:27Z,,Not Relevant,Feedback,1724256609000.0,
583119.0,Hitesh,,10183510.0,2025-04-02T04:15:13Z,,Open,Telephony,1724256609000.0,
577519.0,,Accounts,,2025-04-02T08:27:50Z,2025-06-02,Not Relevant,,1724256609000.0,
558253.0,,,2269299.0,,2025-06-02,Open,,1724256609000.0,


**f) How to find the number of rows where all columns are NULL?**

In [0]:
%sql
SELECT *
-- SELECT COUNT(*) AS all_null_rows
FROM tbl_NonNull_Nulls_Blank
WHERE OBJECT_ID IS NULL AND
      Name IS NULL AND
      department IS NULL AND
      Customer_ID IS NULL AND
      Change_Date IS NULL AND
      Load_Date IS NULL AND
      Status IS NULL AND
      description IS NULL;

OBJECT_ID,Name,department,Customer_ID,Change_Date,Load_Date,Status,description,start_date_source,start_date_bronze
,,,,,,,,,


**g) How to count the number of NULL values in a column?**

In [0]:
%sql
SELECT COUNT(CASE WHEN description IS NULL THEN 1 END) AS NullCount_description
FROM tbl_NonNull_Nulls_Blank;

NullCount_description
5


**h) All in one summary**

**COUNT(*):**
- Counts all rows.

**your_column IS NULL:**
- Checks for NULL values.

**your_column = '':**
- Checks for **blank/empty strings** (only relevant for text/varchar fields).

**your_column IS NOT NULL AND your_column <> '':**
- Counts values that are **not NULL and not blank**.

In [0]:
%sql
SELECT 
    COUNT(*) AS Total_Count,
    COUNT(CASE WHEN Name IS NULL THEN 1 END) AS Null_Count,
    COUNT(CASE WHEN Name IS NOT NULL THEN 1 END) AS Non_Null_Count,
    COUNT(CASE WHEN Name = '' THEN 1 END) AS Blank_Count,
    COUNT(CASE WHEN Name IS NOT NULL AND Name <> '' THEN 1 END) AS valid_non_blank_count,
    (COUNT(CASE WHEN Name IS NOT NULL THEN 1 END) - 
     COUNT(CASE WHEN Name = '' THEN 1 END)) AS Blank_Count_Difference,
    (COUNT(CASE WHEN Name IS NOT NULL THEN 1 END) - 
     COUNT(CASE WHEN Name IS NULL THEN 1 END)) AS Null_Count_Difference
FROM tbl_NonNull_Nulls_Blank;

Total_Count,Null_Count,Non_Null_Count,Blank_Count,valid_non_blank_count,Blank_Count_Difference,Null_Count_Difference
17,3,14,3,11,11,11


**Expression A:**

     COUNT(CASE WHEN name IS NOT NULL AND name <> '' THEN 1 END) AS valid_non_blank_count

This counts all rows where:
- **name** is **not NULL**, and
- **name** is **not a blank string ('')**.

✅ Not NULL and Not blank → **"valid non-blank values"**



**Expression B:**

     (COUNT(CASE WHEN name IS NOT NULL THEN 1 END) - COUNT(CASE WHEN name = '' THEN 1 END)) AS Blank_Count_Difference

- Counts all **non-NULL** values: **COUNT(CASE WHEN name IS NOT NULL THEN 1 END)**
- Then subtracts the **number of blank strings** (regardless of NULL status): **COUNT(CASE WHEN name = '' THEN 1 END)**


In [0]:
%sql
SELECT 
    COUNT(*) AS Total_Count,
    COUNT(CASE WHEN department IS NULL THEN 1 END) AS Null_Count,
    COUNT(CASE WHEN department IS NOT NULL THEN 1 END) AS Non_Null_Count,
    COUNT(CASE WHEN department = '' THEN 1 END) AS Blank_Count,
    COUNT(CASE WHEN department IS NOT NULL AND department <> '' THEN 1 END) AS valid_non_blank_count,
    (COUNT(CASE WHEN department IS NOT NULL THEN 1 END) - 
     COUNT(CASE WHEN department = '' THEN 1 END)) AS Blank_Count_Difference,
    (COUNT(CASE WHEN department IS NOT NULL THEN 1 END) - 
     COUNT(CASE WHEN department IS NULL THEN 1 END)) AS Null_Count_Difference
FROM tbl_NonNull_Nulls_Blank;

Total_Count,Null_Count,Non_Null_Count,Blank_Count,valid_non_blank_count,Blank_Count_Difference,Null_Count_Difference
17,4,13,1,12,12,9


In [0]:
%sql
SELECT 'OBJECT_ID' AS Column_Name,
    COUNT(*) AS Total_Count,
    COUNT(CASE WHEN OBJECT_ID IS NULL THEN 1 END) AS Null_Count,
    COUNT(CASE WHEN OBJECT_ID IS NOT NULL THEN 1 END) AS Non_Null_Count,
    COUNT(CASE WHEN OBJECT_ID = '' THEN 1 END) AS Blank_Count,
    COUNT(CASE WHEN OBJECT_ID IS NOT NULL AND OBJECT_ID <> '' THEN 1 END) AS Valid_Non_Blank_Count,
    (COUNT(CASE WHEN OBJECT_ID IS NOT NULL THEN 1 END) - COUNT(CASE WHEN OBJECT_ID = '' THEN 1 END)) AS Blank_Count_Difference,
    (COUNT(CASE WHEN OBJECT_ID IS NOT NULL THEN 1 END) - COUNT(CASE WHEN OBJECT_ID IS NULL THEN 1 END)) AS Null_Count_Difference
FROM tbl_NonNull_Nulls_Blank

UNION ALL

SELECT 'Name',
    COUNT(*),
    COUNT(CASE WHEN Name IS NULL THEN 1 END),
    COUNT(CASE WHEN Name IS NOT NULL THEN 1 END),
    COUNT(CASE WHEN Name = '' THEN 1 END),
    COUNT(CASE WHEN Name IS NOT NULL AND Name <> '' THEN 1 END),
    (COUNT(CASE WHEN Name IS NOT NULL THEN 1 END) - COUNT(CASE WHEN Name = '' THEN 1 END)),
    (COUNT(CASE WHEN Name IS NOT NULL THEN 1 END) - COUNT(CASE WHEN Name IS NULL THEN 1 END))
FROM tbl_NonNull_Nulls_Blank

UNION ALL

SELECT 'department',
    COUNT(*),
    COUNT(CASE WHEN department IS NULL THEN 1 END),
    COUNT(CASE WHEN department IS NOT NULL THEN 1 END),
    COUNT(CASE WHEN department = '' THEN 1 END),
    COUNT(CASE WHEN department IS NOT NULL AND department <> '' THEN 1 END),
    (COUNT(CASE WHEN department IS NOT NULL THEN 1 END) - COUNT(CASE WHEN department = '' THEN 1 END)),
    (COUNT(CASE WHEN department IS NOT NULL THEN 1 END) - COUNT(CASE WHEN department IS NULL THEN 1 END))
FROM tbl_NonNull_Nulls_Blank

UNION ALL

SELECT 'Customer_ID',
    COUNT(*),
    COUNT(CASE WHEN Customer_ID IS NULL THEN 1 END),
    COUNT(CASE WHEN Customer_ID IS NOT NULL THEN 1 END),
    COUNT(CASE WHEN Customer_ID = '' THEN 1 END),
    COUNT(CASE WHEN Customer_ID IS NOT NULL AND Customer_ID <> '' THEN 1 END),
    (COUNT(CASE WHEN Customer_ID IS NOT NULL THEN 1 END) - COUNT(CASE WHEN Customer_ID = '' THEN 1 END)),
    (COUNT(CASE WHEN Customer_ID IS NOT NULL THEN 1 END) - COUNT(CASE WHEN Customer_ID IS NULL THEN 1 END))
FROM tbl_NonNull_Nulls_Blank

UNION ALL

SELECT 'Change_Date',
    COUNT(*),
    COUNT(CASE WHEN Change_Date IS NULL THEN 1 END),
    COUNT(CASE WHEN Change_Date IS NOT NULL THEN 1 END),
    COUNT(CASE WHEN Change_Date = '' THEN 1 END),
    COUNT(CASE WHEN Change_Date IS NOT NULL AND Change_Date <> '' THEN 1 END),
    (COUNT(CASE WHEN Change_Date IS NOT NULL THEN 1 END) - COUNT(CASE WHEN Change_Date = '' THEN 1 END)),
    (COUNT(CASE WHEN Change_Date IS NOT NULL THEN 1 END) - COUNT(CASE WHEN Change_Date IS NULL THEN 1 END))
FROM tbl_NonNull_Nulls_Blank

UNION ALL

SELECT 'Load_Date',
    COUNT(*),
    COUNT(CASE WHEN Load_Date IS NULL THEN 1 END),
    COUNT(CASE WHEN Load_Date IS NOT NULL THEN 1 END),
    COUNT(CASE WHEN Load_Date = '' THEN 1 END),
    COUNT(CASE WHEN Load_Date IS NOT NULL AND Load_Date <> '' THEN 1 END),
    (COUNT(CASE WHEN Load_Date IS NOT NULL THEN 1 END) - COUNT(CASE WHEN Load_Date = '' THEN 1 END)),
    (COUNT(CASE WHEN Load_Date IS NOT NULL THEN 1 END) - COUNT(CASE WHEN Load_Date IS NULL THEN 1 END))
FROM tbl_NonNull_Nulls_Blank

UNION ALL

SELECT 'Status',
    COUNT(*),
    COUNT(CASE WHEN Status IS NULL THEN 1 END),
    COUNT(CASE WHEN Status IS NOT NULL THEN 1 END),
    COUNT(CASE WHEN Status = '' THEN 1 END),
    COUNT(CASE WHEN Status IS NOT NULL AND Status <> '' THEN 1 END),
    (COUNT(CASE WHEN Status IS NOT NULL THEN 1 END) - COUNT(CASE WHEN Status = '' THEN 1 END)),
    (COUNT(CASE WHEN Status IS NOT NULL THEN 1 END) - COUNT(CASE WHEN Status IS NULL THEN 1 END))
FROM tbl_NonNull_Nulls_Blank

UNION ALL

SELECT 'description',
    COUNT(*),
    COUNT(CASE WHEN description IS NULL THEN 1 END),
    COUNT(CASE WHEN description IS NOT NULL THEN 1 END),
    COUNT(CASE WHEN description = '' THEN 1 END),
    COUNT(CASE WHEN description IS NOT NULL AND description <> '' THEN 1 END),
    (COUNT(CASE WHEN description IS NOT NULL THEN 1 END) - COUNT(CASE WHEN description = '' THEN 1 END)),
    (COUNT(CASE WHEN description IS NOT NULL THEN 1 END) - COUNT(CASE WHEN description IS NULL THEN 1 END))
FROM tbl_NonNull_Nulls_Blank;


Column_Name,Total_Count,Null_Count,Non_Null_Count,Blank_Count,Valid_Non_Blank_Count,Blank_Count_Difference,Null_Count_Difference
OBJECT_ID,17,2,15,0,0,15,13
Name,17,3,14,3,11,11,11
department,17,4,13,1,12,12,9
Customer_ID,17,4,13,0,0,13,9
Change_Date,17,7,10,0,10,10,3
Load_Date,17,5,12,0,12,12,7
Status,17,3,14,0,14,14,11
description,17,5,12,0,12,12,7


**g) All in one summary using SUM(CASE … END)**

In [0]:
%sql
SELECT
  SUM(CASE WHEN department IS NULL THEN 1 ELSE 0 END) AS num_null,
  SUM(CASE WHEN department = '' THEN 1 ELSE 0 END) AS num_blank,
  SUM(CASE WHEN department IS NOT NULL 
            AND department <> '' THEN 1 ELSE 0 END) AS num_non_null_non_blank
FROM tbl_NonNull_Nulls_Blank;

num_null,num_blank,num_non_null_non_blank
4,1,12


**h) Grouping by status**

In [0]:
%sql
SELECT
  CASE
    WHEN Name IS NULL THEN 'NULL'
    WHEN Name = '' THEN 'BLANK'
    ELSE 'NON-NULL/NON-BLANK'
  END AS status,
  COUNT(*) AS cnt
FROM tbl_NonNull_Nulls_Blank
GROUP BY
  CASE
    WHEN Name IS NULL THEN 'NULL'
    WHEN Name = '' THEN 'BLANK'
    ELSE 'NON-NULL/NON-BLANK'
  END;

status,cnt
NON-NULL/NON-BLANK,11
,3
BLANK,3
