# Data Quality Assessment 

In [2]:
# necessary to import db_connector script
import sys
import os

# Get the absolute path of the project root
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add project root to sys.path
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [6]:
# import required libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from db_connector import load_from_excel

In [9]:
data = load_from_excel()

In [7]:
# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.width', 1000)

In [8]:
# Defines data quality assessment function:

def assess_data_quality(df, table_name):
    print(f"\n{'='*30} {table_name.upper()} DATA QUALITY {'='*30}")

    # 1. Check for duplicates
    duplicates = df.duplicated().sum()
    print(f"\n1. Duplicates: {duplicates} ({duplicates/len(df)*100:.2f}%)")
    
    # 2. Check for missing values
    missing = df.isnull().sum()
    missing_pct = df.isnull().sum() / len(df) * 100
    missing_info = pd.DataFrame({
        'Missing Values': missing,
        'Percentage': missing_pct
    })
    print("\n2. Missing Values:")
    display(missing_info[missing_info['Missing Values'] > 0].sort_values('Missing Values', ascending=False))
    
    # 3. Check data types
    print("\n3. Data Types:")
    display(pd.DataFrame(df.dtypes, columns=['Data Type']))
    
    # 4. Specific checks based on table
    print("\n4. Specific Quality Checks:")

    if table_name == 'requisitions':
        # a. Check for requisitions with close date before open date
        if 'OPEN_DATE' in df.columns and 'CLOSE_DATE' in df.columns:
            invalid_dates = df[df['CLOSE_DATE'].notna() & (df['CLOSE_DATE'] < df['OPEN_DATE'])]
            print(f"   - Requisitions with close date before open date: {len(invalid_dates)}")
            if len(invalid_dates) > 0:
                display(invalid_dates.head())
        
        # b. Check for unusual number of openings
        if 'NUMBER_OF_OPENINGS' in df.columns:
            unusual_openings = df[df['NUMBER_OF_OPENINGS'] > 10]  
            print(f"   - Requisitions with more than 10 openings: {len(unusual_openings)}")
            if len(unusual_openings) > 0:
                value_counts = df['NUMBER_OF_OPENINGS'].value_counts().sort_index()
                display(value_counts)
            
    elif table_name == 'candidate':
        # a. Check for candidates with status dates out of order
        date_cols = [col for col in df.columns if 'DATE' in col and col != 'LAST_MODIFIED_DATE']
        for i in range(len(date_cols)-1):
            for j in range(i+1, len(date_cols)):
                col1, col2 = date_cols[i], date_cols[j]
                invalid_dates = df[(df[col1].notna()) & (df[col2].notna()) & (df[col2] < df[col1])]
                if len(invalid_dates) > 0:
                    print(f"   - Records with {col2} before {col1}: {len(invalid_dates)}")
        
        # b. Check for missing candidate IDs
        if 'CANDIDATE_ID' in df.columns:
            missing_ids = df[df['CANDIDATE_ID'].isna()]
            print(f"   - Records with missing candidate IDs: {len(missing_ids)}")
            
        # c. Check for invalid statuses
        if 'CANDIDATE_HISTORICAL_STATUS' in df.columns and 'candidate_status' in data:
            valid_statuses = set(data['candidate_status']['CANDIDATE_HISTORICAL_STATUS'])
            invalid_statuses = df[~df['CANDIDATE_HISTORICAL_STATUS'].isin(valid_statuses)]
            print(f"   - Records with invalid status values: {len(invalid_statuses)}")
            if len(invalid_statuses) > 0:
                display(invalid_statuses['CANDIDATE_HISTORICAL_STATUS'].value_counts())
    
    elif table_name == 'department':
        # a. Check for departments that are their own parent
        if 'DEPARTMENT_ID' in df.columns and 'PARENT_DEPARTMENT_ID' in df.columns:
            self_parent = df[df['DEPARTMENT_ID'] == df['PARENT_DEPARTMENT_ID']]
            print(f"   - Departments that are their own parent: {len(self_parent)}")
            if len(self_parent) > 0:
                display(self_parent.head())
            
        # b. Check for consistency in naming patterns
        if 'DEPARTMENT_NAME' in df.columns:
            missing_sd = df[~df['DEPARTMENT_NAME'].str.contains('- SD')]
            print(f"   - Departments without '- SD' in name: {len(missing_sd)}")
            if len(missing_sd) > 0:
                display(missing_sd['DEPARTMENT_NAME'].head())
                
            
    elif table_name == 'candidate_status':
        # Check for duplicate status values
        if 'CANDIDATE_HISTORICAL_STATUS' in df.columns:
            dup_status = df['CANDIDATE_HISTORICAL_STATUS'].duplicated().sum()
            print(f"   - Duplicate status values: {dup_status}")
            
        # Check for missing stage mappings
        if 'CANDIDATE_STAGE' in df.columns:
            missing_stage = df[df['CANDIDATE_STAGE'].isna()]
            print(f"   - Statuses without stage mapping: {len(missing_stage)}")

In [10]:
# Each pipeline should have a logical order of candidate statuses 
# e.g. New Submission -> In Review -> Interview -> Offer -> Hired
# Or: New Submission -> In Review -> Rejected
# NOT: Interview -> New Submission -> Rejected -> Offer

def check_candidate_status_logic(df):
    print("\n" + "="*30 + " CANDIDATE STATUS LOGIC CHECKS " + "="*30)

    # First, ensure we have datetime for status dates
    if 'HISTORICAL_STATUS_START_DATE' in df.columns:
        if df['HISTORICAL_STATUS_START_DATE'].dtype != 'datetime64[ns]':
            df['HISTORICAL_STATUS_START_DATE'] = pd.to_datetime(df['HISTORICAL_STATUS_START_DATE'], errors='coerce')

    # Group by requisition and candidate to analyse each pipeline
    pipeline_groups = df.groupby(['REQUISITION_ID', 'CANDIDATE_ID'])

    # Get the last status for each pipeline
    last_statuses = pipeline_groups.apply(lambda g: g.sort_values('HISTORICAL_STATUS_START_DATE').iloc[-1])

    # Check distribution of final statuses
    print("\nDistribution of final candidate statuses:")
    final_status_counts = last_statuses['CANDIDATE_HISTORICAL_STATUS'].value_counts()
    display(final_status_counts)

    # Identify potentially problematic final statuses
    expected_final_statuses = ['Hired', 'Rejected', 'Closed']
    unexpected_final = last_statuses[~last_statuses['CANDIDATE_HISTORICAL_STATUS'].isin(expected_final_statuses)]
    
    print(f"\nPipelines not ending with expected final status (Hired/Rejected/Closed): {len(unexpected_final)} ({len(unexpected_final)/len(last_statuses)*100:.2f}%)")
    
    if len(unexpected_final) > 0:
        print("\nTop unusual final statuses:")
        display(unexpected_final['CANDIDATE_HISTORICAL_STATUS'].value_counts().head(10))
        
        print("\nSample of pipelines with unusual final status:")
        display(unexpected_final.head())

In [12]:
check_candidate_status_logic(data['candidate'])



Distribution of final candidate statuses:


  last_statuses = pipeline_groups.apply(lambda g: g.sort_values('HISTORICAL_STATUS_START_DATE').iloc[-1])


CANDIDATE_HISTORICAL_STATUS
Closed               156967
Hired                  4513
In Review              4236
New Submission         2174
Assessment Centre       186
Rejected                119
Pre Offer xxx           105
First Interview          73
Phone Interview          72
Second Interview         52
Offer                    37
Interview                 7
Right to Work             5
Name: count, dtype: int64


Pipelines not ending with expected final status (Hired/Rejected/Closed): 6947 (4.12%)

Top unusual final statuses:


CANDIDATE_HISTORICAL_STATUS
In Review            4236
New Submission       2174
Assessment Centre     186
Pre Offer xxx         105
First Interview        73
Phone Interview        72
Second Interview       52
Offer                  37
Interview               7
Right to Work           5
Name: count, dtype: int64


Sample of pipelines with unusual final status:


Unnamed: 0_level_0,Unnamed: 1_level_0,REQUISITION_ID,PIPELINE_ID,SUBMISSION_DATE,CANDIDATE_ID,SUBMISSION_SOURCE,CANDIDATE_HISTORICAL_STATUS,HISTORICAL_STATUS_START_DATE,HISTORICAL_STATUS_END_DATE,LAST_MODIFIED_DATE
REQUISITION_ID,CANDIDATE_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
219020,4542201.0,219020,5551981,2023-01-02 10:20:16,4542201.0,URL_p_Indeed Organic,Offer,2023-01-31 09:47:52,2023-01-31 12:01:10,2023-01-31
250080,6239776.0,250080,6293748,2023-04-14 03:07:48,6239776.0,URL_p_Indeed Organic,New Submission,2023-04-14 03:07:48,NaT,2023-04-14
256021,4689810.0,256021,5662448,2023-01-11 19:58:13,4689810.0,URL_p_Indeed Organic,Pre Offer xxx,2023-01-12 14:22:37,NaT,2023-01-12
325363,5629238.0,325363,6072071,2023-03-08 17:05:33,5629238.0,URL_p_Indeed Organic,New Submission,2023-03-08 17:05:33,NaT,2023-03-08
330742,4791189.0,330742,5726903,2023-01-20 22:53:03,4791189.0,Superdrug - Mobile Friendly - 050918,New Submission,2023-01-20 22:53:03,NaT,2023-01-20
