# Data Quality Assessment 

In [2]:
# necessary to import db_connector script
import sys
import os

# Resolve the parent of the current working directory; this assumes the kernel
# was started one level below the project root (TODO confirm for other launch dirs).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Put the project root at the front of the module search path, but only once,
# so that re-running this cell does not keep prepending duplicates.
if not any(entry == project_root for entry in sys.path):
    sys.path.insert(0, project_root)

In [6]:
# import required libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from db_connector import load_from_excel

In [9]:
# Load the source tables via the project's db_connector helper. Later cells index
# the result by table name: 'requisitions', 'candidate', 'candidate_status', 'department'.
data = load_from_excel()

In [7]:
# Notebook-wide pandas display settings: no column limit, at most 20 rows
# per rendered frame, and a 1000-character wide text representation.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 20),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

In [8]:
# Defines data quality assessment function:

def assess_data_quality(df, table_name):
    print(f"\n{'='*30} {table_name.upper()} DATA QUALITY {'='*30}")

    df = df.copy()

    # Convert date columns to datetime at the beginning
    date_cols = [col for col in df.columns if 'DATE' in col]
    for col in date_cols:
        if df[col].dtype != 'datetime64[ns]':
            df[col] = pd.to_datetime(df[col], errors='coerce')
            print(f"Converted {col} to datetime")

    # 1. Check for duplicates
    duplicates = df.duplicated().sum()
    print(f"\n1. Duplicates: {duplicates} ({duplicates/len(df)*100:.2f}%)")
    
    # 2. Check for missing values
    missing = df.isnull().sum()
    missing_pct = df.isnull().sum() / len(df) * 100
    missing_info = pd.DataFrame({
        'Missing Values': missing,
        'Percentage': missing_pct
    })
    print("\n2. Missing Values:")
    display(missing_info[missing_info['Missing Values'] > 0].sort_values('Missing Values', ascending=False))
    
    # 3. Check data types
    print("\n3. Data Types:")
    display(pd.DataFrame(df.dtypes, columns=['Data Type']))
    
    # 4. Specific checks based on table
    print("\n4. Specific Quality Checks:")

    if table_name == 'requisitions':
        # a. Check for requisitions with close date before open date
        if 'OPEN_DATE' in df.columns and 'CLOSE_DATE' in df.columns:
            invalid_dates = df[df['CLOSE_DATE'].notna() & (df['CLOSE_DATE'] < df['OPEN_DATE'])]
            print(f"   - Requisitions with close date before open date: {len(invalid_dates)}")
            if len(invalid_dates) > 0:
                display(invalid_dates.head())
        
        # b. Check for unusual number of openings
        if 'NUMBER_OF_OPENINGS' in df.columns:
            unusual_openings = df[df['NUMBER_OF_OPENINGS'] > 10]  
            print(f"   - Requisitions with more than 10 openings: {len(unusual_openings)}")
            if len(unusual_openings) > 0:
                value_counts = df['NUMBER_OF_OPENINGS'].value_counts().sort_index()
                display(value_counts)

            # outlier detection for NUMBER_OF_OPENINGS
            print("\n   - Outlier detection for NUMBER_OF_OPENINGS:")
            q1 = df['NUMBER_OF_OPENINGS'].quantile(0.25)
            q3 = df['NUMBER_OF_OPENINGS'].quantile(0.75)
            iqr = q3 - q1
            upper_bound = q3 + 1.5 * iqr
            
            outliers = df[df['NUMBER_OF_OPENINGS'] > upper_bound]
            print(f"     Outliers (>{upper_bound:.1f} openings): {len(outliers)} ({len(outliers)/len(df)*100:.2f}%)")
            
        # c. Check for time-to-fill outliers 
        if 'OPEN_DATE' in df.columns and 'CLOSE_DATE' in df.columns:
            # Convert to datetime if needed
            if df['OPEN_DATE'].dtype != 'datetime64[ns]':
                df['OPEN_DATE'] = pd.to_datetime(df['OPEN_DATE'], errors='coerce')
            if df['CLOSE_DATE'].dtype != 'datetime64[ns]':
                df['CLOSE_DATE'] = pd.to_datetime(df['CLOSE_DATE'], errors='coerce')
                
            # Calculate time to fill for closed requisitions
            closed_reqs = df[df['CLOSE_DATE'].notna()].copy()
            if not closed_reqs.empty:
                closed_reqs['time_to_fill'] = (closed_reqs['CLOSE_DATE'] - closed_reqs['OPEN_DATE']).dt.days
                
                # Check for outliers
                print("\n   - Outlier detection for time-to-fill:")
                q1 = closed_reqs['time_to_fill'].quantile(0.25)
                q3 = closed_reqs['time_to_fill'].quantile(0.75)
                iqr = q3 - q1
                lower_bound = max(0, q1 - 1.5 * iqr)  # Can't be negative
                upper_bound = q3 + 1.5 * iqr
                
                lower_outliers = closed_reqs[closed_reqs['time_to_fill'] < lower_bound]
                upper_outliers = closed_reqs[closed_reqs['time_to_fill'] > upper_bound]
                
                print(f"     Lower outliers (<{lower_bound:.1f} days): {len(lower_outliers)} ({len(lower_outliers)/len(closed_reqs)*100:.2f}%)")
                print(f"     Upper outliers (>{upper_bound:.1f} days): {len(upper_outliers)} ({len(upper_outliers)/len(closed_reqs)*100:.2f}%)")
                
                print("\n     Time-to-fill statistics (days):")
                display(closed_reqs['time_to_fill'].describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95]))

    elif table_name == 'candidate':
        # a. Check for candidates with status dates out of order
        date_cols = [col for col in df.columns if 'DATE' in col and col != 'LAST_MODIFIED_DATE']
        for i in range(len(date_cols)-1):
            for j in range(i+1, len(date_cols)):
                col1, col2 = date_cols[i], date_cols[j]
                invalid_dates = df[(df[col1].notna()) & (df[col2].notna()) & (df[col2] < df[col1])]
                if len(invalid_dates) > 0:
                    print(f"   - Records with {col2} before {col1}: {len(invalid_dates)}")
        
        # b. Check for missing candidate IDs
        if 'CANDIDATE_ID' in df.columns:
            missing_ids = df[df['CANDIDATE_ID'].isna()]
            print(f"   - Records with missing candidate IDs: {len(missing_ids)}")
            
        # c. Check for invalid statuses
        if 'CANDIDATE_HISTORICAL_STATUS' in df.columns and 'candidate_status' in data:
            valid_statuses = set(data['candidate_status']['CANDIDATE_HISTORICAL_STATUS'])
            invalid_statuses = df[~df['CANDIDATE_HISTORICAL_STATUS'].isin(valid_statuses)]
            print(f"   - Records with invalid status values: {len(invalid_statuses)}")
            if len(invalid_statuses) > 0:
                display(invalid_statuses['CANDIDATE_HISTORICAL_STATUS'].value_counts())

        # d. Add outlier detection for submission-to-interview time 
        if 'SUBMISSION_DATE' in df.columns and 'INTERVIEW_DATE' in df.columns:

            if df['SUBMISSION_DATE'].dtype != 'datetime64[ns]':
                df['SUBMISSION_DATE'] = pd.to_datetime(df['SUBMISSION_DATE'], errors='coerce')
            if df['INTERVIEW_DATE'].dtype != 'datetime64[ns]':
                df['INTERVIEW_DATE'] = pd.to_datetime(df['INTERVIEW_DATE'], errors='coerce')
            
            sub_to_int = df[(df['SUBMISSION_DATE'].notna()) & (df['INTERVIEW_DATE'].notna())].copy()
            if not sub_to_int.empty:
                sub_to_int['days_to_interview'] = (sub_to_int['INTERVIEW_DATE'] - sub_to_int['SUBMISSION_DATE']).dt.days
                sub_to_int = sub_to_int[sub_to_int['days_to_interview'] >= 0]  # Filter out negative values (errors)
                
                print("\n   - Outlier detection for submission-to-interview time:")
                q1 = sub_to_int['days_to_interview'].quantile(0.25)
                q3 = sub_to_int['days_to_interview'].quantile(0.75)
                iqr = q3 - q1
                upper_bound = q3 + 1.5 * iqr
                
                upper_outliers = sub_to_int[sub_to_int['days_to_interview'] > upper_bound]
                print(f"     Upper outliers (>{upper_bound:.1f} days): {len(upper_outliers)} ({len(upper_outliers)/len(sub_to_int)*100:.2f}%)")
                
                print("\n     Submission-to-interview time statistics (days):")
                display(sub_to_int['days_to_interview'].describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95]))

        # e. Add outlier detection for interview-to-hire time 
        if 'INTERVIEW_DATE' in df.columns and 'HIRED_DATE' in df.columns:
            # Convert to datetime if needed
            if df['INTERVIEW_DATE'].dtype != 'datetime64[ns]':
                df['INTERVIEW_DATE'] = pd.to_datetime(df['INTERVIEW_DATE'], errors='coerce')
            if df['HIRED_DATE'].dtype != 'datetime64[ns]':
                df['HIRED_DATE'] = pd.to_datetime(df['HIRED_DATE'], errors='coerce')
            
            int_to_hire = df[(df['INTERVIEW_DATE'].notna()) & (df['HIRED_DATE'].notna())].copy()
            if not int_to_hire.empty:
                int_to_hire['days_to_hire'] = (int_to_hire['HIRED_DATE'] - int_to_hire['INTERVIEW_DATE']).dt.days
                int_to_hire = int_to_hire[int_to_hire['days_to_hire'] >= 0]  # Filter out negative values (errors)
                
                print("\n   - Outlier detection for interview-to-hire time:")
                q1 = int_to_hire['days_to_hire'].quantile(0.25)
                q3 = int_to_hire['days_to_hire'].quantile(0.75)
                iqr = q3 - q1
                upper_bound = q3 + 1.5 * iqr
                
                upper_outliers = int_to_hire[int_to_hire['days_to_hire'] > upper_bound]
                print(f"     Upper outliers (>{upper_bound:.1f} days): {len(upper_outliers)} ({len(upper_outliers)/len(int_to_hire)*100:.2f}%)")
                
                print("\n     Interview-to-hire time statistics (days):")
                display(int_to_hire['days_to_hire'].describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95]))
    
    elif table_name == 'department':
        # a. Check for departments that are their own parent
        if 'DEPARTMENT_ID' in df.columns and 'PARENT_DEPARTMENT_ID' in df.columns:
            self_parent = df[df['DEPARTMENT_ID'] == df['PARENT_DEPARTMENT_ID']]
            print(f"   - Departments that are their own parent: {len(self_parent)}")
            if len(self_parent) > 0:
                display(self_parent.head())
            
        # b. Check for consistency in naming patterns
        if 'DEPARTMENT_NAME' in df.columns:
            missing_sd = df[~df['DEPARTMENT_NAME'].str.contains('- SD')]
            print(f"   - Departments without '- SD' in name: {len(missing_sd)}")
            if len(missing_sd) > 0:
                display(missing_sd['DEPARTMENT_NAME'].head())
                         
    elif table_name == 'candidate_status':
        # Check for duplicate status values
        if 'CANDIDATE_HISTORICAL_STATUS' in df.columns:
            dup_status = df['CANDIDATE_HISTORICAL_STATUS'].duplicated().sum()
            print(f"   - Duplicate status values: {dup_status}")
            
        # Check for missing stage mappings
        if 'CANDIDATE_STAGE' in df.columns:
            missing_stage = df[df['CANDIDATE_STAGE'].isna()]
            print(f"   - Statuses without stage mapping: {len(missing_stage)}")

In [17]:
# Each pipeline should have a logical order of candidate statuses 
# e.g. New Submission -> In Review -> Interview -> Offer -> Hired
# Or: New Submission -> In Review -> Rejected
# NOT: Interview -> New Submission -> Rejected -> Offer

def check_candidate_status_logic(df, requisitions=None):
    """Check that each candidate pipeline ends and progresses in a sensible way.

    A pipeline is one (REQUISITION_ID, CANDIDATE_ID) pair; rows missing either
    key are ignored. Two things are reported:

    * the final status of every pipeline, flagging pipelines on closed
      requisitions that do not end in Hired / Rejected / Closed;
    * pipelines where an early-stage status starts after a late-stage status.

    The input frame is not modified. Relies on IPython's ``display``.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate status history with REQUISITION_ID, CANDIDATE_ID,
        CANDIDATE_HISTORICAL_STATUS and HISTORICAL_STATUS_START_DATE columns.
    requisitions : pandas.DataFrame, optional
        Requisition table with REQUISITION_ID and STATUS_IN columns. Defaults
        to ``data['requisitions']`` from the notebook namespace.

    Returns
    -------
    dict
        ``final_status_counts`` (Series), ``unexpected_final_count_closed_reqs``
        (int) and ``problematic_sequence_count`` (int).
    """
    print("\n" + "="*30 + " CANDIDATE STATUS LOGIC CHECKS " + "="*30)

    pipeline_keys = ['REQUISITION_ID', 'CANDIDATE_ID']
    date_col = 'HISTORICAL_STATUS_START_DATE'
    status_col = 'CANDIDATE_HISTORICAL_STATUS'

    if requisitions is None:
        # Backwards compatibility: earlier versions always read the global `data`.
        requisitions = data['requisitions']

    # Work on a copy so the date conversion does not leak into the caller's frame.
    df = df.copy()
    if date_col in df.columns and not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    # Get active vs. closed requisitions
    active_reqs = set(requisitions.loc[requisitions['STATUS_IN'] == 'Open', 'REQUISITION_ID'])
    closed_reqs = set(requisitions.loc[requisitions['STATUS_IN'] == 'Closed', 'REQUISITION_ID'])

    print(f"Active requisitions: {len(active_reqs)}")
    print(f"Closed requisitions: {len(closed_reqs)}")

    # Rows without a complete pipeline key cannot be assigned to a pipeline.
    pipelines = df.dropna(subset=pipeline_keys)

    # Last status per pipeline: order by start date and keep the final row of each
    # key pair. NaT is sorted first so an undated row is never taken as the latest.
    ordered = pipelines.sort_values(date_col, kind='stable', na_position='first')
    last_statuses = ordered.drop_duplicates(subset=pipeline_keys, keep='last').copy()

    # Add flag for active vs. closed requisitions ('Active' wins if an ID is in both)
    requisition_ids = last_statuses['REQUISITION_ID']
    last_statuses['REQUISITION_STATUS'] = 'Unknown'
    last_statuses.loc[requisition_ids.isin(closed_reqs), 'REQUISITION_STATUS'] = 'Closed'
    last_statuses.loc[requisition_ids.isin(active_reqs), 'REQUISITION_STATUS'] = 'Active'

    # Check distribution of final statuses
    print("\nDistribution of final candidate statuses:")
    final_status_counts = last_statuses[status_col].value_counts()
    display(final_status_counts)

    # Identify potentially problematic final statuses for CLOSED requisitions only
    expected_final_statuses = ['Hired', 'Rejected', 'Closed']
    closed_req_pipelines = last_statuses[last_statuses['REQUISITION_STATUS'] == 'Closed']
    unexpected_final = closed_req_pipelines[~closed_req_pipelines[status_col].isin(expected_final_statuses)]

    n_closed = len(closed_req_pipelines)
    unexpected_pct = len(unexpected_final) / n_closed * 100 if n_closed else 0.0
    print(f"\nClosed requisition pipelines not ending with expected final status (Hired/Rejected/Closed): {len(unexpected_final)} ({unexpected_pct:.2f}%)")

    if len(unexpected_final) > 0:
        print("\nTop unusual final statuses for closed requisitions:")
        display(unexpected_final[status_col].value_counts().head(10))

        print("\nSample of closed requisition pipelines with unusual final status:")
        display(unexpected_final.head())

    # Check for logical progression in status
    print("\nChecking for status sequence anomalies...")

    # Simplified expected progression: early stages should not start after a late one.
    # Interview-type statuses sit in between and are not part of this check.
    early_stages = ['New Submission', 'In Review']
    late_stages = ['Offer', 'Hired', 'Rejected', 'Closed']

    # A pipeline is anomalous when some early-stage row starts after some late-stage
    # row, i.e. when its latest early-stage date is after its earliest late-stage date.
    # Pipelines lacking either kind of row yield NaT, and NaT comparisons are False.
    stage_dates = pipelines[pipeline_keys].assign(
        first_late=pipelines[date_col].where(pipelines[status_col].isin(late_stages)),
        last_early=pipelines[date_col].where(pipelines[status_col].isin(early_stages)),
    )
    bounds = stage_dates.groupby(pipeline_keys).agg(
        first_late=('first_late', 'min'),
        last_early=('last_early', 'max'),
    )
    problematic_pipelines = bounds['last_early'] > bounds['first_late']
    problem_count = int(problematic_pipelines.sum())

    n_pipelines = len(problematic_pipelines)
    problem_pct = problem_count / n_pipelines * 100 if n_pipelines else 0.0
    print(f"\nPipelines with illogical status sequences: {problem_count} ({problem_pct:.2f}%)")

    if problem_count > 0:
        print("\nSample problematic pipelines:")
        sample_problems = problematic_pipelines[problematic_pipelines].index[:5]

        for req_id, cand_id in sample_problems:
            print(f"\nRequisition {req_id}, Candidate {cand_id}:")
            seq = df[(df['REQUISITION_ID'] == req_id) & (df['CANDIDATE_ID'] == cand_id)]
            display(seq[[status_col, date_col]].sort_values(date_col))

    return {
        'final_status_counts': final_status_counts,
        'unexpected_final_count_closed_reqs': len(unexpected_final),
        'problematic_sequence_count': problem_count
    }

In [19]:
# Assessing requisitions table: runs the generic checks (duplicates, missing values,
# dtypes) plus the requisitions-specific checks; the report is printed/displayed below.

requisitions_quality = assess_data_quality(data['requisitions'], 'requisitions')



1. Duplicates: 0 (0.00%)

2. Missing Values:


Unnamed: 0,Missing Values,Percentage
RECRUITER,1540,30.646766
CLOSE_DATE,296,5.890547



3. Data Types:


Unnamed: 0,Data Type
REQUISITION_ID,int64
REQUISITION_UID,int64
STATUS_IN,object
OPEN_DATE,datetime64[ns]
CLOSE_DATE,datetime64[ns]
NUMBER_OF_OPENINGS,int64
DEPARTMENT_ID,int64
DEPARTMENT_NAME,object
RECRUITER_ID,int64
RECRUITER,object



4. Specific Quality Checks:
   - Requisitions with close date before open date: 0
   - Requisitions with more than 10 openings: 25


NUMBER_OF_OPENINGS
1     4028
2      605
3      186
4       82
5       43
6       24
7        7
8       11
9        1
10      13
11       1
12       2
13       2
14       2
15       9
18       1
20       2
22       1
23       4
30       1
Name: count, dtype: int64

In [20]:
# Assessing candidate table: first the generic and candidate-specific quality report,
# then the pipeline status-logic checks, which also look up requisition status
# in data['requisitions'].


candidate_quality = assess_data_quality(data['candidate'], 'candidate')
candidate_status_logic = check_candidate_status_logic(data['candidate'])



1. Duplicates: 0 (0.00%)

2. Missing Values:


Unnamed: 0,Missing Values,Percentage
HISTORICAL_STATUS_END_DATE,168230,27.323061
CANDIDATE_ID,17,0.002761



3. Data Types:


Unnamed: 0,Data Type
REQUISITION_ID,int64
PIPELINE_ID,int64
SUBMISSION_DATE,object
CANDIDATE_ID,float64
SUBMISSION_SOURCE,object
CANDIDATE_HISTORICAL_STATUS,object
HISTORICAL_STATUS_START_DATE,datetime64[ns]
HISTORICAL_STATUS_END_DATE,datetime64[ns]
LAST_MODIFIED_DATE,datetime64[ns]



4. Specific Quality Checks:


TypeError: '<' not supported between instances of 'float' and 'str'

In [None]:
# Assessing candidate status table (the status lookup): generic checks plus
# duplicate status values and statuses without a stage mapping.

candidate_status_quality = assess_data_quality(data['candidate_status'], 'candidate_status')

In [None]:
# Assessing department table: generic checks plus self-parent departments and
# names that lack the '- SD' marker.

department_quality = assess_data_quality(data['department'], 'department')