# Initial Data Exploration

In [1]:
# necessary to import db_connector script
import sys
import os

# Get the absolute path of the project root
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add project root to sys.path
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from db_connector import load_from_excel

In [3]:
# Set display options for better visibility
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

In [4]:
# Load the data
data = load_from_excel()

In [5]:
# Examine each dataset
for name, df in data.items():
    print(f"\n\n{'='*50}")
    print(f" {name.upper()} DATASET ")
    print(f"{'='*50}")
    
    # Basic information
    print(f"\nShape: {df.shape}")
    print("\nFirst 5 rows:")
    display(df.head())
    
    # Column information
    print("\nColumn information:")
    for col in df.columns:
        non_null = df[col].count()
        dtype = df[col].dtype
        unique = df[col].nunique()
        print(f"- {col}: {non_null} non-null values, {dtype}, {unique} unique values")
    
    # Summary statistics for numeric columns
    if df.select_dtypes(include=['number']).shape[1] > 0:
        print("\nNumeric column statistics:")
        display(df.describe())



 REQUISITIONS DATASET 

Shape: (5025, 11)

First 5 rows:


Unnamed: 0,REQUISITION_ID,REQUISITION_UID,STATUS_IN,OPEN_DATE,CLOSE_DATE,NUMBER_OF_OPENINGS,DEPARTMENT_ID,DEPARTMENT_NAME,RECRUITER_ID,RECRUITER,LAST_MODIFIED_DATE
0,223060,131427,Closed,2022-10-11 14:20:49,2023-01-03 16:11:06,8,43100,1479 - SD,2921787,M.I,2023-01-03
1,225225,131002,Closed,2022-10-07 16:09:56,2023-01-03 09:24:23,3,42318,1401 - SD,2734691,I.R,2023-01-03
2,300767,137496,Closed,2022-11-22 10:57:48,2023-01-03 12:53:07,1,41891,0380 - SD,2020433,B.S,2023-01-03
3,300803,137570,Closed,2022-11-22 15:07:08,2023-01-03 09:21:54,6,42845,1810 - SD,2734691,I.R,2023-01-03
4,306549,137881,Closed,2022-11-25 16:49:05,2023-01-03 09:25:43,2,41687,1329 - SD,2734691,I.R,2023-01-03



Column information:
- REQUISITION_ID: 5025 non-null values, int64, 4853 unique values
- REQUISITION_UID: 5025 non-null values, int64, 4853 unique values
- STATUS_IN: 5025 non-null values, object, 2 unique values
- OPEN_DATE: 5025 non-null values, datetime64[ns], 4852 unique values
- CLOSE_DATE: 4729 non-null values, datetime64[ns], 4729 unique values
- NUMBER_OF_OPENINGS: 5025 non-null values, int64, 20 unique values
- DEPARTMENT_ID: 5025 non-null values, int64, 315 unique values
- DEPARTMENT_NAME: 5025 non-null values, object, 314 unique values
- RECRUITER_ID: 5025 non-null values, int64, 37 unique values
- RECRUITER: 3485 non-null values, object, 35 unique values
- LAST_MODIFIED_DATE: 5025 non-null values, datetime64[ns], 619 unique values

Numeric column statistics:


Unnamed: 0,REQUISITION_ID,REQUISITION_UID,OPEN_DATE,CLOSE_DATE,NUMBER_OF_OPENINGS,DEPARTMENT_ID,RECRUITER_ID,LAST_MODIFIED_DATE
count,5025.0,5025.0,5025,4729,5025.0,5025.0,5025.0,5025
mean,882313.7,185378.702289,2024-01-13 01:50:32.395422976,2024-01-22 02:26:45.913723904,1.429851,45876.489154,2183736.0,2024-02-11 07:39:39.223880448
min,136083.0,28247.0,2020-07-29 09:07:59,2023-01-03 09:11:38,1.0,40693.0,1601769.0,2023-01-03 00:00:00
25%,543332.0,162998.0,2023-06-21 10:10:37,2023-07-13 08:31:22,1.0,41829.0,1613171.0,2023-07-24 00:00:00
50%,1002463.0,186604.0,2024-01-08 15:15:47,2023-12-15 14:02:32,1.0,42049.0,1622345.0,2024-02-13 00:00:00
75%,1245812.0,210208.0,2024-08-20 14:36:08,2024-08-06 15:17:50,1.0,42366.0,2699752.0,2024-09-13 00:00:00
max,1413645.0,225961.0,2025-02-14 15:51:29,2025-02-14 21:14:36,30.0,110251.0,8811611.0,2025-02-20 00:00:00
std,356719.5,26987.396638,,,1.500764,14026.787975,986492.1,




 CANDIDATE DATASET 

Shape: (615707, 9)

First 5 rows:


Unnamed: 0,REQUISITION_ID,PIPELINE_ID,SUBMISSION_DATE,CANDIDATE_ID,SUBMISSION_SOURCE,CANDIDATE_HISTORICAL_STATUS,HISTORICAL_STATUS_START_DATE,HISTORICAL_STATUS_END_DATE,LAST_MODIFIED_DATE
0,278292,5538745,2023-01-01 18:26:13,4524912.0,URL_p_Indeed Organic,New Submission,2023-01-01 18:26:13,2023-01-06 11:51:51,2023-01-06
1,278292,5538745,2023-01-01 18:26:13,4524912.0,URL_p_Indeed Organic,Rejected,2023-01-06 11:51:51,2023-01-06 11:52:30,2023-01-06
2,278292,5538745,2023-01-01 18:26:13,4524912.0,URL_p_Indeed Organic,Pre Offer xxx,2023-01-06 11:51:51,2023-01-06 11:52:30,2023-01-06
3,278292,5538745,2023-01-01 18:26:13,4524912.0,URL_p_Indeed Organic,Closed,2023-01-06 11:52:30,NaT,2023-01-06
4,291608,5538752,2023-01-01 14:58:13,4524783.0,URL_p_Indeed Organic,New Submission,2023-01-01 14:58:13,2023-01-04 09:34:41,2023-01-04



Column information:
- REQUISITION_ID: 615707 non-null values, int64, 4624 unique values
- PIPELINE_ID: 615707 non-null values, int64, 168569 unique values
- SUBMISSION_DATE: 615707 non-null values, object, 166913 unique values
- CANDIDATE_ID: 615690 non-null values, float64, 129677 unique values
- SUBMISSION_SOURCE: 615707 non-null values, object, 132 unique values
- CANDIDATE_HISTORICAL_STATUS: 615707 non-null values, object, 17 unique values
- HISTORICAL_STATUS_START_DATE: 615707 non-null values, datetime64[ns], 281276 unique values
- HISTORICAL_STATUS_END_DATE: 447477 non-null values, datetime64[ns], 115255 unique values
- LAST_MODIFIED_DATE: 615707 non-null values, datetime64[ns], 769 unique values

Numeric column statistics:


Unnamed: 0,REQUISITION_ID,PIPELINE_ID,CANDIDATE_ID,HISTORICAL_STATUS_START_DATE,HISTORICAL_STATUS_END_DATE,LAST_MODIFIED_DATE
count,615707.0,615707.0,615690.0,615707,447477,615707
mean,900381.2,13394840.0,10365720.0,2024-02-18 23:24:13.851199488,2024-02-23 09:36:13.571237632,2024-03-03 01:17:21.017887232
min,136083.0,5538745.0,1697052.0,2023-01-01 01:24:14,2023-01-02 20:11:44,2023-01-02 00:00:00
25%,562113.0,9503073.0,6641383.0,2023-08-04 09:14:12,2023-08-11 10:06:42,2023-08-17 00:00:00
50%,1032645.0,13674030.0,9647169.0,2024-03-18 08:56:12,2024-03-18 08:56:34,2024-03-29 00:00:00
75%,1234086.0,17793800.0,15290340.0,2024-08-24 11:27:46,2024-08-27 15:51:25,2024-09-17 00:00:00
max,1413645.0,19023160.0,18363690.0,2025-02-15 23:58:46,2025-02-15 23:33:42,2025-02-15 00:00:00
std,340876.0,4309340.0,4505447.0,,,




 CANDIDATE_STATUS DATASET 

Shape: (16, 2)

First 5 rows:


Unnamed: 0,CANDIDATE_HISTORICAL_STATUS,CANDIDATE_STAGE
0,New Submission,New Submission
1,Rejected,Rejected
2,Closed,Closed
3,In Review,In Review
4,Second Interview,Interview



Column information:
- CANDIDATE_HISTORICAL_STATUS: 16 non-null values, object, 16 unique values
- CANDIDATE_STAGE: 16 non-null values, object, 7 unique values


 DEPARTMENT DATASET 

Shape: (392, 4)

First 5 rows:


Unnamed: 0,DEPARTMENT_ID,DEPARTMENT_NAME,PARENT_DEPARTMENT_ID,PARENT_DEPARTMENT_NAME
0,40847,0717 - SD,40696,Area 16 - SD
1,40849,0720 - SD,40696,Area 16 - SD
2,40851,0724 - SD,40694,Area 14 - SD
3,40852,0725 - SD,40700,Area 20 - SD
4,40855,0729 - SD,40704,Area 24 - SD



Column information:
- DEPARTMENT_ID: 392 non-null values, int64, 392 unique values
- DEPARTMENT_NAME: 392 non-null values, object, 387 unique values
- PARENT_DEPARTMENT_ID: 392 non-null values, int64, 22 unique values
- PARENT_DEPARTMENT_NAME: 392 non-null values, object, 21 unique values

Numeric column statistics:


Unnamed: 0,DEPARTMENT_ID,PARENT_DEPARTMENT_ID
count,392.0,392.0
mean,45894.640306,40754.857143
std,14962.637341,184.298116
min,40690.0,40653.0
25%,41755.25,40694.0
50%,41979.5,40700.0
75%,42281.25,40705.0
max,110253.0,41423.0
