# Human Capital Analytics and Reporting Case Study

In [17]:
# Import packages 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm
from datetime import date

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [3]:
# Set display options
pd.set_option('display.max_rows', 150)

## Data Analysis

In [4]:
# Read in the data

# Create dataframe variables 
ra = pd.read_csv('Case_Study_HCAR_Research_2019_Recruiting_Activity_Data.csv')    # Recruiting Activity Dataframe
os = pd.read_csv('Case_Study_HCAR_Research_2019_Offer_Status_Data.csv')           # Offer Status Dataframe

In [5]:
# Check the dimensions of the datasets
# Output is in the form (rows, columns)
print(ra.shape)
print('')
print(os.shape)

(4972, 9)

(125, 3)


### Recruiting Activity Dataframe

In [6]:
# Print information on the Recruiting Activity dataset
ra.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4972 entries, 0 to 4971
Data columns (total 9 columns):
Candidate ID Number                  4959 non-null float64
Position Title                       4959 non-null object
Department                           4959 non-null object
Furthest Recruiting Stage Reached    4959 non-null object
Date of Application                  4959 non-null object
Application Source                   4959 non-null object
Highest Degree                       4959 non-null object
Years of Experience                  4959 non-null float64
Candidate Type                       4959 non-null object
dtypes: float64(2), object(7)
memory usage: 349.7+ KB


**There are null values in the dataset because the number of entries in each column doesn't match the total number of entries in the RangeIndex.**

In [7]:
# Check for null values
ra.isna().sum()

Candidate ID Number                  13
Position Title                       13
Department                           13
Furthest Recruiting Stage Reached    13
Date of Application                  13
Application Source                   13
Highest Degree                       13
Years of Experience                  13
Candidate Type                       13
dtype: int64

**There are 13 null values in each column.**

In [9]:
# Filter dataframe for all null values to examine them and determine what should be done with them
ra[ra.isna().any(axis=1)]

Unnamed: 0,Candidate ID Number,Position Title,Department,Furthest Recruiting Stage Reached,Date of Application,Application Source,Highest Degree,Years of Experience,Candidate Type
4959,,,,,,,,,
4960,,,,,,,,,
4961,,,,,,,,,
4962,,,,,,,,,
4963,,,,,,,,,
4964,,,,,,,,,
4965,,,,,,,,,
4966,,,,,,,,,
4967,,,,,,,,,
4968,,,,,,,,,


**Since the rows with null values dont contain any information they will be dropped from the dataframe.**

In [10]:
# Drop null values
ra.dropna(inplace=True)

In [11]:
# Check to see if all null values have been dropped
ra.isna().sum().sum()

0

In [16]:
# Analyze all columns except Candidate ID Number and Date of Application to see unique values in each column 
[print(ra[i].unique()) for i in ra.columns.drop(['Candidate ID Number', 'Date of Application'])]

['Associate Software Developer' 'Associate Relationship Manager'
 'Associate Product Manager' 'Finance Manager' 'Operations Coordinator'
 'Business Operations Manager' 'Sr. Business Analyst'
 'Sr. Product Manager' 'IT Analyst' 'Sr. Software Engineer'
 'Sr. Customer Service Operations Associate' 'UX Designer'
 'Financial Analyst' 'Account Executive' 'Operations Generalist']
['Engineering' 'Sales' 'Product' 'Finance' 'Operations' 'IT']
['Offer Sent' 'New Application' 'In-House Interview' 'Phone Screen']
['Campus Job Board' 'Campus Event' 'Advertisement' 'Career Fair' 'Website'
 'Agency' 'Outsourced' 'Internal Referral']
['PhD' 'Masters' 'Bachelors' 'JD' 'Phd']
[ 2.  0.  1.  3.  5.  7.  4. 15.  6. 13.  8. 10. 14. 12. 11.  9.]
['Campus' 'Experienced']


[None, None, None, None, None, None, None]

### Offer Status DataFrame

In [12]:
# Print information on the Offer Status dataset
os.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 3 columns):
Candidate ID Number    125 non-null int64
Offer Date             125 non-null object
Offer Decision         125 non-null object
dtypes: int64(1), object(2)
memory usage: 3.0+ KB


**There are no null values in this dataset.**

In [17]:
pd.merge(left=ra, right=os, how='outer')

Unnamed: 0,Candidate ID Number,Position Title,Department,Furthest Recruiting Stage Reached,Date of Application,Application Source,Highest Degree,Years of Experience,Candidate Type,Offer Date,Offer Decision
0,2468.0,Associate Software Developer,Engineering,Offer Sent,11/1/18,Campus Job Board,PhD,2.0,Campus,12/15/18,Offer Response Pending
1,2471.0,Associate Relationship Manager,Sales,Offer Sent,12/13/18,Campus Event,Masters,0.0,Campus,1/31/19,Offer Response Pending
2,2475.0,Associate Software Developer,Engineering,Offer Sent,12/21/18,Advertisement,Masters,0.0,Campus,2/5/19,Offer Response Pending
3,2480.0,Associate Relationship Manager,Sales,Offer Sent,12/25/18,Campus Event,Masters,0.0,Campus,2/9/19,Offer Response Pending
4,2486.0,Associate Relationship Manager,Sales,Offer Sent,12/6/18,Campus Event,Masters,1.0,Campus,1/17/19,Offer Response Pending
5,2493.0,Associate Software Developer,Engineering,Offer Sent,12/11/18,Campus Event,PhD,1.0,Campus,1/26/19,Offer Response Pending
6,2501.0,Associate Software Developer,Engineering,Offer Sent,12/13/18,Campus Job Board,Bachelors,0.0,Campus,1/26/19,Offer Response Pending
7,2510.0,Associate Relationship Manager,Sales,Offer Sent,12/13/18,Career Fair,Masters,0.0,Campus,1/28/19,Offer Response Pending
8,2520.0,Associate Software Developer,Engineering,Offer Sent,11/23/18,Career Fair,Bachelors,0.0,Campus,1/5/19,Offer Response Pending
9,2531.0,Associate Software Developer,Engineering,Offer Sent,1/2/19,Campus Event,PhD,1.0,Campus,2/15/19,Offer Response Pending
