# SageMaker Demo: Employee Attrition Prediction Using Feature Store and XGBoost

This notebook demonstrates how to use Amazon SageMaker's Feature Store and XGBoost built-in algorithm to predict employee attrition.

In [1]:
# Step 1: Setup
import sagemaker
from sagemaker import get_execution_role
import boto3
import pandas as pd

sagemaker_session = sagemaker.Session()
role = get_execution_role()
region = boto3.Session().region_name

# S3 bucket for storing data
bucket = 'sagemaker-ml-28573'


# Load the dataset
file_path = 'Employee.csv'  # Replace with your actual file path in S3 if needed
employee_df = pd.read_csv(file_path)
employee_df.head()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [3]:
# Step 2: Data Preparation
# Convert categorical columns to numeric
employee_df['Education'] = employee_df['Education'].astype('category').cat.codes
employee_df['City'] = employee_df['City'].astype('category').cat.codes
employee_df['Gender'] = employee_df['Gender'].astype('category').cat.codes
employee_df['EverBenched'] = employee_df['EverBenched'].map({'Yes': 1, 'No': 0})

# Drop rows with NaN values in the target column
employee_df.dropna(subset=['LeaveOrNot'])

# Convert target column to numeric if needed
employee_df['LeaveOrNot'] = employee_df['LeaveOrNot'].astype(int)

# Ensure no missing values in feature columns
employee_df = employee_df.dropna()

# Verify all columns are numeric
print(employee_df.dtypes)

# Define features and target
feature_columns = [
    'Education', 'JoiningYear', 'City', 'PaymentTier', 'Age',
    'Gender', 'EverBenched', 'ExperienceInCurrentDomain'
]
target_column = 'LeaveOrNot'

employee_df = employee_df[[target_column] + feature_columns]



# Display the transformed dataset
employee_df.head()

Unnamed: 0,LeaveOrNot,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain
0,0,0,2017,0,3,34,1,0,0
1,1,0,2013,2,1,28,0,0,3
2,0,0,2014,1,3,38,0,0,2
3,1,1,2016,0,3,27,1,0,5
4,1,1,2017,2,3,24,1,1,2


In [None]:
from sagemaker.feature_store.feature_group import FeatureGroup
from time import gmtime, strftime

# Create a Feature Group
feature_group_name = 'employee-feature-group-' + strftime('%Y%m%d%H%M%S', gmtime())
feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sagemaker_session)

# Define the schema
record_identifier_name = 'EmployeeID'  # Unique identifier for records
event_time_feature_name = 'EventTime'  # Column representing the time of event

# Ensure EventTime is in the correct ISO-8601 format
employee_df[event_time_feature_name] = pd.to_datetime('now').strftime('%Y-%m-%dT%H:%M:%S.%fZ')
employee_df[record_identifier_name] = employee_df.index

# Load features to the Feature Store
feature_group.load_feature_definitions(data_frame=employee_df)

# Enable the Online Store when creating the Feature Group
feature_group.create(
    s3_uri='s3://sagemaker-ml-28573/features',
    record_identifier_name=record_identifier_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True  # Enable the Online Store
)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:448049810900:feature-group/employee-feature-group-20240829131409',
 'ResponseMetadata': {'RequestId': 'eab083de-aeee-4f25-adf5-9b26e0f5d24d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'eab083de-aeee-4f25-adf5-9b26e0f5d24d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '114',
   'date': 'Thu, 29 Aug 2024 13:14:09 GMT'},
  'RetryAttempts': 0}}

In [10]:
# Check the status of the Feature Group
status = feature_group.describe().get("FeatureGroupStatus")
print(f"Feature Group Status: {status}")

if status == "Created":
    print("Feature Group is Created and ready for use. Proceeding with ingestion...")
    
    # Ingest data into the Feature Store
    feature_group.ingest(data_frame=employee_df, max_workers=3, wait=True)
    print('Data ingested into Feature Store.')

Feature Group Status: Created
Feature Group is Created and ready for use. Proceeding with ingestion...
Data ingested into Feature Store.


In [12]:
from sagemaker.feature_store.feature_group import FeatureGroup
from sklearn.model_selection import train_test_split 

# Initialize the SageMaker Feature Store runtime client
featurestore_runtime = boto3.client('sagemaker-featurestore-runtime')

# Define the feature group name and features you want to retrieve
feature_names = ['Education', 'JoiningYear', 'City', 'PaymentTier', 'Age', 'Gender', 'EverBenched', 'ExperienceInCurrentDomain', 'LeaveOrNot']

# Retrieve records and convert to DataFrame
records = []
for record_id in employee_df.index.astype(str):
    response = featurestore_runtime.get_record(
        FeatureGroupName=feature_group_name,
        RecordIdentifierValueAsString=str(record_id),
        FeatureNames=feature_names
    )
    # Check if 'Record' is in the response and add to records list
    if 'Record' in response:
        record = {feature['FeatureName']: feature['ValueAsString'] for feature in response['Record']}
        records.append(record)
    else:
        print(f"Record with ID {record_id} not found.")

# Convert the list of records to a DataFrame
retrieved_df = pd.DataFrame(records)

# Check if we have any retrieved records
if not retrieved_df.empty:
    # Split the data into training and test sets
    train_df, test_df = train_test_split(retrieved_df, test_size=0.2, random_state=42)
    print("Training and test data split after retrieval from Feature Store.")
else:
    print("No records retrieved. Please check the feature group and identifiers.")

Training and test data split after retrieval from Feature Store.


## Train the Model Using Local Data with S3 Mode (Default)

In [13]:
# Initialize S3 client
s3 = boto3.client('s3')

# Define your S3 bucket and prefix
bucket = 'sagemaker-ml-28573'
prefix = 'input-data'

# Save the data locally first
train_file = 'train.csv'
validation_file = 'validation.csv'
train_df.to_csv(train_file, index=False)
test_df.to_csv(validation_file, index=False)

# Upload the data to S3
s3.upload_file(train_file, bucket, f'{prefix}/train/{train_file}')
s3.upload_file(validation_file, bucket, f'{prefix}/validation/{validation_file}')

print(f"Training data uploaded to s3://{bucket}/{prefix}/train/{train_file}")
print(f"Validation data uploaded to s3://{bucket}/{prefix}/validation/{validation_file}")


Training data uploaded to s3://sagemaker-ml-28573/input-data/train/train.csv
Validation data uploaded to s3://sagemaker-ml-28573/input-data/validation/validation.csv


In [14]:
import sagemaker
import boto3
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput

# Initialize hyperparameters
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"reg:squarederror",
        "num_round":"50"}

# Set an output path where the trained model will be saved
bucket = 'sagemaker-ml-28573'
prefix = 'demo-built-in-algorithm'
output_path = f's3://{bucket}/{prefix}/output'

# Retrieve the XGBoost image URI
region = boto3.Session().region_name  # Automatically get the region
xgboost_container = image_uris.retrieve("xgboost", region, "1.7-1")

# Construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.m5.xlarge', 
                                          volume_size=5,  # 5 GB 
                                          output_path=output_path)

# Define the data type and paths to the training and validation datasets
content_type = "csv"
train_input = TrainingInput(f"s3://{bucket}/input-data/train/", content_type=content_type)
validation_input = TrainingInput(f"s3://{bucket}/input-data/validation/", content_type=content_type)

# Execute the XGBoost training job
estimator.fit({'train': train_input, 'validation': validation_input})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-08-29-13-30-18-037


2024-08-29 13:30:20 Starting - Starting the training job...
2024-08-29 13:30:35 Starting - Preparing the instances for training...
2024-08-29 13:31:02 Downloading - Downloading input data...
2024-08-29 13:31:27 Downloading - Downloading the training image...
2024-08-29 13:32:18 Training - Training image download completed. Training in progress...[34m[2024-08-29 13:32:29.133 ip-10-0-204-133.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-08-29 13:32:29.156 ip-10-0-204-133.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-08-29:13:32:29:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-08-29:13:32:29:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2024-08-29:13:32:29:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-08-29:13:32:29:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34