# Employee Attrition Rate Prediction using SageMaker XGBoost
This notebook demonstrates how to:
- Generate sample employee data
- Perform feature engineering
- Train an XGBoost model using SageMaker built-in algorithm
- Deploy the model and test predictions

## Step 1: Import Required Libraries
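### This cell contains TODO 1; please make sure to complete it.
This cell imports the necessary Python libraries, initializes the SageMaker session, and defines a helper that generates synthetic employee data.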

In [None]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Initialize SageMaker session
import sagemaker
from sagemaker import get_execution_role

role = get_execution_role()
session = sagemaker.Session()

# The lab bucket name ends with the AWS account ID. Look the ID up through STS so
# the f-string below does not fail with NameError on an undefined `account_id`.
account_id = boto3.client('sts').get_caller_identity()['Account']

# Alternative: use the session's default bucket instead of the lab bucket.
#bucket = session.default_bucket()
bucket = f"jam-sm-sagemaker-bucket-{account_id}"

# Key prefix under which this notebook stores its data and model artifacts
prefix = 'employee-attrition'

# For demonstration, we'll create synthetic data
def generate_synthetic_data(n_samples=1000, seed=42):
    """Build a synthetic employee dataset for the attrition demo.

    Every column is drawn independently at random, so the data carries no real
    relationship between the features and the target.

    Args:
        n_samples: Number of employee rows to generate.
        seed: Seed for the private random generator. The same seed always
            yields the same DataFrame; pass None for a non-reproducible draw.

    Returns:
        pandas DataFrame with the numeric columns Age (22-59), YearsAtCompany
        (0-19), Salary (30000-149999), JobSatisfaction / WorkLifeBalance /
        PerformanceRating (1-5 inclusive), the categorical columns Department
        and OverTime, and the target column Attrition ('Yes' with
        probability 0.16, otherwise 'No').
    """
    # A private generator keeps the output reproducible without reseeding
    # NumPy's global random state as a side effect of calling this function.
    rng = np.random.RandomState(seed)

    def rating_scores():
        # randint's upper bound is exclusive: 6 is required for a 1-5 scale.
        return rng.randint(1, 6, n_samples)

    data = {
        'Age': rng.randint(22, 60, n_samples),
        'YearsAtCompany': rng.randint(0, 20, n_samples),
        'Salary': rng.randint(30000, 150000, n_samples),
        'JobSatisfaction': rating_scores(),
        'Department': rng.choice(['HR', 'Sales', 'Engineering', 'Finance'], n_samples),
        'OverTime': rng.choice(['Yes', 'No'], n_samples),
        'WorkLifeBalance': rating_scores(),
        'PerformanceRating': rating_scores(),
        'Attrition': rng.choice(['Yes', 'No'], n_samples, p=[0.16, 0.84]),  # 16% attrition rate
    }
    return pd.DataFrame(data)

# Generate and display sample data

# TODO 1
# Hint: build the DataFrame with the generate_synthetic_data() helper defined above.
# The following cells read the result from the variable `df`.
df = # Your code here

### Step 1.1: Display top 10 records from the sample data

In [None]:
# Preview the first 10 rows of the generated data
df.head(10)

## Step 2: Data Analysis & Visualization

In [None]:
# Summary statistics of the numeric columns
print(df.describe())

# Class balance of the target column
attrition_counts = df['Attrition'].value_counts()
print(f"Attrition distribution:\n{attrition_counts}")
print(f"Attrition rate: {attrition_counts['Yes'] / len(df):.2%}")

# One figure with a 2x2 grid; each panel relates a single feature to attrition
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(ax_age, ax_satisfaction), (ax_department, ax_overtime) = axes

# Top-left: age distribution for leavers vs. stayers
sns.boxplot(x='Attrition', y='Age', data=df, ax=ax_age)
ax_age.set_title('Age vs Attrition')

# Top-right: employee counts per job-satisfaction level, split by attrition
sns.countplot(x='JobSatisfaction', hue='Attrition', data=df, ax=ax_satisfaction)
ax_satisfaction.set_title('Job Satisfaction vs Attrition')

# Bottom-left: share of 'Yes' within each department, sorted ascending
dept_attrition = pd.crosstab(df['Department'], df['Attrition'], normalize='index')
dept_attrition['Yes'].sort_values().plot(kind='bar', ax=ax_department)
ax_department.set_title('Attrition Rate by Department')
ax_department.set_ylabel('Attrition Rate')

# Bottom-right: share of 'Yes' within each overtime group
overtime_attrition = pd.crosstab(df['OverTime'], df['Attrition'], normalize='index')
overtime_attrition['Yes'].plot(kind='bar', ax=ax_overtime)
ax_overtime.set_title('Attrition Rate by Overtime Status')
ax_overtime.set_ylabel('Attrition Rate')

plt.tight_layout()
plt.show()


## Step 3: Feature Engineering
### This cell performs feature engineering and splits the data into training and test datasets.
### The feature-engineered data is then uploaded to the S3 bucket.
### This cell contains TODO 2; please make sure to complete it.

In [None]:
# Prepare features and target
# X keeps every column except the target; y is 1 where Attrition == 'Yes', else 0.
X = df.drop('Attrition', axis=1)
y = (df['Attrition'] == 'Yes').astype(int)  # Convert to binary

# Split data
# TODO 2
# Split the data into 80% Training and 20% Test.
# Replace the '''YOUR CODE HERE''' placeholder with the argument that sets the split size.
# random_state=42 fixes the shuffle; stratify=y is passed so the split is stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X, y, '''YOUR CODE HERE''', random_state=42, stratify=y)

# Define preprocessing for numerical and categorical features
numerical_features = ['Age', 'YearsAtCompany', 'Salary', 'JobSatisfaction', 'WorkLifeBalance', 'PerformanceRating']
categorical_features = ['Department', 'OverTime']

# Numeric columns are standardized; categorical columns are one-hot encoded,
# with the first category of each feature dropped (drop='first').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

# Preprocess the data
# The transformer is fitted on the training split only and then applied unchanged
# to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Convert to numpy arrays
# The transformer output is densified when it exposes `toarray` (i.e. is sparse).
train_features = X_train_processed.toarray() if hasattr(X_train_processed, 'toarray') else X_train_processed
test_features = X_test_processed.toarray() if hasattr(X_test_processed, 'toarray') else X_test_processed

print(f"Training data shape: {train_features.shape}")
print(f"Testing data shape: {test_features.shape}")

# Save the preprocessor for later use with new data
# predict_with_sagemaker_sdk reloads this file to transform the records it scores.
import joblib
joblib.dump(preprocessor, 'preprocessor.joblib')

# Format data for SageMaker XGBoost (CSV format with label in first column)
def format_for_xgboost(features, labels):
    """Arrange features and labels in the column layout SageMaker XGBoost reads.

    The result is still an in-memory array; writing it out as CSV text is a
    separate step.

    Args:
        features: 2-D array of shape (n_rows, n_features).
        labels: Array-like of n_rows target values (numpy array, list, or
            pandas Series), or None when preparing inference data.

    Returns:
        When labels are given, a 2-D numpy array with the label in column 0
        followed by the feature columns. When labels is None, ``features``
        is returned unchanged.
    """
    if labels is None:
        # For prediction: features only
        return features

    # np.asarray lets lists and pandas Series through as well; these do not
    # offer ndarray.reshape with this signature.
    label_column = np.asarray(labels).reshape(-1, 1)

    # For training/validation: label in first column
    return np.hstack((label_column, features))

# Build the label-first matrices for the training and test splits
train_data = format_for_xgboost(features=train_features, labels=y_train.values)
test_data = format_for_xgboost(features=test_features, labels=y_test.values)

# Upload to S3
def upload_to_s3(data, bucket, prefix, filename):
    """Write ``data`` as comma-separated text to ``prefix/filename`` in ``bucket``.

    Args:
        data: Array accepted by ``np.savetxt``.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix inside the bucket.
        filename: Object name appended to the prefix.

    Returns:
        The ``s3://`` URI of the uploaded object.
    """
    object_key = f"{prefix}/{filename}"

    # np.savetxt accepts a file-like object, so the text is assembled in memory
    text_buffer = io.StringIO()
    np.savetxt(text_buffer, data, delimiter=',')

    destination = boto3.resource('s3').Object(bucket, object_key)
    destination.put(Body=text_buffer.getvalue())

    return f"s3://{bucket}/{object_key}"

# Upload data to S3
# Both splits are stored under the same prefix; the returned s3:// URIs are
# what the training cell passes to the data channels.
train_s3_path = upload_to_s3(train_data, bucket, prefix, 'train.csv')
test_s3_path = upload_to_s3(test_data, bucket, prefix, 'test.csv')

print(f"Training data uploaded to: {train_s3_path}")
print(f"Test data uploaded to: {test_s3_path}")


## Step 4: Model Training
### This cell creates the Estimator object. We use the XGBoost algorithm for this task. XGBoost (Extreme Gradient Boosting) is a gradient-boosting algorithm used for classification and regression problems.
### This cell also sets the hyperparameters and defines the training and validation data channels.

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator

# Look up the built-in XGBoost 1.5-1 container image for the session's region
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=session.boto_region_name,
    version='1.5-1',
)

# Estimator: a single ml.m5.xlarge training instance; artifacts go under <prefix>/output
xgb = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=session,
)

# Hyperparameters for a binary classifier that outputs probabilities
hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'num_round': 100,
    'verbosity': 1,
}
xgb.set_hyperparameters(**hyperparameters)

# Both channels read CSV files with the same settings.
# Note that the test split doubles as the validation channel.
csv_channel_options = {'content_type': 'text/csv', 'distribution': 'FullyReplicated'}
train_input = sagemaker.inputs.TrainingInput(s3_data=train_s3_path, **csv_channel_options)
validation_input = sagemaker.inputs.TrainingInput(s3_data=test_s3_path, **csv_channel_options)

data_channels = dict(train=train_input, validation=validation_input)

## Step 4.1: Model Training
### Start the Model Training Job
### This Cell contains TODO 3, please ensure to complete it.

In [None]:
# Train the model
# TODO 3 - Start the training job
# YOUR CODE HERE


## Step 5: Deploy Model
### After the model is trained, it is deployed so that it can make predictions. We deploy the model as a real-time endpoint.

In [None]:
# Deploy the model to an endpoint
# One ml.m5.large instance hosts the endpoint. The CSV serializer is attached so
# that payloads passed to xgb_predictor.predict() are sent as CSV.
# NOTE: the endpoint stays up until it is deleted (see the clean-up step).
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    serializer=sagemaker.serializers.CSVSerializer()
)

## Step 5.1: Initialize a variable with the endpoint name so it can be used for inference.

In [None]:
# Get the endpoint name
# The name is read from the predictor returned by deploy() and printed for reference.
endpoint_name = xgb_predictor.endpoint_name
print(f"Endpoint Name: {endpoint_name}")

## Step 5.2: Create a method to make predictions on the deployed endpoint. We are using SageMaker Python SDK for creating this method.

In [None]:
# Method 1: Using the SageMaker Python SDK
def _parse_xgboost_response(response):
    """Turn the endpoint's raw text response into a 1-D float array."""
    response_str = response.decode('utf-8') if isinstance(response, bytes) else response

    # Accept one value per line as well as comma-separated values on a single
    # line. NOTE(review): which layout is returned appears to depend on the
    # XGBoost container version - confirm against the deployed image.
    tokens = response_str.replace(',', '\n').split()
    return np.array([float(token) for token in tokens])


def predict_with_sagemaker_sdk(data, predictor, preprocessor=None):
    """Score employee records on the deployed endpoint via the SageMaker SDK.

    Args:
        data: pandas DataFrame holding the raw (unprocessed) feature columns.
        predictor: Object whose ``predict(payload)`` sends a CSV string to the
            endpoint and returns the raw response as ``bytes`` or ``str``.
        preprocessor: Optional fitted transformer exposing ``transform``. When
            omitted, the transformer stored in 'preprocessor.joblib' is loaded.

    Returns:
        1-D numpy array with one float per value returned by the endpoint.

    Raises:
        ValueError: If ``data`` is not a pandas DataFrame.
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame")

    if preprocessor is None:
        # Imported here so the function does not rely on another cell's import.
        import joblib
        preprocessor = joblib.load('preprocessor.joblib')

    # Apply the transformer; densify the result if it comes back sparse.
    processed_data = preprocessor.transform(data)
    if hasattr(processed_data, 'toarray'):
        processed_data = processed_data.toarray()

    # The built-in XGBoost algorithm expects CSV rows without a header.
    csv_data = io.StringIO()
    np.savetxt(csv_data, processed_data, delimiter=',')

    response = predictor.predict(csv_data.getvalue())
    return _parse_xgboost_response(response)



## Step 5.3: Next we create some sample data for making predictions.
### Note the sample data does not include the Label/Target Column.

In [None]:
# Five hypothetical employees to score, one tuple per employee.
# The target column ('Attrition') is intentionally absent.
employee_rows = [
    # Age, YearsAtCompany, Salary, JobSatisfaction, Department, OverTime, WorkLifeBalance, PerformanceRating
    (35, 3, 85000, 3, 'Sales', 'Yes', 2, 3),
    (42, 15, 120000, 4, 'Engineering', 'No', 4, 5),
    (29, 1, 65000, 2, 'HR', 'Yes', 2, 3),
    (55, 20, 130000, 5, 'Finance', 'No', 3, 4),
    (38, 7, 95000, 3, 'Engineering', 'No', 3, 4),
]
sample_employees = pd.DataFrame(
    employee_rows,
    columns=[
        'Age', 'YearsAtCompany', 'Salary', 'JobSatisfaction',
        'Department', 'OverTime', 'WorkLifeBalance', 'PerformanceRating',
    ],
)


## Step 5.4: Now we make the actual prediction
### This cell makes predictions by calling the predict_with_sagemaker_sdk function created above.
### This cell contains TODO 4; please make sure to complete it. Write code that flags the employees whose predicted risk of quitting is 20% or higher.

In [None]:
# Make predictions using SageMaker SDK
# sdk_predictions is a numpy array of floats, one per row of sample_employees.
sdk_predictions = predict_with_sagemaker_sdk(sample_employees, xgb_predictor)

# TODO 4 - Predict Employees who are at 20% or above at the risk of quitting
# The results cell labels an entry 'High' when it equals 1 and 'Low' otherwise,
# so sdk_attrition_risk should hold one such flag per employee.
sdk_attrition_risk = # YOUR CODE HERE

## Step 5.5: Display the results from making prediction

In [None]:
# Translate the 0/1 risk flags into readable labels
risk_labels = ['High' if flag == 1 else 'Low' for flag in sdk_attrition_risk]

# Start from the descriptive employee columns, then attach the numbering and model output
results = sample_employees[['Age', 'Department', 'YearsAtCompany', 'OverTime']].copy()
results.insert(0, 'Employee', range(1, len(sample_employees) + 1))
results['Attrition_Probability'] = sdk_predictions
results['Attrition_Risk'] = risk_labels

print("\nEmployee Attrition Prediction Results:")
print(results)

## Step 6: Optional (Clean Up)

In [None]:
# Clean up resources when done
# Only the endpoint is deleted here; objects uploaded to S3 are not touched by this cell.
print("\nCleaning up resources...")
xgb_predictor.delete_endpoint()
print("Endpoint deleted successfully")
