# In [None]:
'''Devin Bailey
January 29, 2024
Employee Turnover Analytics Project
CB AIML Core: Machine Learning'''

'''Develop a model to predict employee retention using Python.'''


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, precision_score, recall_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
import warnings

# Silence FutureWarning messages emitted by the libraries used below
warnings.filterwarnings("ignore", category=FutureWarning)

# Read the HR dataset from the Excel workbook into a DataFrame
data = pd.read_excel('1688640705_hr_comma_sep.xlsx')

# 1. Data Quality Checks
# Print the number of missing values per column, followed by the summary
# statistics table, which is where outliers and odd ranges would show up.
missing_per_column = data.isnull().sum()
print(missing_per_column)
summary_stats = data.describe()
print(summary_stats)

# 2. Exploratory Data Analysis (EDA)
# Heatmap of the pairwise correlations between the numeric columns
numeric_data = data.select_dtypes(include=[np.number])
correlations = numeric_data.corr()
sns.heatmap(correlations, annot=True)
plt.show()

# One histogram with a KDE overlay per feature, each shown as its own figure.
# NOTE(review): 'average_montly_hours' is presumably spelled this way in the
# dataset's own header - confirm before "correcting" it.
distribution_columns = ('satisfaction_level', 'last_evaluation', 'average_montly_hours')
for column in distribution_columns:
    sns.histplot(data[column], kde=True)
    plt.show()

# Bar Plot of Employee Project Count
# Pass a string-typed view of 'left' as the hue so the plot treats it as a
# category, WITHOUT overwriting data['left']. Converting the column in place
# would make the `== 1` test in the clustering step always False (a string
# never equals the integer 1), silently skipping the clustering.
sns.countplot(x='number_project', hue=data['left'].astype(str), data=data)
plt.show()

# 3. Clustering of Employees Who Left
# Keep only the two clustering features for employees flagged as having left.
# astype(int) makes the mask work whether 'left' is stored as ints or as
# numeric strings; .copy() yields an independent frame so adding the
# 'cluster' column below does not write into a slice of `data`.
left_mask = data['left'].astype(int) == 1
left_emp = data.loc[left_mask, ['satisfaction_level', 'last_evaluation']].copy()

if left_emp.empty:
    print("No employees left in the dataset.")
else:
    # Seeded (and with an explicit n_init) so cluster assignments are
    # reproducible from run to run.
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(left_emp)
    left_emp['cluster'] = kmeans.labels_
    sns.scatterplot(x='satisfaction_level', y='last_evaluation', hue='cluster', data=left_emp)
    plt.show()


# 4. Handle Class Imbalance with SMOTE
# Features: every column except the target, with non-numeric columns expanded
# into indicator columns. Target: 'left' cast to int (it may have been turned
# into strings for plotting earlier in the script).
X = pd.get_dummies(data.drop('left', axis=1))
y = data['left'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Oversample the training split only, so the held-out test set is left
# untouched. Seeded like the split above so the synthetic samples (and every
# score computed from them) are reproducible.
smote = SMOTE(random_state=0)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# 5. Model Training with Cross-Validation
# The same shuffled 5-fold splitter is shared by all three candidates so their
# scores are computed on identical folds.
# NOTE(review): the folds are drawn from data that was already SMOTE-resampled,
# so synthetic points interpolated from a validation sample can sit in the
# training folds. Treat these scores as optimistic; the held-out test set is
# the unbiased check.
kf = KFold(n_splits=5, random_state=0, shuffle=True)

# Logistic Regression (scaled inside the pipeline so scaling is refit per fold)
log_reg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
log_reg_scores = cross_val_score(log_reg, X_train_sm, y_train_sm, cv=kf)
print('LogisticRegression scores',log_reg_scores)

# Random Forest (seeded so the scores and the later final fit are reproducible)
rf = RandomForestClassifier(random_state=0)
rf_scores = cross_val_score(rf, X_train_sm, y_train_sm, cv=kf)
print('RandomForestClassifier scores', rf_scores)

# Gradient Boosting (seeded for the same reason)
gb = GradientBoostingClassifier(random_state=0)
gb_scores = cross_val_score(gb, X_train_sm, y_train_sm, cv=kf)
print('GradientBoostingClassifier scores', gb_scores)

# Choose the best model based on cross-validation scores
def evaluate_model(model, X, y, cv):
    """Print cross-validated metrics for ``model`` and plot its ROC curve.

    Out-of-fold class probabilities are obtained with a single
    ``cross_val_predict`` pass; the hard class predictions are derived from
    those same probabilities (most probable class), so every reported metric
    comes from the same fitted models and each fold is fitted only once.

    Prints the ROC AUC, confusion matrix, precision and recall, and adds the
    ROC curve to the current matplotlib axes. The caller is responsible for
    the legend, axis labels and ``plt.show()``.

    Args:
        model: Classifier (or Pipeline ending in one) that supports
            ``predict_proba``.
        X: Feature matrix.
        y: Binary target labels; column 1 of the predicted probabilities is
            used as the positive-class score.
        cv: Cross-validation splitter or fold count passed to
            ``cross_val_predict``.

    Returns:
        None.

    Note:
        Deriving labels from probabilities matches ``predict`` for estimators
        whose ``predict`` is the arg-max of ``predict_proba`` (true for the
        logistic regression, random forest and gradient boosting models used
        in this script).
    """
    # Label output with the last pipeline step's name, or the estimator's class name
    if isinstance(model, Pipeline):
        model_name = model.steps[-1][0]
    else:
        model_name = model.__class__.__name__

    # Out-of-fold probabilities; columns follow the sorted class labels
    y_probas = cross_val_predict(model, X, y, cv=cv, method="predict_proba")
    positive_scores = y_probas[:, 1]

    # Hard predictions: the label whose probability column is largest
    class_labels = np.unique(y)
    y_preds = class_labels[np.argmax(y_probas, axis=1)]

    # ROC/AUC
    roc_auc = roc_auc_score(y, positive_scores)
    print(f'{model_name} - ROC/AUC Score: {roc_auc}')

    # ROC Curve (thresholds are not needed for the plot)
    fpr, tpr, _ = roc_curve(y, positive_scores)
    plt.plot(fpr, tpr, label=f'{model_name} (area = {roc_auc:.2f})')

    # Confusion Matrix
    cm = confusion_matrix(y, y_preds)
    print(f'{model_name} - Confusion Matrix:\n{cm}')

    # Precision and Recall
    precision = precision_score(y, y_preds)
    recall = recall_score(y, y_preds)
    print(f'{model_name} - Precision: {precision}, Recall: {recall}')

# Evaluate the candidates in order (Logistic Regression, Random Forest,
# Gradient Boosting); each call prints its metrics and adds one ROC curve
# to the shared figure.
for candidate in (log_reg, rf, gb):
    evaluate_model(candidate, X_train_sm, y_train_sm, kf)

# Plotting: decorate the shared figure and display it
plt.title('ROC Curves')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# 6. Model Evaluation
# Random Forest model has the best cross-validation scores, so it is refit on
# the full resampled training set and scored on the untouched test split.
rf.fit(X_train_sm, y_train_sm)

# Hard predictions and positive-class probabilities for the test set
y_pred_rf = rf.predict(X_test)
y_pred_proba_rf = rf.predict_proba(X_test)[:, 1]

# Evaluation metrics
print(classification_report(y_test, y_pred_rf))
roc_auc_rf = roc_auc_score(y_test, y_pred_proba_rf)
print('ROC AUC score:', roc_auc_rf)

# ROC curve against the chance diagonal. The legend call is required for the
# curve's label (which carries the AUC) to actually appear on the figure.
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)
plt.plot(fpr_rf, tpr_rf, label=f'ROC curve (area = {roc_auc_rf:0.2f})')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()


# 7. Retention Strategy Suggestions
# Generate probabilities of leaving using the Random Forest model
proba_rf = rf.predict_proba(X_test)[:, 1]

# Categorize probabilities into different risk zones.
# pd.cut intervals are right-closed, so without include_lowest=True a
# probability of exactly 0.0 would fall outside the first bin (0, 0.2] and
# come back as NaN, dropping those employees from the counts.
proba_bins_rf = pd.cut(
    proba_rf,
    bins=[0, 0.2, 0.6, 0.9, 1],
    labels=['Safe', 'Low Risk', 'Medium Risk', 'High Risk'],
    include_lowest=True,
)
print(proba_bins_rf.value_counts())

# Employees in the 'High Risk' category might need immediate attention or intervention



## Employees in the 'High Risk' category might need immediate attention or intervention.
## Employees in the 'Safe' category might deserve bonuses.
## Employees in the 'Low Risk' and 'Medium Risk' categories might need incentives to increase performance.