In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
IT_DATA = pd.read_csv('Employee_Profile_IT.csv')

In [5]:
DT_FMT = '%m/%d/%Y' 
IT_DATA.columns = IT_DATA.columns.str.replace(r'[^\w\s]', '', regex=True)
IT_DATA.rename(columns={'Bonus ': 'Cur_Bonus'}, inplace=True) 
IT_DATA.rename(columns={'Exit Date': 'Init_Exit'}, inplace=True)
IT_DATA['Annual Salary'] = IT_DATA['Annual Salary'].replace({r'[$,]': ''}, regex=True).astype(float)
IT_DATA['Cur_Bonus'] = IT_DATA['Cur_Bonus'].astype(float) / 100 
IT_DATA.drop('Init_Exit', axis=1, inplace=True)

In [6]:
IT_DATA = IT_DATA[~IT_DATA['EEID'].str.contains('E100', na=False)].reset_index(drop=True)
num_feats = ['Age', 'Annual Salary', 'Cur_Bonus', 'EmploymentRating', 'DaysOfAbsence', 'CertificationsEarned']
for feat in num_feats:
    if IT_DATA[feat].isnull().any(): IT_DATA[feat].fillna(IT_DATA[feat].median(), inplace=True)
for col in IT_DATA.columns:
    if IT_DATA[col].dtype == 'object' and IT_DATA[col].isnull().any(): IT_DATA[col].fillna(IT_DATA[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  if IT_DATA[feat].isnull().any(): IT_DATA[feat].fillna(IT_DATA[feat].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  if IT_DATA[col].dtype == 'object' and IT_DATA[col].isnull().any(): IT_DATA[col].fillna(IT_DATA[col].mode()[0], inplace=True)


In [7]:
EMP_BONUS_DATA = pd.read_csv('Performance_Bonus.csv')

In [8]:
EMP_BONUS_DATA.rename(columns={'Bouns': 'Bonus_Amt'}, inplace=True) #bonus amount
EMP_BONUS_DATA['Bonus_Amt'] = EMP_BONUS_DATA['Bonus_Amt'].str.replace('%','').astype(float) / 100

In [9]:
level_map = {"Bachelor's": 0, "Master's": 1, "Doctorate": 2}
EMP_BONUS_DATA['EL_EN'] = EMP_BONUS_DATA['EducationLevel'].map(level_map)

In [10]:
Q1, Q3 = np.percentile(EMP_BONUS_DATA['Bonus_Amt'], [25, 75])
IQR = Q3 - Q1
Upper_Bound = Q3 + 1.5 * IQR
EMP_BONUS_DATA = EMP_BONUS_DATA[(EMP_BONUS_DATA['Bonus_Amt'] <= Upper_Bound)]

In [11]:
X_COLS = ['EmploymentRating', 'DaysOfAbsence', 'CertificationsEarned', 'EL_EN']
Y_TARG = 'Bonus_Amt'

In [12]:
X_train_bonus = EMP_BONUS_DATA[X_COLS]
Y_train_bonus = EMP_BONUS_DATA[Y_TARG]

In [13]:
X_T, X_V, Y_T, Y_V = train_test_split(X_train_bonus, Y_train_bonus, test_size=0.25, random_state=35)

In [14]:
model_bonus = LinearRegression()
model_bonus.fit(X_T, Y_T)
Y_pred_V = model_bonus.predict(X_V)

In [15]:
mae_bonus = mean_absolute_error(y_true=Y_V, y_pred=Y_pred_V)
mse_bonus = mean_squared_error(y_true=Y_V, y_pred=Y_pred_V)
rmse_bonus = np.sqrt(mse_bonus)

In [16]:
print(f"MAE: {mae_bonus * 100:.2f}%")
print(f"RMSE: {rmse_bonus * 100:.2f}%")
print(f"Training Mean Bonus: {(EMP_BONUS_DATA['Bonus_Amt']).mean() * 100:.2f}%")

MAE: 10.30%
RMSE: 11.74%
Training Mean Bonus: 20.19%


In [17]:
IT_DATA['EL_EN'] = IT_DATA['EducationLevel'].replace({'PhD': 'Doctorate'}).map(level_map)

In [18]:
if IT_DATA['EL_EN'].isnull().any():
    # Fill NaN with the mode of the encoded values
    impute_val = IT_DATA['EL_EN'].mode()[0]
    IT_DATA['EL_EN'].fillna(impute_val, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  IT_DATA['EL_EN'].fillna(impute_val, inplace=True)


In [19]:
IT_DATA['Predicted Bonus'] = model_bonus.predict(IT_DATA[X_COLS])

In [20]:
IT_DATA.drop(['EL_EN'], axis=1, inplace=True)

In [21]:
print("\n--- Predicted Bonus for IT Employees ---")
print(IT_DATA[['Full Name', 'Cur_Bonus', 'Predicted Bonus']].head().to_string())


--- Predicted Bonus for IT Employees ---
          Full Name  Cur_Bonus  Predicted Bonus
0  Lillian Gonzales     0.0000         0.199100
1  Scarlett Jenkins     0.0032         0.204686
2  Brooklyn Salazar     0.0000         0.206017
3       Riley Rojas     0.0000         0.207836
4    Isabella Scott     0.0000         0.197077


In [22]:
count_under_rewarded = 0
count_zero_bonus = 0

In [23]:
for actual_bonus, predicted_bonus in zip(IT_DATA['Cur_Bonus'], IT_DATA['Predicted Bonus']):
    if actual_bonus == 0.0:
        count_zero_bonus += 1
    
    if predicted_bonus > actual_bonus:
        count_under_rewarded += 1

print(f"Employees who received less actual bonus than predicted: {count_under_rewarded} ({count_under_rewarded / len(IT_DATA) * 100:.2f}%)")
print(f"Employees with zero actual bonus: {count_zero_bonus} ({count_zero_bonus / len(IT_DATA) * 100:.2f}%)")

Employees who received less actual bonus than predicted: 215 (100.00%)
Employees with zero actual bonus: 147 (68.37%)


In [24]:
output_filename = 'DataPredictedBonus.csv'
IT_DATA.to_csv(output_filename, index=False)

In [34]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# 1. Define the model using a new variable
model_ridge_new = Ridge(random_state=35)

# 2. Define the hyperparameter grid to search (alpha is the regularization strength)
param_grid_ridge = {
    'alpha': np.logspace(-4, 0, 20), # Search for alpha between 0.0001 and 1.0
    'fit_intercept': [True, False]
}

# 3. Setup GridSearchCV using a new variable
# Uses the full training data (X_train_bonus, Y_train_bonus) defined in cell 10
grid_search_ridge = GridSearchCV(
    estimator=model_ridge_new, 
    param_grid=param_grid_ridge, 
    scoring='neg_mean_squared_error', 
    cv=5, 
    verbose=0,
    n_jobs=-1
)

# 4. Fit the grid search model to the training data
grid_search_ridge.fit(X_train_bonus, Y_train_bonus)

# 5. Extract the best model and parameters using new variables
best_alpha_ridge = grid_search_ridge.best_params_['alpha']
best_intercept_ridge = grid_search_ridge.best_params_['fit_intercept']
best_model_ridge = grid_search_ridge.best_estimator_

print(f"--- Hyperparameter Tuning Results (Ridge Regression) ---")
print(f"Optimal Alpha (Regularization): {best_alpha_ridge:.4f}")
print(f"Optimal Intercept Use: {best_intercept_ridge}")
print(f"Best cross-validation score (Negative MSE): {grid_search_ridge.best_score_:.4f}")
print(f"Coefficients of Best Ridge Model:")
print(f"  {X_COLS[0]}: {best_model_ridge.coef_[0]:.4f}")
print(f"  {X_COLS[1]}: {best_model_ridge.coef_[1]:.4f}")
print(f"  {X_COLS[2]}: {best_model_ridge.coef_[2]:.4f}")
print(f"  {X_COLS[3]}: {best_model_ridge.coef_[3]:.4f}")
Y_pred_V_ridge = best_model_ridge.predict(X_V)
mae_ridge = mean_absolute_error(y_true=Y_V, y_pred=Y_pred_V_ridge)
mse_ridge = mean_squared_error(y_true=Y_V, y_pred=Y_pred_V_ridge)
rmse_ridge = np.sqrt(mse_ridge)
print(f"--- Ridge Regression Model Performance on Validation Data ---")
print(f"Mean Absolute Error (MAE): {mae_ridge:.4f} (Ratio) / {mae_ridge * 100:.2f}%")
print(f"Root Mean Squared Error (RMSE): {rmse_ridge:.4f} (Ratio) / {rmse_ridge * 100:.2f}%")

--- Hyperparameter Tuning Results (Ridge Regression) ---
Optimal Alpha (Regularization): 1.0000
Optimal Intercept Use: True
Best cross-validation score (Negative MSE): -0.0143
Coefficients of Best Ridge Model:
  EmploymentRating: 0.0012
  DaysOfAbsence: 0.0014
  CertificationsEarned: -0.0002
  EL_EN: 0.0027
--- Ridge Regression Model Performance on Validation Data ---
Mean Absolute Error (MAE): 0.1026 (Ratio) / 10.26%
Root Mean Squared Error (RMSE): 0.1165 (Ratio) / 11.65%


In [38]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
# Define the features to use (using the original categorical column 'EducationLevel')
X_COLS_XGB = ['EmploymentRating', 'DaysOfAbsence', 'CertificationsEarned', 'EducationLevel']
Y_TARG = 'Bonus_Amt' # Target is still the bonus ratio
# Use the full, cleaned data (EMP_BONUS_DATA) for the tuning process
X_train_xgb = EMP_BONUS_DATA[X_COLS_XGB]
Y_train_xgb = EMP_BONUS_DATA[Y_TARG]
# Split the data into train and validation sets for final evaluation
# Use new variables X_T_xgb and X_V_xgb to avoid overwriting original split
X_T_xgb, X_V_xgb, Y_T_xgb, Y_V_xgb = train_test_split(
    X_train_xgb, Y_train_xgb, test_size=0.25, random_state=35 # Using the same random state as original split)
# 1. Define the preprocessing pipeline for XGBoost
NUMERIC_FEATURES = ['EmploymentRating', 'DaysOfAbsence', 'CertificationsEarned']
CATEGORICAL_FEATURES = ['EducationLevel']
preproc_xgb = ColumnTransformer(
    transformers=[
        # Numerical features passed through without scaling
        ('num', 'passthrough', NUMERIC_FEATURES),
        # Categorical features encoded
        ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES)
    ],
    remainder='drop' # Drop any other columns not specified)
# 2. Define the full model pipeline (Preprocessing + Regressor)
model_xgb_pipeline = Pipeline(steps=[
    ('preproc', preproc_xgb),
    ('reg', XGBRegressor(
        objective='reg:squarederror',
        random_state=35,
        n_jobs=-1,
        # Set silent=True to avoid printing messages during tuning
        # 'verbose': 0 (or remove completely) is typically used for silencing XGBoost
    ))])
# 3. Define the parameter distribution for Randomized Search
param_dist_xgb = {
    'reg__n_estimators': [100, 300, 500, 700],
    'reg__max_depth': [3, 5, 7],
    'reg__learning_rate': [0.01, 0.05, 0.1],
    'reg__subsample': [0.7, 0.9, 1.0],
    'reg__colsample_bytree': [0.7, 0.9, 1.0]}
# 4. Setup RandomizedSearchCV using the entire training data for internal CV
# Use 30 iterations for a reasonable search time
random_search_xgb = RandomizedSearchCV(
    model_xgb_pipeline,
    param_distributions=param_dist_xgb,
    n_iter=30, # Test 30 random combinations
    scoring='neg_mean_squared_error',
    cv=5, # 5-fold cross-validation
    verbose=0, # Set to 0 to minimize output during search
    random_state=35,
    n_jobs=-1)
# Fit the grid search model (tuning and training the best model)
# Note: X_train_xgb and Y_train_xgb are the full training sets used for tuning.
random_search_xgb.fit(X_train_xgb, Y_train_xgb)
# 5. Extract the best model and parameters
best_xgb_model_tuned = random_search_xgb.best_estimator_
best_xgb_params = random_search_xgb.best_params_
# 6. Predict and evaluate performance on the held-out validation set (X_V_xgb)
Y_pred_V_xgb_tuned = best_xgb_model_tuned.predict(X_V_xgb)
# Ensure predictions are non-negative since bonus ratio cannot be negative
Y_pred_V_xgb_tuned[Y_pred_V_xgb_tuned < 0] = 0
# Calculate MAE and RMSE
mae_xgb_tuned = mean_absolute_error(Y_V_xgb, Y_pred_V_xgb_tuned)
rmse_xgb_tuned = np.sqrt(mean_squared_error(Y_V_xgb, Y_pred_V_xgb_tuned))
# 7. Print the results
print(f"--- Tuned XGBoost Regression Model ---")
print(f"Optimal XGBoost Parameters found: {best_xgb_params}")
print(f"Best cross-validation score (Negative MSE): {random_search_xgb.best_score_:.4f}")
print(f"\n--- Model Performance on Validation Data ---")
print(f"Mean Absolute Error (MAE): {mae_xgb_tuned:.4f} (Ratio) / {mae_xgb_tuned * 100:.2f}%")
print(f"Root Mean Squared Error (RMSE): {rmse_xgb_tuned:.4f} (Ratio) / {rmse_xgb_tuned * 100:.2f}%")

--- Tuned XGBoost Regression Model ---
Optimal XGBoost Parameters found: {'reg__subsample': 0.9, 'reg__n_estimators': 100, 'reg__max_depth': 3, 'reg__learning_rate': 0.01, 'reg__colsample_bytree': 0.9}
Best cross-validation score (Negative MSE): -0.0143

--- Model Performance on Validation Data ---
Mean Absolute Error (MAE): 0.1006 (Ratio) / 10.06%
Root Mean Squared Error (RMSE): 0.1147 (Ratio) / 11.47%


In [39]:
# 1. Define the features required by the tuned XGBoost model's preprocessing pipeline
# These are the original columns used for training the best_xgb_model_tuned:
X_COLS_XGB_FINAL = ['EmploymentRating', 'DaysOfAbsence', 'CertificationsEarned', 'EducationLevel']

# 2. Use the best tuned model (which includes the ColumnTransformer) to predict the bonus ratio
# The best_xgb_model_tuned handles the transformation of 'EducationLevel' internally.
IT_DATA['Predicted Bonus (XGBoost Tuned)'] = best_xgb_model_tuned.predict(IT_DATA[X_COLS_XGB_FINAL])

# 3. Ensure predictions are non-negative since bonus ratio cannot be negative
IT_DATA['Predicted Bonus (XGBoost Tuned)'].clip(lower=0, inplace=True)

# 4. Display the head of the updated DataFrame for verification
print("\n--- IT Data with Tuned XGBoost Predictions ---")
print(IT_DATA[['Full Name', 'Cur_Bonus', 'Predicted Bonus', 'Predicted Bonus (XGBoost Tuned)']].head().to_string())

# 5. Generate the final CSV file
output_filename_xgb = 'DataPredictedBonus_enhanced.csv'
IT_DATA.to_csv(output_filename_xgb, index=False)

print(f"\nFinal output saved to: {output_filename_xgb}")


--- IT Data with Tuned XGBoost Predictions ---
          Full Name  Cur_Bonus  Predicted Bonus  Predicted Bonus (XGBoost Tuned)
0  Lillian Gonzales     0.0000         0.199100                         0.201252
1  Scarlett Jenkins     0.0032         0.204686                         0.209264
2  Brooklyn Salazar     0.0000         0.206017                         0.202194
3       Riley Rojas     0.0000         0.207836                         0.209642
4    Isabella Scott     0.0000         0.197077                         0.205071

Final output saved to: DataPredictedBonus_enhanced.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  IT_DATA['Predicted Bonus (XGBoost Tuned)'].clip(lower=0, inplace=True)
