In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the IT employee profile export; the relative path is resolved against
# the notebook's working directory.
IT_DATA = pd.read_csv(filepath_or_buffer='Employee_Profile_IT.csv')

In [3]:
# Date format string (month/day/year); not referenced by the statements in this cell.
DT_FMT = '%m/%d/%Y'

# Strip every character that is neither a word character nor whitespace from
# the headers (so a header such as 'Bonus %' becomes 'Bonus ').
IT_DATA.columns = IT_DATA.columns.str.replace(r'[^\w\s]', '', regex=True)

# Give the bonus and exit-date columns their working names in a single pass.
IT_DATA.rename(columns={'Bonus ': 'Cur_Bonus', 'Exit Date': 'Init_Exit'}, inplace=True)

# Remove '$' and ',' from the salary text before converting it to float.
IT_DATA['Annual Salary'] = (
    IT_DATA['Annual Salary']
    .replace({r'[$,]': ''}, regex=True)
    .astype(float)
)

# Scale the bonus down by 100.
# NOTE(review): this assumes the source column holds percentage points; the
# resulting values are far smaller than the model's predicted ratios, so
# confirm the source is not already a fraction.
IT_DATA['Cur_Bonus'] = IT_DATA['Cur_Bonus'].astype(float).div(100)

# The exit date is not used by the rest of the analysis.
IT_DATA.drop(columns='Init_Exit', inplace=True)

In [4]:
# Drop every employee whose EEID contains 'E100' (rows with a missing EEID are
# kept because na=False makes them non-matching), then renumber the index.
IT_DATA = IT_DATA[~IT_DATA['EEID'].str.contains('E100', na=False)].reset_index(drop=True)

# Numeric columns whose gaps are filled with the column median.
num_feats = ['Age', 'Annual Salary', 'Cur_Bonus', 'EmploymentRating', 'DaysOfAbsence', 'CertificationsEarned']
for feat in num_feats:
    if IT_DATA[feat].isnull().any():
        # Assign the filled Series back: `df[col].fillna(..., inplace=True)`
        # acts on a temporary object and is a no-op under pandas copy-on-write.
        IT_DATA[feat] = IT_DATA[feat].fillna(IT_DATA[feat].median())

# Text (object-dtype) columns are filled with their most frequent value.
for col in IT_DATA.columns:
    if IT_DATA[col].dtype == 'object' and IT_DATA[col].isnull().any():
        col_modes = IT_DATA[col].mode()
        # mode() is empty when the whole column is missing; nothing to impute with then.
        if not col_modes.empty:
            IT_DATA[col] = IT_DATA[col].fillna(col_modes[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  if IT_DATA[feat].isnull().any(): IT_DATA[feat].fillna(IT_DATA[feat].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  if IT_DATA[col].dtype == 'object' and IT_DATA[col].isnull().any(): IT_DATA[col].fillna(IT_DATA[col].mode()[0], inplace=True)


In [5]:
# Load the performance/bonus table used to train the bonus model; the relative
# path is resolved against the notebook's working directory.
EMP_BONUS_DATA = pd.read_csv(filepath_or_buffer='Performance_Bonus.csv')

In [6]:
# The source header is spelled 'Bouns'; rename it to the working name.
EMP_BONUS_DATA.rename(columns={'Bouns': 'Bonus_Amt'}, inplace=True)

# Remove the '%' sign from the text values and convert percent to a ratio.
EMP_BONUS_DATA['Bonus_Amt'] = (
    EMP_BONUS_DATA['Bonus_Amt']
    .str.replace('%', '')
    .astype(float)
    .div(100)
)

In [7]:
# Ordinal encoding for education: position in the tuple is the code
# (Bachelor's -> 0, Master's -> 1, Doctorate -> 2).
level_map = {
    level: code
    for code, level in enumerate(("Bachelor's", "Master's", "Doctorate"))
}

# Levels that are not keys of level_map become NaN in the encoded column.
EMP_BONUS_DATA['EL_EN'] = EMP_BONUS_DATA['EducationLevel'].map(level_map)

In [8]:
# Quartiles of the bonus ratio, used for the 1.5 * IQR outlier fence.
Q1, Q3 = np.percentile(EMP_BONUS_DATA['Bonus_Amt'], [25, 75])
IQR = Q3 - Q1
Upper_Bound = Q3 + 1.5 * IQR

# Only the upper fence is applied; unusually low bonuses are kept.
EMP_BONUS_DATA = EMP_BONUS_DATA.loc[EMP_BONUS_DATA['Bonus_Amt'].le(Upper_Bound)]

In [9]:
# Feature columns fed to the bonus model, in the order used for fit and predict.
X_COLS = [
    'EmploymentRating',
    'DaysOfAbsence',
    'CertificationsEarned',
    'EL_EN',
]

# Name of the regression target column.
Y_TARG = 'Bonus_Amt'

In [10]:
# Split the cleaned bonus table into the feature matrix and the target vector.
X_train_bonus = EMP_BONUS_DATA.loc[:, X_COLS]
Y_train_bonus = EMP_BONUS_DATA.loc[:, Y_TARG]

In [13]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_T, X_V, Y_T, Y_V = train_test_split(
    X_train_bonus,
    Y_train_bonus,
    random_state=35,
    test_size=0.25,
)

In [14]:
# Fit an ordinary least-squares model on the training split (fit() returns the
# estimator itself), then score the held-out rows.
model_bonus = LinearRegression().fit(X_T, Y_T)
Y_pred_V = model_bonus.predict(X_V)

In [15]:
# Validation error of the bonus model, in the same ratio units as Bonus_Amt.
mae_bonus = mean_absolute_error(Y_V, Y_pred_V)
mse_bonus = mean_squared_error(Y_V, Y_pred_V)
# RMSE is the square root of the MSE computed above.
rmse_bonus = np.sqrt(mse_bonus)

In [16]:
# Report the validation errors next to the mean bonus of the cleaned bonus
# table so the error size can be judged against the typical value.
print(
    f"MAE: {mae_bonus:.4f} (Ratio)",
    f"RMSE: {rmse_bonus:.4f} (Ratio)",
    f"Training Mean Bonus: {EMP_BONUS_DATA['Bonus_Amt'].mean():.4f} (Ratio)",
    sep="\n",
)

MAE: 0.1030 (Ratio)
RMSE: 0.1174 (Ratio)
Training Mean Bonus: 0.2019 (Ratio)


In [17]:
# Encode the IT employees' education with level_map; 'PhD' is first rewritten
# to 'Doctorate' so it matches one of the map's keys.
IT_DATA['EL_EN'] = (
    IT_DATA['EducationLevel']
    .replace(to_replace='PhD', value='Doctorate')
    .map(level_map)
)

In [18]:
# Impute missing education codes with the most common encoded value.
if IT_DATA['EL_EN'].isnull().any():
    # Fill NaN with the mode of the encoded values
    impute_val = IT_DATA['EL_EN'].mode()[0]
    # Assign the filled Series back: `df[col].fillna(..., inplace=True)` acts
    # on a temporary object and is a no-op under pandas copy-on-write.
    IT_DATA['EL_EN'] = IT_DATA['EL_EN'].fillna(impute_val)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  IT_DATA['EL_EN'].fillna(impute_val, inplace=True)


In [19]:
# Score every IT employee with the fitted bonus model, selecting the feature
# columns via X_COLS.
IT_DATA['Predicted Bonus'] = model_bonus.predict(IT_DATA.loc[:, X_COLS])

In [20]:
# Remove the encoded education helper column from the frame, in place.
IT_DATA.drop(columns=['EL_EN'], inplace=True)

In [21]:
# Show the first rows with the actual and predicted bonus side by side.
print("\n--- Predicted Bonus for IT Employees ---")
print(
    IT_DATA.head()[['Full Name', 'Cur_Bonus', 'Predicted Bonus']].to_string()
)


--- Predicted Bonus for IT Employees ---
          Full Name  Cur_Bonus  Predicted Bonus
0  Lillian Gonzales     0.0000         0.199100
1  Scarlett Jenkins     0.0032         0.204686
2  Brooklyn Salazar     0.0000         0.206017
3       Riley Rojas     0.0000         0.207836
4    Isabella Scott     0.0000         0.197077


In [22]:
# Start both tallies for the actual-vs-predicted comparison at zero.
count_under_rewarded = count_zero_bonus = 0

In [23]:
# Compare each employee's actual bonus ratio with the model's prediction.
# The tallies are assigned rather than incremented, so re-running this cell
# cannot double-count, and the comparisons are vectorised instead of looping
# row by row. Rows with a missing value on either side compare as False.
count_zero_bonus = int(IT_DATA['Cur_Bonus'].eq(0.0).sum())
count_under_rewarded = int(IT_DATA['Predicted Bonus'].gt(IT_DATA['Cur_Bonus']).sum())

# Guard the percentages so an empty frame reports 0.00% instead of raising
# ZeroDivisionError.
total_employees = len(IT_DATA)
pct_under_rewarded = count_under_rewarded / total_employees * 100 if total_employees else 0.0
pct_zero_bonus = count_zero_bonus / total_employees * 100 if total_employees else 0.0

print("\n--- Final Comparison: Actual vs. Predicted ---")
print(f"Employees who received less actual bonus than predicted: {count_under_rewarded} ({pct_under_rewarded:.2f}%)")
print(f"Employees with zero actual bonus: {count_zero_bonus} ({pct_zero_bonus:.2f}%)")


--- Final Comparison: Actual vs. Predicted ---
Employees who received less actual bonus than predicted: 215 (100.00%)
Employees with zero actual bonus: 147 (68.37%)


In [24]:
# Write the IT data, including the predicted bonus column, to CSV without the
# row index.
output_filename = 'DataPredictedBonus.csv'
IT_DATA.to_csv(path_or_buf=output_filename, index=False)