In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv("Extended_Employee_Performance_and_Productivity_Data.csv")

In [3]:
df["Hire_Date"] = pd.to_datetime(df["Hire_Date"])
df["Hire_Year"] = df["Hire_Date"].dt.year
df.drop(["Hire_Date", "Employee_ID", "Resigned"], axis=1, inplace=True)

In [4]:
# -------------------------------
# 3) Create Productivity Score
# -------------------------------
df["Productivity_Raw"] = (
      (0.45 * df["Projects_Handled"]) +
      (0.25 * df["Work_Hours_Per_Week"]) +
      (0.20 * df["Overtime_Hours"]) -
      (0.15 * df["Sick_Days"]) +
      (0.05 * df["Training_Hours"])
)

In [5]:
# Scale to 1–100 range
scaler = MinMaxScaler(feature_range=(1, 100))
df["Productivity_Score"] = scaler.fit_transform(df[["Productivity_Raw"]])
df["Productivity_Score"] = df["Productivity_Score"].round(2)

In [6]:
print("\nSample Productivity Scores:")
print(df[["Productivity_Raw", "Productivity_Score"]].head())


Sample Productivity Scores:
   Productivity_Raw  Productivity_Score
0             30.05               58.50
1             27.35               51.85
2             22.20               39.17
3             21.30               36.96
4             18.75               30.68


In [7]:
# -------------------------------
# 4) Define target and features
# -------------------------------
target = "Productivity_Score"
X = df.drop(columns=["Productivity_Score", "Productivity_Raw"])
y = df[target]

In [8]:
# -------------------------------
# 5) Feature types
# -------------------------------
categorical_features = ["Department", "Gender", "Job_Title", "Education_Level"]

numerical_features = [
    "Age", "Years_At_Company", "Monthly_Salary", "Work_Hours_Per_Week",
    "Projects_Handled", "Overtime_Hours", "Sick_Days",
    "Remote_Work_Frequency", "Team_Size", "Training_Hours",
    "Promotions", "Employee_Satisfaction_Score", "Hire_Year"
]

In [9]:
# -------------------------------
# 6) Preprocessing Pipeline
# -------------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

In [10]:
# -------------------------------
# 7) Train-Test Split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [11]:
# -------------------------------
# 8) Train Random Forest
# -------------------------------
rf = RandomForestRegressor(n_estimators=200, random_state=42)

rf_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", rf)
])

rf_pipe.fit(X_train, y_train)
rf_pred = rf_pipe.predict(X_test)

rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)

print("\nRandomForest → RMSE:", rf_rmse, "R2:", rf_r2)



RandomForest → RMSE: 1.0494336421204769 R2: 0.9965295542816264


In [12]:
# -------------------------------
# 9) Train Gradient Boosting
# -------------------------------
gb = GradientBoostingRegressor(random_state=42)

gb_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", gb)
])

gb_pipe.fit(X_train, y_train)
gb_pred = gb_pipe.predict(X_test)

gb_rmse = np.sqrt(mean_squared_error(y_test, gb_pred))
gb_r2 = r2_score(y_test, gb_pred)

print("\nGradientBoosting → RMSE:", gb_rmse, "R2:", gb_r2)


GradientBoosting → RMSE: 0.687056139389835 R2: 0.9985124905257211


In [13]:
# -------------------------------
# 10) Final Comparison
# -------------------------------
print("\n================== FINAL COMPARISON ==================")
print(f"RandomForest     → RMSE: {rf_rmse:.4f}, R2: {rf_r2:.4f}")
print(f"GradientBoosting → RMSE: {gb_rmse:.4f}, R2: {gb_r2:.4f}")

best_model = "GradientBoosting" if gb_r2 > rf_r2 else "RandomForest"
print("\nBest Model for Productivity:", best_model)


RandomForest     → RMSE: 1.0494, R2: 0.9965
GradientBoosting → RMSE: 0.6871, R2: 0.9985

Best Model for Productivity: GradientBoosting
