In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_absolute_error, mean_squared_error
from datetime import datetime
import re

In [3]:
IT_DATA = pd.read_csv('Employee_Profile_IT.csv')

In [4]:
DT_FMT = '%m/%d/%Y'
IT_DATA.columns = IT_DATA.columns.str.replace(r'[^\w\s]', '', regex=True)
IT_DATA.rename(columns={'Bonus ': 'Cur_Bonus'}, inplace=True) # Cur_Bonus = Current_Bonus_Ratio
IT_DATA.rename(columns={'Exit Date': 'Init_Exit'}, inplace=True) 
IT_DATA['Annual Salary'] = IT_DATA['Annual Salary'].replace({r'[$,]': ''}, regex=True).astype(float)
IT_DATA['Cur_Bonus'] = IT_DATA['Cur_Bonus'].astype(float)
IT_DATA.drop('Init_Exit', axis=1, inplace=True)
IT_DATA = IT_DATA[~IT_DATA['EEID'].str.contains('E100', na=False)].reset_index(drop=True)

In [5]:
num_feats = ['Age', 'Annual Salary', 'Cur_Bonus', 'EmploymentRating', 'DaysOfAbsence', 'CertificationsEarned']
for feat in num_feats:
    if IT_DATA[feat].isnull().any():
        med_val = IT_DATA[feat].median()
        IT_DATA[feat].fillna(med_val, inplace=True)
for col in IT_DATA.columns:
    if IT_DATA[col].dtype == 'object' and IT_DATA[col].isnull().any():
        mode_val = IT_DATA[col].mode()[0]
        IT_DATA[col].fillna(mode_val, inplace=True)

print("Data Cleanup complete. IT dataset ready.")
print("Total IT records:", len(IT_DATA))

Data Cleanup complete. IT dataset ready.
Total IT records: 215


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  IT_DATA[feat].fillna(med_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  IT_DATA[col].fillna(mode_val, inplace=True)


In [6]:
Exit_Data = pd.read_csv('Employee_Exit.csv') 

In [7]:
Exit_Data.rename(columns={'Bonus %': 'Bonus_Train'}, inplace=True) 

In [8]:
Exit_Data['Annual Salary'] = Exit_Data['Annual Salary'].replace({r'[$,]': ''}, regex=True).astype(float)
Exit_Data['Bonus_Train'] = Exit_Data['Bonus_Train'].replace({r'%': ''}, regex=True).astype(float) / 100

In [9]:
Exit_Data['Hire Date'] = pd.to_datetime(Exit_Data['Hire Date'], format=DT_FMT, errors='coerce')
Exit_Data['Exit Date'] = pd.to_datetime(Exit_Data['Exit Date'], format=DT_FMT, errors='coerce')
Exit_Data['Tenure_Days'] = (Exit_Data['Exit Date'] - Exit_Data['Hire Date']).dt.days

In [10]:
Exit_Data.dropna(subset=['Tenure_Days', 'Annual Salary', 'Bonus_Train'], inplace=True)

In [11]:
X_COLS = ['Age', 'Annual Salary', 'Bonus_Train', 'Country', 'Gender', 
          'Job Title', 'Department', 'Business Unit']
Y_TARG = 'Tenure_Days'
X_train = Exit_Data[X_COLS]
Y_train = Exit_Data[Y_TARG]

In [12]:
preproc = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), ['Age', 'Annual Salary', 'Bonus_Train']),
        ('encode', OneHotEncoder(handle_unknown='ignore'), ['Country', 'Gender', 'Job Title', 'Department', 'Business Unit'])
    ]
)

In [13]:
X_T, X_V, Y_T, Y_V = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

In [14]:
model_lin = Pipeline(steps=[
    ('preproc', preproc),
    ('reg', LinearRegression()) ])

In [15]:
model_lin.fit(X_T, Y_T)
print("Model trained: Linear Regression for Tenure.")

Model trained: Linear Regression for Tenure.


In [16]:
preds_V = model_lin.predict(X_V)
preds_V[preds_V < 0] = 0

In [17]:
err_mae = mean_absolute_error(Y_V, preds_V)
err_mse = mean_squared_error(Y_V, preds_V)
err_rmse = np.sqrt(err_mse)

In [18]:
print(f"\nModel Performance:")
print(f"MAE: {err_mae:.2f} days")
print(f"RMSE: {err_rmse:.2f} days")


Model Performance:
MAE: 866.06 days
RMSE: 997.81 days


In [19]:
IT_INPUT = IT_DATA.copy()

In [20]:
IT_INPUT.rename(columns={'Cur_Bonus': 'Bonus_Train'}, inplace=True) 

In [21]:
pred_tenure = model_lin.predict(IT_INPUT[X_COLS])
pred_tenure[pred_tenure < 0] = 0

In [22]:
IT_INPUT['Hire Date'] = pd.to_datetime(IT_INPUT['Hire Date'], format=DT_FMT, errors='coerce')
tenure_td = pd.to_timedelta(pred_tenure, unit='D')
pred_exit_dt = IT_INPUT['Hire Date'] + tenure_td

In [23]:
IT_DATA['Predicted_ExitDate'] = pred_exit_dt.dt.strftime(DT_FMT)

In [24]:
print(f"\nPrediction Complete. Predicted Exit Dates appended to IT_DATA.")
print("--- Sample Predictions ---")
print(IT_DATA[['Full Name', 'Hire Date', 'Predicted_ExitDate']].head().to_string())


Prediction Complete. Predicted Exit Dates appended to IT_DATA.
--- Sample Predictions ---
          Full Name  Hire Date Predicted_ExitDate
0  Lillian Gonzales  3/13/2009         05/10/2014
1  Scarlett Jenkins  11/9/2011         06/21/2016
2  Brooklyn Salazar   3/1/2011         03/10/2016
3       Riley Rojas  1/21/2021         01/16/2026
4    Isabella Scott  4/26/2016         06/01/2021


In [25]:
output_filename = 'DataPredictedExit.csv'
IT_DATA.to_csv(output_filename, index=False)